Files
Tokara-0.5B-v0.1/trainer_state.json
ModelHub XC 9652444fa6 初始化项目,由ModelHub XC社区提供模型
Model: QwenCollection/Tokara-0.5B-v0.1
Source: Original Platform
2026-05-29 16:20:14 +08:00

17602 lines
405 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7135896458142392,
"eval_steps": 250,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002854358583256957,
"grad_norm": 2.75,
"learning_rate": 8.571428571428571e-06,
"loss": 3.7737,
"step": 1
},
{
"epoch": 0.0005708717166513914,
"grad_norm": 3.03125,
"learning_rate": 1.7142857142857142e-05,
"loss": 3.8253,
"step": 2
},
{
"epoch": 0.000856307574977087,
"grad_norm": 2.078125,
"learning_rate": 2.571428571428571e-05,
"loss": 3.8136,
"step": 3
},
{
"epoch": 0.0011417434333027827,
"grad_norm": 2.53125,
"learning_rate": 3.4285714285714284e-05,
"loss": 3.7592,
"step": 4
},
{
"epoch": 0.0014271792916284785,
"grad_norm": 2.515625,
"learning_rate": 4.285714285714285e-05,
"loss": 3.7806,
"step": 5
},
{
"epoch": 0.001712615149954174,
"grad_norm": 2.375,
"learning_rate": 5.142857142857142e-05,
"loss": 3.7962,
"step": 6
},
{
"epoch": 0.0019980510082798697,
"grad_norm": 2.53125,
"learning_rate": 5.9999999999999995e-05,
"loss": 3.7494,
"step": 7
},
{
"epoch": 0.0022834868666055655,
"grad_norm": 2.859375,
"learning_rate": 6.857142857142857e-05,
"loss": 3.7721,
"step": 8
},
{
"epoch": 0.0025689227249312612,
"grad_norm": 2.0625,
"learning_rate": 7.714285714285713e-05,
"loss": 3.744,
"step": 9
},
{
"epoch": 0.002854358583256957,
"grad_norm": 1.828125,
"learning_rate": 8.57142857142857e-05,
"loss": 3.7373,
"step": 10
},
{
"epoch": 0.003139794441582653,
"grad_norm": 2.65625,
"learning_rate": 9.428571428571427e-05,
"loss": 3.7021,
"step": 11
},
{
"epoch": 0.003425230299908348,
"grad_norm": 4.375,
"learning_rate": 0.00010285714285714284,
"loss": 3.6838,
"step": 12
},
{
"epoch": 0.003710666158234044,
"grad_norm": 2.5,
"learning_rate": 0.00011142857142857142,
"loss": 3.7084,
"step": 13
},
{
"epoch": 0.003996102016559739,
"grad_norm": 4.25,
"learning_rate": 0.00011999999999999999,
"loss": 3.6489,
"step": 14
},
{
"epoch": 0.004281537874885435,
"grad_norm": 2.84375,
"learning_rate": 0.00012857142857142855,
"loss": 3.6588,
"step": 15
},
{
"epoch": 0.004566973733211131,
"grad_norm": 4.4375,
"learning_rate": 0.00013714285714285713,
"loss": 3.6394,
"step": 16
},
{
"epoch": 0.004852409591536827,
"grad_norm": 3.203125,
"learning_rate": 0.0001457142857142857,
"loss": 3.5906,
"step": 17
},
{
"epoch": 0.0051378454498625225,
"grad_norm": 2.640625,
"learning_rate": 0.00015428571428571425,
"loss": 3.5944,
"step": 18
},
{
"epoch": 0.005423281308188218,
"grad_norm": 4.21875,
"learning_rate": 0.00016285714285714284,
"loss": 3.5843,
"step": 19
},
{
"epoch": 0.005708717166513914,
"grad_norm": 2.875,
"learning_rate": 0.0001714285714285714,
"loss": 3.5661,
"step": 20
},
{
"epoch": 0.00599415302483961,
"grad_norm": 4.375,
"learning_rate": 0.00017999999999999998,
"loss": 3.5976,
"step": 21
},
{
"epoch": 0.006279588883165306,
"grad_norm": 3.0,
"learning_rate": 0.00018857142857142854,
"loss": 3.5226,
"step": 22
},
{
"epoch": 0.006565024741491001,
"grad_norm": 2.828125,
"learning_rate": 0.00019714285714285713,
"loss": 3.5581,
"step": 23
},
{
"epoch": 0.006850460599816696,
"grad_norm": 4.1875,
"learning_rate": 0.0002057142857142857,
"loss": 3.5337,
"step": 24
},
{
"epoch": 0.007135896458142392,
"grad_norm": 5.375,
"learning_rate": 0.00021428571428571427,
"loss": 3.502,
"step": 25
},
{
"epoch": 0.007421332316468088,
"grad_norm": 2.359375,
"learning_rate": 0.00022285714285714283,
"loss": 3.4848,
"step": 26
},
{
"epoch": 0.007706768174793784,
"grad_norm": 7.65625,
"learning_rate": 0.00023142857142857142,
"loss": 3.5451,
"step": 27
},
{
"epoch": 0.007992204033119479,
"grad_norm": 4.96875,
"learning_rate": 0.00023999999999999998,
"loss": 3.5235,
"step": 28
},
{
"epoch": 0.008277639891445174,
"grad_norm": 6.65625,
"learning_rate": 0.00024857142857142857,
"loss": 3.5061,
"step": 29
},
{
"epoch": 0.00856307574977087,
"grad_norm": 4.9375,
"learning_rate": 0.0002571428571428571,
"loss": 3.5228,
"step": 30
},
{
"epoch": 0.008848511608096566,
"grad_norm": 7.75,
"learning_rate": 0.0002657142857142857,
"loss": 3.4963,
"step": 31
},
{
"epoch": 0.009133947466422262,
"grad_norm": 4.75,
"learning_rate": 0.00027428571428571427,
"loss": 3.5074,
"step": 32
},
{
"epoch": 0.009419383324747958,
"grad_norm": 5.3125,
"learning_rate": 0.0002828571428571428,
"loss": 3.4555,
"step": 33
},
{
"epoch": 0.009704819183073653,
"grad_norm": 4.40625,
"learning_rate": 0.0002914285714285714,
"loss": 3.4634,
"step": 34
},
{
"epoch": 0.00999025504139935,
"grad_norm": 4.8125,
"learning_rate": 0.0003,
"loss": 3.4516,
"step": 35
},
{
"epoch": 0.010275690899725045,
"grad_norm": 3.921875,
"learning_rate": 0.00029999993845357924,
"loss": 3.4341,
"step": 36
},
{
"epoch": 0.01056112675805074,
"grad_norm": 5.40625,
"learning_rate": 0.0002999997538143675,
"loss": 3.4625,
"step": 37
},
{
"epoch": 0.010846562616376437,
"grad_norm": 4.59375,
"learning_rate": 0.0002999994460825163,
"loss": 3.4492,
"step": 38
},
{
"epoch": 0.011131998474702132,
"grad_norm": 3.5625,
"learning_rate": 0.0002999990152582781,
"loss": 3.4078,
"step": 39
},
{
"epoch": 0.011417434333027828,
"grad_norm": 4.75,
"learning_rate": 0.00029999846134200653,
"loss": 3.4077,
"step": 40
},
{
"epoch": 0.011702870191353524,
"grad_norm": 4.03125,
"learning_rate": 0.0002999977843341562,
"loss": 3.4062,
"step": 41
},
{
"epoch": 0.01198830604967922,
"grad_norm": 3.859375,
"learning_rate": 0.0002999969842352825,
"loss": 3.3895,
"step": 42
},
{
"epoch": 0.012273741908004916,
"grad_norm": 3.25,
"learning_rate": 0.0002999960610460421,
"loss": 3.3762,
"step": 43
},
{
"epoch": 0.012559177766330611,
"grad_norm": 4.0625,
"learning_rate": 0.00029999501476719257,
"loss": 3.3807,
"step": 44
},
{
"epoch": 0.012844613624656307,
"grad_norm": 3.71875,
"learning_rate": 0.00029999384539959253,
"loss": 3.3432,
"step": 45
},
{
"epoch": 0.013130049482982001,
"grad_norm": 3.328125,
"learning_rate": 0.0002999925529442016,
"loss": 3.3543,
"step": 46
},
{
"epoch": 0.013415485341307697,
"grad_norm": 5.5625,
"learning_rate": 0.0002999911374020804,
"loss": 3.3339,
"step": 47
},
{
"epoch": 0.013700921199633393,
"grad_norm": 2.25,
"learning_rate": 0.00029998959877439044,
"loss": 3.3377,
"step": 48
},
{
"epoch": 0.013986357057959089,
"grad_norm": 4.84375,
"learning_rate": 0.0002999879370623944,
"loss": 3.4033,
"step": 49
},
{
"epoch": 0.014271792916284784,
"grad_norm": 4.375,
"learning_rate": 0.00029998615226745605,
"loss": 3.3567,
"step": 50
},
{
"epoch": 0.01455722877461048,
"grad_norm": 3.15625,
"learning_rate": 0.0002999842443910399,
"loss": 3.3819,
"step": 51
},
{
"epoch": 0.014842664632936176,
"grad_norm": 4.5,
"learning_rate": 0.0002999822134347115,
"loss": 3.3586,
"step": 52
},
{
"epoch": 0.015128100491261872,
"grad_norm": 3.671875,
"learning_rate": 0.0002999800594001376,
"loss": 3.3414,
"step": 53
},
{
"epoch": 0.015413536349587567,
"grad_norm": 2.765625,
"learning_rate": 0.000299977782289086,
"loss": 3.3165,
"step": 54
},
{
"epoch": 0.01569897220791326,
"grad_norm": 4.6875,
"learning_rate": 0.00029997538210342503,
"loss": 3.3446,
"step": 55
},
{
"epoch": 0.015984408066238957,
"grad_norm": 4.0625,
"learning_rate": 0.0002999728588451245,
"loss": 3.3649,
"step": 56
},
{
"epoch": 0.016269843924564653,
"grad_norm": 2.828125,
"learning_rate": 0.000299970212516255,
"loss": 3.3258,
"step": 57
},
{
"epoch": 0.01655527978289035,
"grad_norm": 3.84375,
"learning_rate": 0.0002999674431189883,
"loss": 3.3137,
"step": 58
},
{
"epoch": 0.016840715641216045,
"grad_norm": 2.53125,
"learning_rate": 0.0002999645506555967,
"loss": 3.31,
"step": 59
},
{
"epoch": 0.01712615149954174,
"grad_norm": 3.796875,
"learning_rate": 0.00029996153512845415,
"loss": 3.3022,
"step": 60
},
{
"epoch": 0.017411587357867436,
"grad_norm": 3.671875,
"learning_rate": 0.00029995839654003504,
"loss": 3.3119,
"step": 61
},
{
"epoch": 0.017697023216193132,
"grad_norm": 2.828125,
"learning_rate": 0.00029995513489291506,
"loss": 3.306,
"step": 62
},
{
"epoch": 0.017982459074518828,
"grad_norm": 2.96875,
"learning_rate": 0.0002999517501897707,
"loss": 3.2965,
"step": 63
},
{
"epoch": 0.018267894932844524,
"grad_norm": 3.765625,
"learning_rate": 0.0002999482424333796,
"loss": 3.3035,
"step": 64
},
{
"epoch": 0.01855333079117022,
"grad_norm": 2.96875,
"learning_rate": 0.00029994461162662024,
"loss": 3.2734,
"step": 65
},
{
"epoch": 0.018838766649495915,
"grad_norm": 2.28125,
"learning_rate": 0.0002999408577724721,
"loss": 3.2772,
"step": 66
},
{
"epoch": 0.01912420250782161,
"grad_norm": 3.46875,
"learning_rate": 0.0002999369808740157,
"loss": 3.2491,
"step": 67
},
{
"epoch": 0.019409638366147307,
"grad_norm": 3.78125,
"learning_rate": 0.00029993298093443246,
"loss": 3.2943,
"step": 68
},
{
"epoch": 0.019695074224473003,
"grad_norm": 1.9296875,
"learning_rate": 0.0002999288579570049,
"loss": 3.2525,
"step": 69
},
{
"epoch": 0.0199805100827987,
"grad_norm": 4.0,
"learning_rate": 0.00029992461194511624,
"loss": 3.2765,
"step": 70
},
{
"epoch": 0.020265945941124394,
"grad_norm": 2.578125,
"learning_rate": 0.000299920242902251,
"loss": 3.2538,
"step": 71
},
{
"epoch": 0.02055138179945009,
"grad_norm": 2.84375,
"learning_rate": 0.00029991575083199455,
"loss": 3.2407,
"step": 72
},
{
"epoch": 0.020836817657775786,
"grad_norm": 3.203125,
"learning_rate": 0.00029991113573803294,
"loss": 3.2537,
"step": 73
},
{
"epoch": 0.02112225351610148,
"grad_norm": 4.34375,
"learning_rate": 0.0002999063976241536,
"loss": 3.2618,
"step": 74
},
{
"epoch": 0.021407689374427177,
"grad_norm": 1.5390625,
"learning_rate": 0.00029990153649424463,
"loss": 3.2486,
"step": 75
},
{
"epoch": 0.021693125232752873,
"grad_norm": 6.15625,
"learning_rate": 0.0002998965523522951,
"loss": 3.2839,
"step": 76
},
{
"epoch": 0.02197856109107857,
"grad_norm": 3.953125,
"learning_rate": 0.0002998914452023953,
"loss": 3.2866,
"step": 77
},
{
"epoch": 0.022263996949404265,
"grad_norm": 4.9375,
"learning_rate": 0.00029988621504873606,
"loss": 3.3082,
"step": 78
},
{
"epoch": 0.02254943280772996,
"grad_norm": 3.578125,
"learning_rate": 0.0002998808618956094,
"loss": 3.2833,
"step": 79
},
{
"epoch": 0.022834868666055656,
"grad_norm": 4.375,
"learning_rate": 0.00029987538574740826,
"loss": 3.2748,
"step": 80
},
{
"epoch": 0.023120304524381352,
"grad_norm": 2.921875,
"learning_rate": 0.0002998697866086264,
"loss": 3.2491,
"step": 81
},
{
"epoch": 0.023405740382707048,
"grad_norm": 3.5,
"learning_rate": 0.0002998640644838587,
"loss": 3.2526,
"step": 82
},
{
"epoch": 0.023691176241032744,
"grad_norm": 3.09375,
"learning_rate": 0.0002998582193778006,
"loss": 3.2262,
"step": 83
},
{
"epoch": 0.02397661209935844,
"grad_norm": 2.96875,
"learning_rate": 0.000299852251295249,
"loss": 3.2321,
"step": 84
},
{
"epoch": 0.024262047957684135,
"grad_norm": 2.796875,
"learning_rate": 0.0002998461602411013,
"loss": 3.2485,
"step": 85
},
{
"epoch": 0.02454748381600983,
"grad_norm": 2.46875,
"learning_rate": 0.00029983994622035585,
"loss": 3.2223,
"step": 86
},
{
"epoch": 0.024832919674335527,
"grad_norm": 3.484375,
"learning_rate": 0.0002998336092381121,
"loss": 3.2184,
"step": 87
},
{
"epoch": 0.025118355532661223,
"grad_norm": 2.734375,
"learning_rate": 0.0002998271492995702,
"loss": 3.2204,
"step": 88
},
{
"epoch": 0.02540379139098692,
"grad_norm": 3.34375,
"learning_rate": 0.00029982056641003147,
"loss": 3.2185,
"step": 89
},
{
"epoch": 0.025689227249312614,
"grad_norm": 2.03125,
"learning_rate": 0.00029981386057489776,
"loss": 3.1942,
"step": 90
},
{
"epoch": 0.025974663107638307,
"grad_norm": 2.953125,
"learning_rate": 0.00029980703179967213,
"loss": 3.1724,
"step": 91
},
{
"epoch": 0.026260098965964002,
"grad_norm": 3.015625,
"learning_rate": 0.00029980008008995834,
"loss": 3.2225,
"step": 92
},
{
"epoch": 0.026545534824289698,
"grad_norm": 3.125,
"learning_rate": 0.0002997930054514612,
"loss": 3.2103,
"step": 93
},
{
"epoch": 0.026830970682615394,
"grad_norm": 2.3125,
"learning_rate": 0.0002997858078899861,
"loss": 3.1942,
"step": 94
},
{
"epoch": 0.02711640654094109,
"grad_norm": 2.234375,
"learning_rate": 0.00029977848741143966,
"loss": 3.1652,
"step": 95
},
{
"epoch": 0.027401842399266785,
"grad_norm": 3.234375,
"learning_rate": 0.0002997710440218291,
"loss": 3.186,
"step": 96
},
{
"epoch": 0.02768727825759248,
"grad_norm": 2.40625,
"learning_rate": 0.0002997634777272627,
"loss": 3.1928,
"step": 97
},
{
"epoch": 0.027972714115918177,
"grad_norm": 2.625,
"learning_rate": 0.0002997557885339494,
"loss": 3.169,
"step": 98
},
{
"epoch": 0.028258149974243873,
"grad_norm": 2.015625,
"learning_rate": 0.00029974797644819926,
"loss": 3.174,
"step": 99
},
{
"epoch": 0.02854358583256957,
"grad_norm": 3.984375,
"learning_rate": 0.0002997400414764229,
"loss": 3.1859,
"step": 100
},
{
"epoch": 0.028829021690895264,
"grad_norm": 2.234375,
"learning_rate": 0.0002997319836251319,
"loss": 3.1975,
"step": 101
},
{
"epoch": 0.02911445754922096,
"grad_norm": 2.65625,
"learning_rate": 0.0002997238029009387,
"loss": 3.163,
"step": 102
},
{
"epoch": 0.029399893407546656,
"grad_norm": 3.359375,
"learning_rate": 0.0002997154993105566,
"loss": 3.1766,
"step": 103
},
{
"epoch": 0.029685329265872352,
"grad_norm": 3.078125,
"learning_rate": 0.00029970707286079966,
"loss": 3.1692,
"step": 104
},
{
"epoch": 0.029970765124198048,
"grad_norm": 3.171875,
"learning_rate": 0.00029969852355858276,
"loss": 3.1785,
"step": 105
},
{
"epoch": 0.030256200982523743,
"grad_norm": 2.09375,
"learning_rate": 0.00029968985141092165,
"loss": 3.1622,
"step": 106
},
{
"epoch": 0.03054163684084944,
"grad_norm": 2.625,
"learning_rate": 0.00029968105642493286,
"loss": 3.1934,
"step": 107
},
{
"epoch": 0.030827072699175135,
"grad_norm": 3.25,
"learning_rate": 0.0002996721386078337,
"loss": 3.1503,
"step": 108
},
{
"epoch": 0.03111250855750083,
"grad_norm": 2.34375,
"learning_rate": 0.00029966309796694226,
"loss": 3.1415,
"step": 109
},
{
"epoch": 0.03139794441582652,
"grad_norm": 2.6875,
"learning_rate": 0.0002996539345096776,
"loss": 3.169,
"step": 110
},
{
"epoch": 0.03168338027415222,
"grad_norm": 1.828125,
"learning_rate": 0.0002996446482435593,
"loss": 3.1381,
"step": 111
},
{
"epoch": 0.031968816132477915,
"grad_norm": 2.8125,
"learning_rate": 0.0002996352391762079,
"loss": 3.1506,
"step": 112
},
{
"epoch": 0.03225425199080361,
"grad_norm": 2.796875,
"learning_rate": 0.0002996257073153446,
"loss": 3.1666,
"step": 113
},
{
"epoch": 0.032539687849129306,
"grad_norm": 2.546875,
"learning_rate": 0.00029961605266879153,
"loss": 3.1883,
"step": 114
},
{
"epoch": 0.032825123707455,
"grad_norm": 2.703125,
"learning_rate": 0.0002996062752444714,
"loss": 3.1594,
"step": 115
},
{
"epoch": 0.0331105595657807,
"grad_norm": 2.15625,
"learning_rate": 0.00029959637505040773,
"loss": 3.1553,
"step": 116
},
{
"epoch": 0.033395995424106394,
"grad_norm": 2.8125,
"learning_rate": 0.00029958635209472486,
"loss": 3.125,
"step": 117
},
{
"epoch": 0.03368143128243209,
"grad_norm": 2.4375,
"learning_rate": 0.00029957620638564785,
"loss": 3.1074,
"step": 118
},
{
"epoch": 0.033966867140757785,
"grad_norm": 2.03125,
"learning_rate": 0.00029956593793150233,
"loss": 3.1193,
"step": 119
},
{
"epoch": 0.03425230299908348,
"grad_norm": 2.484375,
"learning_rate": 0.0002995555467407149,
"loss": 3.107,
"step": 120
},
{
"epoch": 0.03453773885740918,
"grad_norm": 2.84375,
"learning_rate": 0.0002995450328218127,
"loss": 3.1292,
"step": 121
},
{
"epoch": 0.03482317471573487,
"grad_norm": 2.0,
"learning_rate": 0.0002995343961834238,
"loss": 3.1159,
"step": 122
},
{
"epoch": 0.03510861057406057,
"grad_norm": 2.390625,
"learning_rate": 0.0002995236368342766,
"loss": 3.1207,
"step": 123
},
{
"epoch": 0.035394046432386264,
"grad_norm": 2.109375,
"learning_rate": 0.00029951275478320056,
"loss": 3.1056,
"step": 124
},
{
"epoch": 0.03567948229071196,
"grad_norm": 2.984375,
"learning_rate": 0.00029950175003912573,
"loss": 3.1206,
"step": 125
},
{
"epoch": 0.035964918149037656,
"grad_norm": 1.484375,
"learning_rate": 0.0002994906226110827,
"loss": 3.1213,
"step": 126
},
{
"epoch": 0.03625035400736335,
"grad_norm": 2.96875,
"learning_rate": 0.00029947937250820295,
"loss": 3.1091,
"step": 127
},
{
"epoch": 0.03653578986568905,
"grad_norm": 1.8125,
"learning_rate": 0.0002994679997397185,
"loss": 3.1071,
"step": 128
},
{
"epoch": 0.03682122572401474,
"grad_norm": 3.15625,
"learning_rate": 0.000299456504314962,
"loss": 3.143,
"step": 129
},
{
"epoch": 0.03710666158234044,
"grad_norm": 1.9765625,
"learning_rate": 0.00029944488624336683,
"loss": 3.1106,
"step": 130
},
{
"epoch": 0.037392097440666135,
"grad_norm": 3.3125,
"learning_rate": 0.00029943314553446706,
"loss": 3.1163,
"step": 131
},
{
"epoch": 0.03767753329899183,
"grad_norm": 2.578125,
"learning_rate": 0.00029942128219789734,
"loss": 3.1173,
"step": 132
},
{
"epoch": 0.037962969157317526,
"grad_norm": 2.734375,
"learning_rate": 0.0002994092962433929,
"loss": 3.1289,
"step": 133
},
{
"epoch": 0.03824840501564322,
"grad_norm": 2.484375,
"learning_rate": 0.0002993971876807896,
"loss": 3.1056,
"step": 134
},
{
"epoch": 0.03853384087396892,
"grad_norm": 2.40625,
"learning_rate": 0.0002993849565200241,
"loss": 3.0896,
"step": 135
},
{
"epoch": 0.038819276732294614,
"grad_norm": 2.359375,
"learning_rate": 0.0002993726027711333,
"loss": 3.1087,
"step": 136
},
{
"epoch": 0.03910471259062031,
"grad_norm": 2.328125,
"learning_rate": 0.00029936012644425517,
"loss": 3.1059,
"step": 137
},
{
"epoch": 0.039390148448946005,
"grad_norm": 2.984375,
"learning_rate": 0.00029934752754962783,
"loss": 3.1265,
"step": 138
},
{
"epoch": 0.0396755843072717,
"grad_norm": 2.15625,
"learning_rate": 0.00029933480609759027,
"loss": 3.0987,
"step": 139
},
{
"epoch": 0.0399610201655974,
"grad_norm": 2.59375,
"learning_rate": 0.00029932196209858197,
"loss": 3.1122,
"step": 140
},
{
"epoch": 0.04024645602392309,
"grad_norm": 2.375,
"learning_rate": 0.0002993089955631429,
"loss": 3.0887,
"step": 141
},
{
"epoch": 0.04053189188224879,
"grad_norm": 2.25,
"learning_rate": 0.0002992959065019136,
"loss": 3.0815,
"step": 142
},
{
"epoch": 0.040817327740574484,
"grad_norm": 3.0,
"learning_rate": 0.00029928269492563537,
"loss": 3.0889,
"step": 143
},
{
"epoch": 0.04110276359890018,
"grad_norm": 1.53125,
"learning_rate": 0.00029926936084514967,
"loss": 3.0793,
"step": 144
},
{
"epoch": 0.041388199457225876,
"grad_norm": 2.59375,
"learning_rate": 0.00029925590427139887,
"loss": 3.0804,
"step": 145
},
{
"epoch": 0.04167363531555157,
"grad_norm": 1.8984375,
"learning_rate": 0.00029924232521542557,
"loss": 3.0612,
"step": 146
},
{
"epoch": 0.04195907117387727,
"grad_norm": 2.71875,
"learning_rate": 0.00029922862368837315,
"loss": 3.0698,
"step": 147
},
{
"epoch": 0.04224450703220296,
"grad_norm": 2.859375,
"learning_rate": 0.00029921479970148517,
"loss": 3.088,
"step": 148
},
{
"epoch": 0.04252994289052866,
"grad_norm": 1.9609375,
"learning_rate": 0.00029920085326610595,
"loss": 3.0765,
"step": 149
},
{
"epoch": 0.042815378748854355,
"grad_norm": 3.515625,
"learning_rate": 0.00029918678439368017,
"loss": 3.0926,
"step": 150
},
{
"epoch": 0.04310081460718005,
"grad_norm": 2.453125,
"learning_rate": 0.000299172593095753,
"loss": 3.0821,
"step": 151
},
{
"epoch": 0.043386250465505746,
"grad_norm": 5.25,
"learning_rate": 0.00029915827938397017,
"loss": 3.0682,
"step": 152
},
{
"epoch": 0.04367168632383144,
"grad_norm": 3.078125,
"learning_rate": 0.0002991438432700777,
"loss": 3.0657,
"step": 153
},
{
"epoch": 0.04395712218215714,
"grad_norm": 4.03125,
"learning_rate": 0.0002991292847659222,
"loss": 3.0883,
"step": 154
},
{
"epoch": 0.044242558040482834,
"grad_norm": 3.828125,
"learning_rate": 0.0002991146038834505,
"loss": 3.0962,
"step": 155
},
{
"epoch": 0.04452799389880853,
"grad_norm": 2.578125,
"learning_rate": 0.0002990998006347102,
"loss": 3.0695,
"step": 156
},
{
"epoch": 0.044813429757134225,
"grad_norm": 4.0625,
"learning_rate": 0.0002990848750318491,
"loss": 3.1003,
"step": 157
},
{
"epoch": 0.04509886561545992,
"grad_norm": 2.90625,
"learning_rate": 0.00029906982708711533,
"loss": 3.0733,
"step": 158
},
{
"epoch": 0.04538430147378562,
"grad_norm": 5.53125,
"learning_rate": 0.0002990546568128576,
"loss": 3.1179,
"step": 159
},
{
"epoch": 0.04566973733211131,
"grad_norm": 4.625,
"learning_rate": 0.00029903936422152487,
"loss": 3.1125,
"step": 160
},
{
"epoch": 0.04595517319043701,
"grad_norm": 4.90625,
"learning_rate": 0.00029902394932566657,
"loss": 3.0922,
"step": 161
},
{
"epoch": 0.046240609048762704,
"grad_norm": 3.34375,
"learning_rate": 0.00029900841213793247,
"loss": 3.048,
"step": 162
},
{
"epoch": 0.0465260449070884,
"grad_norm": 9.5,
"learning_rate": 0.00029899275267107264,
"loss": 3.1456,
"step": 163
},
{
"epoch": 0.046811480765414096,
"grad_norm": 8.3125,
"learning_rate": 0.00029897697093793753,
"loss": 3.1066,
"step": 164
},
{
"epoch": 0.04709691662373979,
"grad_norm": 3.0,
"learning_rate": 0.000298961066951478,
"loss": 3.0876,
"step": 165
},
{
"epoch": 0.04738235248206549,
"grad_norm": 6.0625,
"learning_rate": 0.0002989450407247451,
"loss": 3.1259,
"step": 166
},
{
"epoch": 0.04766778834039118,
"grad_norm": 5.96875,
"learning_rate": 0.0002989288922708902,
"loss": 3.1248,
"step": 167
},
{
"epoch": 0.04795322419871688,
"grad_norm": 3.4375,
"learning_rate": 0.0002989126216031652,
"loss": 3.0802,
"step": 168
},
{
"epoch": 0.048238660057042575,
"grad_norm": 3.890625,
"learning_rate": 0.00029889622873492195,
"loss": 3.0777,
"step": 169
},
{
"epoch": 0.04852409591536827,
"grad_norm": 2.78125,
"learning_rate": 0.0002988797136796128,
"loss": 3.0904,
"step": 170
},
{
"epoch": 0.048809531773693966,
"grad_norm": 3.453125,
"learning_rate": 0.0002988630764507904,
"loss": 3.081,
"step": 171
},
{
"epoch": 0.04909496763201966,
"grad_norm": 2.859375,
"learning_rate": 0.0002988463170621074,
"loss": 3.0743,
"step": 172
},
{
"epoch": 0.04938040349034536,
"grad_norm": 2.515625,
"learning_rate": 0.00029882943552731703,
"loss": 3.0189,
"step": 173
},
{
"epoch": 0.049665839348671054,
"grad_norm": 2.6875,
"learning_rate": 0.0002988124318602725,
"loss": 3.0684,
"step": 174
},
{
"epoch": 0.04995127520699675,
"grad_norm": 2.21875,
"learning_rate": 0.0002987953060749274,
"loss": 3.0479,
"step": 175
},
{
"epoch": 0.050236711065322445,
"grad_norm": 2.90625,
"learning_rate": 0.0002987780581853355,
"loss": 3.0374,
"step": 176
},
{
"epoch": 0.05052214692364814,
"grad_norm": 2.078125,
"learning_rate": 0.0002987606882056507,
"loss": 3.0589,
"step": 177
},
{
"epoch": 0.05080758278197384,
"grad_norm": 3.59375,
"learning_rate": 0.00029874319615012714,
"loss": 3.0731,
"step": 178
},
{
"epoch": 0.05109301864029953,
"grad_norm": 3.109375,
"learning_rate": 0.00029872558203311914,
"loss": 3.0793,
"step": 179
},
{
"epoch": 0.05137845449862523,
"grad_norm": 2.546875,
"learning_rate": 0.0002987078458690811,
"loss": 3.0748,
"step": 180
},
{
"epoch": 0.05166389035695092,
"grad_norm": 3.109375,
"learning_rate": 0.0002986899876725678,
"loss": 3.0308,
"step": 181
},
{
"epoch": 0.05194932621527661,
"grad_norm": 2.0625,
"learning_rate": 0.00029867200745823384,
"loss": 3.0496,
"step": 182
},
{
"epoch": 0.05223476207360231,
"grad_norm": 2.40625,
"learning_rate": 0.0002986539052408343,
"loss": 3.0577,
"step": 183
},
{
"epoch": 0.052520197931928005,
"grad_norm": 2.75,
"learning_rate": 0.0002986356810352241,
"loss": 3.0357,
"step": 184
},
{
"epoch": 0.0528056337902537,
"grad_norm": 1.546875,
"learning_rate": 0.00029861733485635834,
"loss": 3.023,
"step": 185
},
{
"epoch": 0.053091069648579396,
"grad_norm": 2.6875,
"learning_rate": 0.00029859886671929233,
"loss": 3.0768,
"step": 186
},
{
"epoch": 0.05337650550690509,
"grad_norm": 1.90625,
"learning_rate": 0.00029858027663918135,
"loss": 3.0272,
"step": 187
},
{
"epoch": 0.05366194136523079,
"grad_norm": 2.328125,
"learning_rate": 0.0002985615646312807,
"loss": 3.0348,
"step": 188
},
{
"epoch": 0.053947377223556484,
"grad_norm": 2.140625,
"learning_rate": 0.00029854273071094596,
"loss": 3.0245,
"step": 189
},
{
"epoch": 0.05423281308188218,
"grad_norm": 1.9375,
"learning_rate": 0.00029852377489363247,
"loss": 3.0558,
"step": 190
},
{
"epoch": 0.054518248940207875,
"grad_norm": 2.578125,
"learning_rate": 0.00029850469719489573,
"loss": 3.0611,
"step": 191
},
{
"epoch": 0.05480368479853357,
"grad_norm": 1.9453125,
"learning_rate": 0.00029848549763039135,
"loss": 3.0442,
"step": 192
},
{
"epoch": 0.05508912065685927,
"grad_norm": 2.796875,
"learning_rate": 0.00029846617621587474,
"loss": 3.06,
"step": 193
},
{
"epoch": 0.05537455651518496,
"grad_norm": 1.765625,
"learning_rate": 0.00029844673296720154,
"loss": 3.0144,
"step": 194
},
{
"epoch": 0.05565999237351066,
"grad_norm": 2.46875,
"learning_rate": 0.0002984271679003272,
"loss": 3.0423,
"step": 195
},
{
"epoch": 0.055945428231836354,
"grad_norm": 1.8671875,
"learning_rate": 0.0002984074810313071,
"loss": 3.0504,
"step": 196
},
{
"epoch": 0.05623086409016205,
"grad_norm": 2.203125,
"learning_rate": 0.00029838767237629684,
"loss": 3.0031,
"step": 197
},
{
"epoch": 0.056516299948487746,
"grad_norm": 1.8046875,
"learning_rate": 0.0002983677419515516,
"loss": 3.0401,
"step": 198
},
{
"epoch": 0.05680173580681344,
"grad_norm": 2.015625,
"learning_rate": 0.00029834768977342677,
"loss": 3.0359,
"step": 199
},
{
"epoch": 0.05708717166513914,
"grad_norm": 2.5625,
"learning_rate": 0.0002983275158583775,
"loss": 3.028,
"step": 200
},
{
"epoch": 0.05737260752346483,
"grad_norm": 1.9140625,
"learning_rate": 0.0002983072202229589,
"loss": 3.0115,
"step": 201
},
{
"epoch": 0.05765804338179053,
"grad_norm": 1.953125,
"learning_rate": 0.000298286802883826,
"loss": 3.0221,
"step": 202
},
{
"epoch": 0.057943479240116225,
"grad_norm": 2.109375,
"learning_rate": 0.0002982662638577335,
"loss": 3.0104,
"step": 203
},
{
"epoch": 0.05822891509844192,
"grad_norm": 2.21875,
"learning_rate": 0.00029824560316153633,
"loss": 2.9983,
"step": 204
},
{
"epoch": 0.058514350956767616,
"grad_norm": 1.8984375,
"learning_rate": 0.00029822482081218887,
"loss": 3.0208,
"step": 205
},
{
"epoch": 0.05879978681509331,
"grad_norm": 2.5,
"learning_rate": 0.00029820391682674563,
"loss": 3.0206,
"step": 206
},
{
"epoch": 0.05908522267341901,
"grad_norm": 2.109375,
"learning_rate": 0.00029818289122236075,
"loss": 3.0552,
"step": 207
},
{
"epoch": 0.059370658531744704,
"grad_norm": 1.7421875,
"learning_rate": 0.00029816174401628827,
"loss": 3.0075,
"step": 208
},
{
"epoch": 0.0596560943900704,
"grad_norm": 2.03125,
"learning_rate": 0.00029814047522588194,
"loss": 3.0068,
"step": 209
},
{
"epoch": 0.059941530248396095,
"grad_norm": 1.5546875,
"learning_rate": 0.0002981190848685954,
"loss": 2.9909,
"step": 210
},
{
"epoch": 0.06022696610672179,
"grad_norm": 2.453125,
"learning_rate": 0.00029809757296198194,
"loss": 2.9962,
"step": 211
},
{
"epoch": 0.06051240196504749,
"grad_norm": 1.515625,
"learning_rate": 0.00029807593952369465,
"loss": 3.0294,
"step": 212
},
{
"epoch": 0.06079783782337318,
"grad_norm": 2.484375,
"learning_rate": 0.00029805418457148637,
"loss": 2.9857,
"step": 213
},
{
"epoch": 0.06108327368169888,
"grad_norm": 2.15625,
"learning_rate": 0.00029803230812320956,
"loss": 3.0202,
"step": 214
},
{
"epoch": 0.061368709540024574,
"grad_norm": 2.0625,
"learning_rate": 0.00029801031019681645,
"loss": 2.9734,
"step": 215
},
{
"epoch": 0.06165414539835027,
"grad_norm": 1.7109375,
"learning_rate": 0.000297988190810359,
"loss": 2.9859,
"step": 216
},
{
"epoch": 0.061939581256675966,
"grad_norm": 3.203125,
"learning_rate": 0.0002979659499819888,
"loss": 3.0128,
"step": 217
},
{
"epoch": 0.06222501711500166,
"grad_norm": 1.7734375,
"learning_rate": 0.0002979435877299571,
"loss": 3.0178,
"step": 218
},
{
"epoch": 0.06251045297332736,
"grad_norm": 2.75,
"learning_rate": 0.0002979211040726147,
"loss": 2.9779,
"step": 219
},
{
"epoch": 0.06279588883165305,
"grad_norm": 2.375,
"learning_rate": 0.00029789849902841223,
"loss": 2.9843,
"step": 220
},
{
"epoch": 0.06308132468997875,
"grad_norm": 2.3125,
"learning_rate": 0.0002978757726158998,
"loss": 2.9943,
"step": 221
},
{
"epoch": 0.06336676054830444,
"grad_norm": 2.421875,
"learning_rate": 0.0002978529248537271,
"loss": 3.0043,
"step": 222
},
{
"epoch": 0.06365219640663014,
"grad_norm": 2.109375,
"learning_rate": 0.00029782995576064337,
"loss": 2.9729,
"step": 223
},
{
"epoch": 0.06393763226495583,
"grad_norm": 1.484375,
"learning_rate": 0.00029780686535549756,
"loss": 2.9874,
"step": 224
},
{
"epoch": 0.06422306812328153,
"grad_norm": 2.5,
"learning_rate": 0.0002977836536572382,
"loss": 3.0055,
"step": 225
},
{
"epoch": 0.06450850398160722,
"grad_norm": 1.8046875,
"learning_rate": 0.00029776032068491303,
"loss": 3.0,
"step": 226
},
{
"epoch": 0.06479393983993292,
"grad_norm": 3.03125,
"learning_rate": 0.0002977368664576696,
"loss": 3.0042,
"step": 227
},
{
"epoch": 0.06507937569825861,
"grad_norm": 2.453125,
"learning_rate": 0.000297713290994755,
"loss": 2.9981,
"step": 228
},
{
"epoch": 0.06536481155658432,
"grad_norm": 2.65625,
"learning_rate": 0.0002976895943155156,
"loss": 2.9803,
"step": 229
},
{
"epoch": 0.06565024741491,
"grad_norm": 3.015625,
"learning_rate": 0.00029766577643939744,
"loss": 2.9994,
"step": 230
},
{
"epoch": 0.0659356832732357,
"grad_norm": 1.8203125,
"learning_rate": 0.0002976418373859458,
"loss": 2.9842,
"step": 231
},
{
"epoch": 0.0662211191315614,
"grad_norm": 5.125,
"learning_rate": 0.00029761777717480554,
"loss": 3.0053,
"step": 232
},
{
"epoch": 0.0665065549898871,
"grad_norm": 3.8125,
"learning_rate": 0.00029759359582572103,
"loss": 2.9906,
"step": 233
},
{
"epoch": 0.06679199084821279,
"grad_norm": 4.03125,
"learning_rate": 0.00029756929335853584,
"loss": 3.0234,
"step": 234
},
{
"epoch": 0.06707742670653849,
"grad_norm": 3.125,
"learning_rate": 0.0002975448697931931,
"loss": 2.9871,
"step": 235
},
{
"epoch": 0.06736286256486418,
"grad_norm": 4.03125,
"learning_rate": 0.00029752032514973516,
"loss": 3.0048,
"step": 236
},
{
"epoch": 0.06764829842318988,
"grad_norm": 3.453125,
"learning_rate": 0.0002974956594483039,
"loss": 3.0141,
"step": 237
},
{
"epoch": 0.06793373428151557,
"grad_norm": 2.90625,
"learning_rate": 0.0002974708727091404,
"loss": 2.9658,
"step": 238
},
{
"epoch": 0.06821917013984127,
"grad_norm": 2.6875,
"learning_rate": 0.00029744596495258525,
"loss": 3.002,
"step": 239
},
{
"epoch": 0.06850460599816696,
"grad_norm": 2.625,
"learning_rate": 0.0002974209361990781,
"loss": 2.9831,
"step": 240
},
{
"epoch": 0.06879004185649266,
"grad_norm": 2.28125,
"learning_rate": 0.0002973957864691581,
"loss": 2.9823,
"step": 241
},
{
"epoch": 0.06907547771481835,
"grad_norm": 2.609375,
"learning_rate": 0.00029737051578346345,
"loss": 2.9626,
"step": 242
},
{
"epoch": 0.06936091357314406,
"grad_norm": 1.8203125,
"learning_rate": 0.000297345124162732,
"loss": 2.9729,
"step": 243
},
{
"epoch": 0.06964634943146975,
"grad_norm": 3.3125,
"learning_rate": 0.00029731961162780037,
"loss": 3.0036,
"step": 244
},
{
"epoch": 0.06993178528979545,
"grad_norm": 2.59375,
"learning_rate": 0.0002972939781996047,
"loss": 2.9818,
"step": 245
},
{
"epoch": 0.07021722114812114,
"grad_norm": 4.53125,
"learning_rate": 0.00029726822389918034,
"loss": 2.9709,
"step": 246
},
{
"epoch": 0.07050265700644684,
"grad_norm": 4.09375,
"learning_rate": 0.0002972423487476617,
"loss": 2.9748,
"step": 247
},
{
"epoch": 0.07078809286477253,
"grad_norm": 2.8125,
"learning_rate": 0.0002972163527662824,
"loss": 2.96,
"step": 248
},
{
"epoch": 0.07107352872309823,
"grad_norm": 3.75,
"learning_rate": 0.00029719023597637523,
"loss": 2.9929,
"step": 249
},
{
"epoch": 0.07135896458142392,
"grad_norm": 2.640625,
"learning_rate": 0.00029716399839937216,
"loss": 2.9467,
"step": 250
},
{
"epoch": 0.07135896458142392,
"eval_loss": 2.805173873901367,
"eval_runtime": 5998.7495,
"eval_samples_per_second": 10.717,
"eval_steps_per_second": 10.717,
"step": 250
},
{
"epoch": 0.07164440043974962,
"grad_norm": 3.8125,
"learning_rate": 0.00029713764005680427,
"loss": 2.9764,
"step": 251
},
{
"epoch": 0.07192983629807531,
"grad_norm": 3.625,
"learning_rate": 0.00029711116097030167,
"loss": 2.9982,
"step": 252
},
{
"epoch": 0.07221527215640101,
"grad_norm": 2.359375,
"learning_rate": 0.0002970845611615935,
"loss": 2.9649,
"step": 253
},
{
"epoch": 0.0725007080147267,
"grad_norm": 3.265625,
"learning_rate": 0.00029705784065250826,
"loss": 2.9516,
"step": 254
},
{
"epoch": 0.0727861438730524,
"grad_norm": 2.875,
"learning_rate": 0.00029703099946497323,
"loss": 2.9788,
"step": 255
},
{
"epoch": 0.0730715797313781,
"grad_norm": 2.734375,
"learning_rate": 0.0002970040376210148,
"loss": 2.9737,
"step": 256
},
{
"epoch": 0.0733570155897038,
"grad_norm": 2.5625,
"learning_rate": 0.00029697695514275824,
"loss": 2.9806,
"step": 257
},
{
"epoch": 0.07364245144802949,
"grad_norm": 2.375,
"learning_rate": 0.00029694975205242816,
"loss": 2.9629,
"step": 258
},
{
"epoch": 0.07392788730635519,
"grad_norm": 2.578125,
"learning_rate": 0.00029692242837234777,
"loss": 2.9698,
"step": 259
},
{
"epoch": 0.07421332316468088,
"grad_norm": 1.7890625,
"learning_rate": 0.0002968949841249395,
"loss": 2.9449,
"step": 260
},
{
"epoch": 0.07449875902300658,
"grad_norm": 3.234375,
"learning_rate": 0.00029686741933272455,
"loss": 2.9724,
"step": 261
},
{
"epoch": 0.07478419488133227,
"grad_norm": 2.421875,
"learning_rate": 0.0002968397340183232,
"loss": 2.9606,
"step": 262
},
{
"epoch": 0.07506963073965797,
"grad_norm": 4.5,
"learning_rate": 0.00029681192820445445,
"loss": 3.0101,
"step": 263
},
{
"epoch": 0.07535506659798366,
"grad_norm": 3.3125,
"learning_rate": 0.00029678400191393626,
"loss": 2.9797,
"step": 264
},
{
"epoch": 0.07564050245630936,
"grad_norm": 4.4375,
"learning_rate": 0.0002967559551696856,
"loss": 2.9859,
"step": 265
},
{
"epoch": 0.07592593831463505,
"grad_norm": 3.953125,
"learning_rate": 0.00029672778799471797,
"loss": 2.9839,
"step": 266
},
{
"epoch": 0.07621137417296076,
"grad_norm": 3.640625,
"learning_rate": 0.0002966995004121481,
"loss": 2.9812,
"step": 267
},
{
"epoch": 0.07649681003128644,
"grad_norm": 3.546875,
"learning_rate": 0.00029667109244518923,
"loss": 2.9904,
"step": 268
},
{
"epoch": 0.07678224588961215,
"grad_norm": 2.671875,
"learning_rate": 0.0002966425641171534,
"loss": 2.9614,
"step": 269
},
{
"epoch": 0.07706768174793784,
"grad_norm": 2.6875,
"learning_rate": 0.00029661391545145156,
"loss": 2.9671,
"step": 270
},
{
"epoch": 0.07735311760626354,
"grad_norm": 2.125,
"learning_rate": 0.00029658514647159335,
"loss": 2.9646,
"step": 271
},
{
"epoch": 0.07763855346458923,
"grad_norm": 2.640625,
"learning_rate": 0.0002965562572011872,
"loss": 2.9729,
"step": 272
},
{
"epoch": 0.07792398932291493,
"grad_norm": 1.78125,
"learning_rate": 0.00029652724766394007,
"loss": 2.9315,
"step": 273
},
{
"epoch": 0.07820942518124062,
"grad_norm": 3.015625,
"learning_rate": 0.0002964981178836578,
"loss": 2.9511,
"step": 274
},
{
"epoch": 0.07849486103956632,
"grad_norm": 2.171875,
"learning_rate": 0.00029646886788424487,
"loss": 2.9338,
"step": 275
},
{
"epoch": 0.07878029689789201,
"grad_norm": 3.296875,
"learning_rate": 0.0002964394976897043,
"loss": 2.936,
"step": 276
},
{
"epoch": 0.07906573275621771,
"grad_norm": 2.734375,
"learning_rate": 0.0002964100073241379,
"loss": 2.9335,
"step": 277
},
{
"epoch": 0.0793511686145434,
"grad_norm": 2.65625,
"learning_rate": 0.000296380396811746,
"loss": 2.9638,
"step": 278
},
{
"epoch": 0.0796366044728691,
"grad_norm": 2.203125,
"learning_rate": 0.00029635066617682754,
"loss": 2.9612,
"step": 279
},
{
"epoch": 0.0799220403311948,
"grad_norm": 2.09375,
"learning_rate": 0.00029632081544378003,
"loss": 2.9579,
"step": 280
},
{
"epoch": 0.0802074761895205,
"grad_norm": 1.734375,
"learning_rate": 0.00029629084463709957,
"loss": 2.9506,
"step": 281
},
{
"epoch": 0.08049291204784619,
"grad_norm": 1.8671875,
"learning_rate": 0.0002962607537813808,
"loss": 2.9479,
"step": 282
},
{
"epoch": 0.08077834790617189,
"grad_norm": 1.4609375,
"learning_rate": 0.0002962305429013168,
"loss": 2.9124,
"step": 283
},
{
"epoch": 0.08106378376449758,
"grad_norm": 2.03125,
"learning_rate": 0.0002962002120216992,
"loss": 2.9741,
"step": 284
},
{
"epoch": 0.08134921962282328,
"grad_norm": 1.625,
"learning_rate": 0.0002961697611674181,
"loss": 2.9481,
"step": 285
},
{
"epoch": 0.08163465548114897,
"grad_norm": 1.875,
"learning_rate": 0.00029613919036346203,
"loss": 2.9457,
"step": 286
},
{
"epoch": 0.08192009133947467,
"grad_norm": 1.78125,
"learning_rate": 0.00029610849963491797,
"loss": 2.9509,
"step": 287
},
{
"epoch": 0.08220552719780036,
"grad_norm": 2.40625,
"learning_rate": 0.0002960776890069714,
"loss": 2.9441,
"step": 288
},
{
"epoch": 0.08249096305612605,
"grad_norm": 2.03125,
"learning_rate": 0.0002960467585049059,
"loss": 2.9625,
"step": 289
},
{
"epoch": 0.08277639891445175,
"grad_norm": 1.453125,
"learning_rate": 0.0002960157081541039,
"loss": 2.9183,
"step": 290
},
{
"epoch": 0.08306183477277744,
"grad_norm": 2.0,
"learning_rate": 0.0002959845379800457,
"loss": 2.9312,
"step": 291
},
{
"epoch": 0.08334727063110314,
"grad_norm": 1.734375,
"learning_rate": 0.00029595324800831024,
"loss": 2.9224,
"step": 292
},
{
"epoch": 0.08363270648942883,
"grad_norm": 2.1875,
"learning_rate": 0.0002959218382645746,
"loss": 2.9394,
"step": 293
},
{
"epoch": 0.08391814234775453,
"grad_norm": 1.96875,
"learning_rate": 0.00029589030877461426,
"loss": 2.9493,
"step": 294
},
{
"epoch": 0.08420357820608022,
"grad_norm": 1.5078125,
"learning_rate": 0.00029585865956430283,
"loss": 2.9385,
"step": 295
},
{
"epoch": 0.08448901406440593,
"grad_norm": 2.4375,
"learning_rate": 0.00029582689065961237,
"loss": 2.9265,
"step": 296
},
{
"epoch": 0.08477444992273162,
"grad_norm": 1.609375,
"learning_rate": 0.00029579500208661296,
"loss": 2.9448,
"step": 297
},
{
"epoch": 0.08505988578105732,
"grad_norm": 1.8984375,
"learning_rate": 0.00029576299387147305,
"loss": 2.9555,
"step": 298
},
{
"epoch": 0.085345321639383,
"grad_norm": 1.6953125,
"learning_rate": 0.00029573086604045904,
"loss": 2.904,
"step": 299
},
{
"epoch": 0.08563075749770871,
"grad_norm": 2.046875,
"learning_rate": 0.0002956986186199358,
"loss": 2.959,
"step": 300
},
{
"epoch": 0.0859161933560344,
"grad_norm": 1.171875,
"learning_rate": 0.0002956662516363661,
"loss": 2.9075,
"step": 301
},
{
"epoch": 0.0862016292143601,
"grad_norm": 2.40625,
"learning_rate": 0.0002956337651163109,
"loss": 2.9521,
"step": 302
},
{
"epoch": 0.08648706507268579,
"grad_norm": 1.703125,
"learning_rate": 0.00029560115908642924,
"loss": 2.9425,
"step": 303
},
{
"epoch": 0.08677250093101149,
"grad_norm": 2.59375,
"learning_rate": 0.0002955684335734783,
"loss": 2.9626,
"step": 304
},
{
"epoch": 0.08705793678933718,
"grad_norm": 1.8515625,
"learning_rate": 0.00029553558860431317,
"loss": 2.9293,
"step": 305
},
{
"epoch": 0.08734337264766288,
"grad_norm": 2.515625,
"learning_rate": 0.0002955026242058872,
"loss": 2.9332,
"step": 306
},
{
"epoch": 0.08762880850598857,
"grad_norm": 2.03125,
"learning_rate": 0.0002954695404052514,
"loss": 2.9323,
"step": 307
},
{
"epoch": 0.08791424436431428,
"grad_norm": 2.265625,
"learning_rate": 0.0002954363372295551,
"loss": 2.9408,
"step": 308
},
{
"epoch": 0.08819968022263996,
"grad_norm": 1.859375,
"learning_rate": 0.0002954030147060454,
"loss": 2.9305,
"step": 309
},
{
"epoch": 0.08848511608096567,
"grad_norm": 2.1875,
"learning_rate": 0.0002953695728620675,
"loss": 2.9323,
"step": 310
},
{
"epoch": 0.08877055193929136,
"grad_norm": 1.7890625,
"learning_rate": 0.00029533601172506427,
"loss": 2.9138,
"step": 311
},
{
"epoch": 0.08905598779761706,
"grad_norm": 2.40625,
"learning_rate": 0.00029530233132257663,
"loss": 2.9394,
"step": 312
},
{
"epoch": 0.08934142365594275,
"grad_norm": 1.7578125,
"learning_rate": 0.00029526853168224343,
"loss": 2.8984,
"step": 313
},
{
"epoch": 0.08962685951426845,
"grad_norm": 2.359375,
"learning_rate": 0.0002952346128318013,
"loss": 2.9322,
"step": 314
},
{
"epoch": 0.08991229537259414,
"grad_norm": 1.9375,
"learning_rate": 0.00029520057479908465,
"loss": 2.9164,
"step": 315
},
{
"epoch": 0.09019773123091984,
"grad_norm": 2.234375,
"learning_rate": 0.0002951664176120257,
"loss": 2.9167,
"step": 316
},
{
"epoch": 0.09048316708924553,
"grad_norm": 1.9375,
"learning_rate": 0.00029513214129865456,
"loss": 2.9398,
"step": 317
},
{
"epoch": 0.09076860294757123,
"grad_norm": 2.21875,
"learning_rate": 0.00029509774588709896,
"loss": 2.9395,
"step": 318
},
{
"epoch": 0.09105403880589692,
"grad_norm": 1.828125,
"learning_rate": 0.00029506323140558445,
"loss": 2.9478,
"step": 319
},
{
"epoch": 0.09133947466422263,
"grad_norm": 1.9140625,
"learning_rate": 0.0002950285978824343,
"loss": 2.9216,
"step": 320
},
{
"epoch": 0.09162491052254831,
"grad_norm": 1.7109375,
"learning_rate": 0.00029499384534606936,
"loss": 2.8959,
"step": 321
},
{
"epoch": 0.09191034638087402,
"grad_norm": 1.75,
"learning_rate": 0.00029495897382500827,
"loss": 2.9072,
"step": 322
},
{
"epoch": 0.0921957822391997,
"grad_norm": 1.5234375,
"learning_rate": 0.00029492398334786727,
"loss": 2.9121,
"step": 323
},
{
"epoch": 0.09248121809752541,
"grad_norm": 2.09375,
"learning_rate": 0.0002948888739433602,
"loss": 2.9344,
"step": 324
},
{
"epoch": 0.0927666539558511,
"grad_norm": 1.765625,
"learning_rate": 0.0002948536456402985,
"loss": 2.9211,
"step": 325
},
{
"epoch": 0.0930520898141768,
"grad_norm": 1.9296875,
"learning_rate": 0.00029481829846759116,
"loss": 2.9041,
"step": 326
},
{
"epoch": 0.09333752567250249,
"grad_norm": 2.265625,
"learning_rate": 0.0002947828324542448,
"loss": 2.9353,
"step": 327
},
{
"epoch": 0.09362296153082819,
"grad_norm": 1.1328125,
"learning_rate": 0.0002947472476293634,
"loss": 2.9037,
"step": 328
},
{
"epoch": 0.09390839738915388,
"grad_norm": 1.8359375,
"learning_rate": 0.00029471154402214864,
"loss": 2.9166,
"step": 329
},
{
"epoch": 0.09419383324747958,
"grad_norm": 2.078125,
"learning_rate": 0.00029467572166189956,
"loss": 2.9074,
"step": 330
},
{
"epoch": 0.09447926910580527,
"grad_norm": 2.015625,
"learning_rate": 0.00029463978057801257,
"loss": 2.9137,
"step": 331
},
{
"epoch": 0.09476470496413097,
"grad_norm": 1.7734375,
"learning_rate": 0.00029460372079998177,
"loss": 2.8971,
"step": 332
},
{
"epoch": 0.09505014082245666,
"grad_norm": 1.296875,
"learning_rate": 0.00029456754235739833,
"loss": 2.8784,
"step": 333
},
{
"epoch": 0.09533557668078237,
"grad_norm": 2.203125,
"learning_rate": 0.0002945312452799511,
"loss": 2.9102,
"step": 334
},
{
"epoch": 0.09562101253910806,
"grad_norm": 1.34375,
"learning_rate": 0.00029449482959742604,
"loss": 2.9096,
"step": 335
},
{
"epoch": 0.09590644839743376,
"grad_norm": 1.3671875,
"learning_rate": 0.0002944582953397067,
"loss": 2.8925,
"step": 336
},
{
"epoch": 0.09619188425575945,
"grad_norm": 1.9375,
"learning_rate": 0.0002944216425367736,
"loss": 2.9094,
"step": 337
},
{
"epoch": 0.09647732011408515,
"grad_norm": 1.9140625,
"learning_rate": 0.0002943848712187048,
"loss": 2.9133,
"step": 338
},
{
"epoch": 0.09676275597241084,
"grad_norm": 1.90625,
"learning_rate": 0.0002943479814156756,
"loss": 2.9073,
"step": 339
},
{
"epoch": 0.09704819183073654,
"grad_norm": 1.2734375,
"learning_rate": 0.00029431097315795834,
"loss": 2.8993,
"step": 340
},
{
"epoch": 0.09733362768906223,
"grad_norm": 2.296875,
"learning_rate": 0.00029427384647592284,
"loss": 2.8968,
"step": 341
},
{
"epoch": 0.09761906354738793,
"grad_norm": 1.4609375,
"learning_rate": 0.0002942366014000359,
"loss": 2.9124,
"step": 342
},
{
"epoch": 0.09790449940571362,
"grad_norm": 2.484375,
"learning_rate": 0.0002941992379608615,
"loss": 2.8816,
"step": 343
},
{
"epoch": 0.09818993526403932,
"grad_norm": 1.609375,
"learning_rate": 0.00029416175618906084,
"loss": 2.9015,
"step": 344
},
{
"epoch": 0.09847537112236501,
"grad_norm": 2.6875,
"learning_rate": 0.00029412415611539214,
"loss": 2.9286,
"step": 345
},
{
"epoch": 0.09876080698069072,
"grad_norm": 2.140625,
"learning_rate": 0.00029408643777071073,
"loss": 2.9316,
"step": 346
},
{
"epoch": 0.0990462428390164,
"grad_norm": 2.28125,
"learning_rate": 0.00029404860118596905,
"loss": 2.894,
"step": 347
},
{
"epoch": 0.09933167869734211,
"grad_norm": 2.09375,
"learning_rate": 0.00029401064639221643,
"loss": 2.8946,
"step": 348
},
{
"epoch": 0.0996171145556678,
"grad_norm": 2.21875,
"learning_rate": 0.0002939725734205994,
"loss": 2.9068,
"step": 349
},
{
"epoch": 0.0999025504139935,
"grad_norm": 1.84375,
"learning_rate": 0.00029393438230236124,
"loss": 2.8898,
"step": 350
},
{
"epoch": 0.10018798627231919,
"grad_norm": 1.7578125,
"learning_rate": 0.0002938960730688424,
"loss": 2.8922,
"step": 351
},
{
"epoch": 0.10047342213064489,
"grad_norm": 1.640625,
"learning_rate": 0.00029385764575148014,
"loss": 2.8772,
"step": 352
},
{
"epoch": 0.10075885798897058,
"grad_norm": 1.7890625,
"learning_rate": 0.00029381910038180856,
"loss": 2.8961,
"step": 353
},
{
"epoch": 0.10104429384729628,
"grad_norm": 1.890625,
"learning_rate": 0.00029378043699145886,
"loss": 2.9052,
"step": 354
},
{
"epoch": 0.10132972970562197,
"grad_norm": 1.9375,
"learning_rate": 0.0002937416556121589,
"loss": 2.8703,
"step": 355
},
{
"epoch": 0.10161516556394767,
"grad_norm": 1.5625,
"learning_rate": 0.0002937027562757334,
"loss": 2.8967,
"step": 356
},
{
"epoch": 0.10190060142227336,
"grad_norm": 1.6640625,
"learning_rate": 0.00029366373901410387,
"loss": 2.913,
"step": 357
},
{
"epoch": 0.10218603728059907,
"grad_norm": 1.53125,
"learning_rate": 0.0002936246038592886,
"loss": 2.8944,
"step": 358
},
{
"epoch": 0.10247147313892475,
"grad_norm": 2.390625,
"learning_rate": 0.00029358535084340274,
"loss": 2.8808,
"step": 359
},
{
"epoch": 0.10275690899725046,
"grad_norm": 1.421875,
"learning_rate": 0.000293545979998658,
"loss": 2.9055,
"step": 360
},
{
"epoch": 0.10304234485557615,
"grad_norm": 1.71875,
"learning_rate": 0.0002935064913573628,
"loss": 2.8925,
"step": 361
},
{
"epoch": 0.10332778071390183,
"grad_norm": 2.0625,
"learning_rate": 0.0002934668849519223,
"loss": 2.8751,
"step": 362
},
{
"epoch": 0.10361321657222754,
"grad_norm": 2.015625,
"learning_rate": 0.00029342716081483825,
"loss": 2.8836,
"step": 363
},
{
"epoch": 0.10389865243055323,
"grad_norm": 1.4375,
"learning_rate": 0.0002933873189787091,
"loss": 2.8702,
"step": 364
},
{
"epoch": 0.10418408828887893,
"grad_norm": 2.375,
"learning_rate": 0.0002933473594762297,
"loss": 2.8953,
"step": 365
},
{
"epoch": 0.10446952414720462,
"grad_norm": 1.3671875,
"learning_rate": 0.00029330728234019173,
"loss": 2.8753,
"step": 366
},
{
"epoch": 0.10475496000553032,
"grad_norm": 2.90625,
"learning_rate": 0.0002932670876034831,
"loss": 2.8844,
"step": 367
},
{
"epoch": 0.10504039586385601,
"grad_norm": 2.109375,
"learning_rate": 0.00029322677529908844,
"loss": 2.9018,
"step": 368
},
{
"epoch": 0.10532583172218171,
"grad_norm": 2.328125,
"learning_rate": 0.0002931863454600888,
"loss": 2.8967,
"step": 369
},
{
"epoch": 0.1056112675805074,
"grad_norm": 2.0625,
"learning_rate": 0.0002931457981196616,
"loss": 2.882,
"step": 370
},
{
"epoch": 0.1058967034388331,
"grad_norm": 2.1875,
"learning_rate": 0.00029310513331108086,
"loss": 2.8641,
"step": 371
},
{
"epoch": 0.10618213929715879,
"grad_norm": 1.78125,
"learning_rate": 0.0002930643510677168,
"loss": 2.8808,
"step": 372
},
{
"epoch": 0.1064675751554845,
"grad_norm": 1.703125,
"learning_rate": 0.00029302345142303616,
"loss": 2.8699,
"step": 373
},
{
"epoch": 0.10675301101381018,
"grad_norm": 1.8984375,
"learning_rate": 0.0002929824344106019,
"loss": 2.8467,
"step": 374
},
{
"epoch": 0.10703844687213589,
"grad_norm": 1.546875,
"learning_rate": 0.0002929413000640735,
"loss": 2.8674,
"step": 375
},
{
"epoch": 0.10732388273046158,
"grad_norm": 2.265625,
"learning_rate": 0.0002929000484172064,
"loss": 2.8897,
"step": 376
},
{
"epoch": 0.10760931858878728,
"grad_norm": 1.328125,
"learning_rate": 0.00029285867950385255,
"loss": 2.8601,
"step": 377
},
{
"epoch": 0.10789475444711297,
"grad_norm": 2.65625,
"learning_rate": 0.00029281719335796013,
"loss": 2.89,
"step": 378
},
{
"epoch": 0.10818019030543867,
"grad_norm": 2.046875,
"learning_rate": 0.00029277559001357343,
"loss": 2.9044,
"step": 379
},
{
"epoch": 0.10846562616376436,
"grad_norm": 2.109375,
"learning_rate": 0.00029273386950483287,
"loss": 2.8765,
"step": 380
},
{
"epoch": 0.10875106202209006,
"grad_norm": 1.7265625,
"learning_rate": 0.00029269203186597513,
"loss": 2.8911,
"step": 381
},
{
"epoch": 0.10903649788041575,
"grad_norm": 2.296875,
"learning_rate": 0.00029265007713133304,
"loss": 2.8756,
"step": 382
},
{
"epoch": 0.10932193373874145,
"grad_norm": 1.5703125,
"learning_rate": 0.00029260800533533534,
"loss": 2.889,
"step": 383
},
{
"epoch": 0.10960736959706714,
"grad_norm": 2.609375,
"learning_rate": 0.000292565816512507,
"loss": 2.8758,
"step": 384
},
{
"epoch": 0.10989280545539284,
"grad_norm": 2.125,
"learning_rate": 0.000292523510697469,
"loss": 2.8699,
"step": 385
},
{
"epoch": 0.11017824131371853,
"grad_norm": 2.515625,
"learning_rate": 0.0002924810879249382,
"loss": 2.8935,
"step": 386
},
{
"epoch": 0.11046367717204424,
"grad_norm": 2.015625,
"learning_rate": 0.00029243854822972763,
"loss": 2.8723,
"step": 387
},
{
"epoch": 0.11074911303036993,
"grad_norm": 2.21875,
"learning_rate": 0.0002923958916467461,
"loss": 2.894,
"step": 388
},
{
"epoch": 0.11103454888869563,
"grad_norm": 2.0625,
"learning_rate": 0.00029235311821099847,
"loss": 2.8676,
"step": 389
},
{
"epoch": 0.11131998474702132,
"grad_norm": 1.75,
"learning_rate": 0.00029231022795758537,
"loss": 2.8786,
"step": 390
},
{
"epoch": 0.11160542060534702,
"grad_norm": 1.671875,
"learning_rate": 0.0002922672209217033,
"loss": 2.867,
"step": 391
},
{
"epoch": 0.11189085646367271,
"grad_norm": 1.8359375,
"learning_rate": 0.00029222409713864484,
"loss": 2.8938,
"step": 392
},
{
"epoch": 0.11217629232199841,
"grad_norm": 1.6640625,
"learning_rate": 0.00029218085664379806,
"loss": 2.8601,
"step": 393
},
{
"epoch": 0.1124617281803241,
"grad_norm": 1.5078125,
"learning_rate": 0.0002921374994726469,
"loss": 2.8817,
"step": 394
},
{
"epoch": 0.1127471640386498,
"grad_norm": 1.6015625,
"learning_rate": 0.0002920940256607711,
"loss": 2.8482,
"step": 395
},
{
"epoch": 0.11303259989697549,
"grad_norm": 1.859375,
"learning_rate": 0.0002920504352438462,
"loss": 2.8996,
"step": 396
},
{
"epoch": 0.1133180357553012,
"grad_norm": 2.015625,
"learning_rate": 0.00029200672825764314,
"loss": 2.8592,
"step": 397
},
{
"epoch": 0.11360347161362688,
"grad_norm": 1.46875,
"learning_rate": 0.00029196290473802885,
"loss": 2.8327,
"step": 398
},
{
"epoch": 0.11388890747195259,
"grad_norm": 1.515625,
"learning_rate": 0.0002919189647209656,
"loss": 2.8438,
"step": 399
},
{
"epoch": 0.11417434333027827,
"grad_norm": 1.3359375,
"learning_rate": 0.00029187490824251154,
"loss": 2.884,
"step": 400
},
{
"epoch": 0.11445977918860398,
"grad_norm": 1.828125,
"learning_rate": 0.00029183073533882025,
"loss": 2.8601,
"step": 401
},
{
"epoch": 0.11474521504692967,
"grad_norm": 1.96875,
"learning_rate": 0.00029178644604614077,
"loss": 2.8788,
"step": 402
},
{
"epoch": 0.11503065090525537,
"grad_norm": 1.7890625,
"learning_rate": 0.00029174204040081773,
"loss": 2.8823,
"step": 403
},
{
"epoch": 0.11531608676358106,
"grad_norm": 1.7265625,
"learning_rate": 0.0002916975184392912,
"loss": 2.8464,
"step": 404
},
{
"epoch": 0.11560152262190676,
"grad_norm": 1.0625,
"learning_rate": 0.0002916528801980969,
"loss": 2.8377,
"step": 405
},
{
"epoch": 0.11588695848023245,
"grad_norm": 2.109375,
"learning_rate": 0.00029160812571386575,
"loss": 2.8409,
"step": 406
},
{
"epoch": 0.11617239433855815,
"grad_norm": 1.4609375,
"learning_rate": 0.00029156325502332413,
"loss": 2.8581,
"step": 407
},
{
"epoch": 0.11645783019688384,
"grad_norm": 1.9296875,
"learning_rate": 0.00029151826816329365,
"loss": 2.865,
"step": 408
},
{
"epoch": 0.11674326605520954,
"grad_norm": 1.71875,
"learning_rate": 0.00029147316517069157,
"loss": 2.8527,
"step": 409
},
{
"epoch": 0.11702870191353523,
"grad_norm": 1.4375,
"learning_rate": 0.00029142794608253016,
"loss": 2.8494,
"step": 410
},
{
"epoch": 0.11731413777186094,
"grad_norm": 3.875,
"learning_rate": 0.0002913826109359171,
"loss": 2.8461,
"step": 411
},
{
"epoch": 0.11759957363018662,
"grad_norm": 1.875,
"learning_rate": 0.00029133715976805525,
"loss": 2.8565,
"step": 412
},
{
"epoch": 0.11788500948851233,
"grad_norm": 3.125,
"learning_rate": 0.0002912915926162427,
"loss": 2.8667,
"step": 413
},
{
"epoch": 0.11817044534683802,
"grad_norm": 2.0625,
"learning_rate": 0.00029124590951787267,
"loss": 2.8504,
"step": 414
},
{
"epoch": 0.11845588120516372,
"grad_norm": 3.46875,
"learning_rate": 0.0002912001105104337,
"loss": 2.8719,
"step": 415
},
{
"epoch": 0.11874131706348941,
"grad_norm": 2.171875,
"learning_rate": 0.00029115419563150916,
"loss": 2.8702,
"step": 416
},
{
"epoch": 0.11902675292181511,
"grad_norm": 4.71875,
"learning_rate": 0.0002911081649187778,
"loss": 2.8971,
"step": 417
},
{
"epoch": 0.1193121887801408,
"grad_norm": 3.828125,
"learning_rate": 0.0002910620184100133,
"loss": 2.9119,
"step": 418
},
{
"epoch": 0.1195976246384665,
"grad_norm": 4.25,
"learning_rate": 0.0002910157561430842,
"loss": 2.8927,
"step": 419
},
{
"epoch": 0.11988306049679219,
"grad_norm": 3.65625,
"learning_rate": 0.0002909693781559544,
"loss": 2.861,
"step": 420
},
{
"epoch": 0.1201684963551179,
"grad_norm": 3.59375,
"learning_rate": 0.0002909228844866824,
"loss": 2.8826,
"step": 421
},
{
"epoch": 0.12045393221344358,
"grad_norm": 3.265625,
"learning_rate": 0.0002908762751734219,
"loss": 2.8495,
"step": 422
},
{
"epoch": 0.12073936807176928,
"grad_norm": 3.484375,
"learning_rate": 0.0002908295502544213,
"loss": 2.8707,
"step": 423
},
{
"epoch": 0.12102480393009497,
"grad_norm": 2.828125,
"learning_rate": 0.00029078270976802393,
"loss": 2.8647,
"step": 424
},
{
"epoch": 0.12131023978842068,
"grad_norm": 3.515625,
"learning_rate": 0.00029073575375266806,
"loss": 2.8505,
"step": 425
},
{
"epoch": 0.12159567564674637,
"grad_norm": 2.5,
"learning_rate": 0.0002906886822468867,
"loss": 2.8821,
"step": 426
},
{
"epoch": 0.12188111150507207,
"grad_norm": 5.4375,
"learning_rate": 0.0002906414952893075,
"loss": 2.8788,
"step": 427
},
{
"epoch": 0.12216654736339776,
"grad_norm": 4.40625,
"learning_rate": 0.00029059419291865314,
"loss": 2.8715,
"step": 428
},
{
"epoch": 0.12245198322172346,
"grad_norm": 3.6875,
"learning_rate": 0.0002905467751737407,
"loss": 2.846,
"step": 429
},
{
"epoch": 0.12273741908004915,
"grad_norm": 3.765625,
"learning_rate": 0.00029049924209348214,
"loss": 2.856,
"step": 430
},
{
"epoch": 0.12302285493837485,
"grad_norm": 2.90625,
"learning_rate": 0.000290451593716884,
"loss": 2.8646,
"step": 431
},
{
"epoch": 0.12330829079670054,
"grad_norm": 2.890625,
"learning_rate": 0.00029040383008304744,
"loss": 2.8408,
"step": 432
},
{
"epoch": 0.12359372665502623,
"grad_norm": 2.984375,
"learning_rate": 0.00029035595123116817,
"loss": 2.8501,
"step": 433
},
{
"epoch": 0.12387916251335193,
"grad_norm": 2.53125,
"learning_rate": 0.0002903079572005365,
"loss": 2.8384,
"step": 434
},
{
"epoch": 0.12416459837167762,
"grad_norm": 3.40625,
"learning_rate": 0.00029025984803053735,
"loss": 2.8436,
"step": 435
},
{
"epoch": 0.12445003423000332,
"grad_norm": 2.984375,
"learning_rate": 0.0002902116237606498,
"loss": 2.8543,
"step": 436
},
{
"epoch": 0.12473547008832901,
"grad_norm": 3.65625,
"learning_rate": 0.0002901632844304478,
"loss": 2.8469,
"step": 437
},
{
"epoch": 0.12502090594665471,
"grad_norm": 3.40625,
"learning_rate": 0.0002901148300795994,
"loss": 2.8636,
"step": 438
},
{
"epoch": 0.1253063418049804,
"grad_norm": 3.546875,
"learning_rate": 0.0002900662607478672,
"loss": 2.8424,
"step": 439
},
{
"epoch": 0.1255917776633061,
"grad_norm": 3.265625,
"learning_rate": 0.00029001757647510815,
"loss": 2.8493,
"step": 440
},
{
"epoch": 0.1258772135216318,
"grad_norm": 2.890625,
"learning_rate": 0.0002899687773012734,
"loss": 2.8214,
"step": 441
},
{
"epoch": 0.1261626493799575,
"grad_norm": 2.75,
"learning_rate": 0.0002899198632664086,
"loss": 2.8492,
"step": 442
},
{
"epoch": 0.1264480852382832,
"grad_norm": 3.6875,
"learning_rate": 0.0002898708344106533,
"loss": 2.8111,
"step": 443
},
{
"epoch": 0.12673352109660888,
"grad_norm": 3.546875,
"learning_rate": 0.0002898216907742418,
"loss": 2.8513,
"step": 444
},
{
"epoch": 0.1270189569549346,
"grad_norm": 2.671875,
"learning_rate": 0.0002897724323975021,
"loss": 2.8602,
"step": 445
},
{
"epoch": 0.12730439281326028,
"grad_norm": 2.578125,
"learning_rate": 0.0002897230593208567,
"loss": 2.8462,
"step": 446
},
{
"epoch": 0.12758982867158597,
"grad_norm": 2.828125,
"learning_rate": 0.00028967357158482196,
"loss": 2.8422,
"step": 447
},
{
"epoch": 0.12787526452991166,
"grad_norm": 2.375,
"learning_rate": 0.00028962396923000846,
"loss": 2.8382,
"step": 448
},
{
"epoch": 0.12816070038823738,
"grad_norm": 3.6875,
"learning_rate": 0.0002895742522971209,
"loss": 2.8544,
"step": 449
},
{
"epoch": 0.12844613624656306,
"grad_norm": 3.640625,
"learning_rate": 0.0002895244208269579,
"loss": 2.8542,
"step": 450
},
{
"epoch": 0.12873157210488875,
"grad_norm": 2.40625,
"learning_rate": 0.0002894744748604121,
"loss": 2.8417,
"step": 451
},
{
"epoch": 0.12901700796321444,
"grad_norm": 2.375,
"learning_rate": 0.0002894244144384701,
"loss": 2.8588,
"step": 452
},
{
"epoch": 0.12930244382154016,
"grad_norm": 2.765625,
"learning_rate": 0.0002893742396022125,
"loss": 2.8388,
"step": 453
},
{
"epoch": 0.12958787967986585,
"grad_norm": 2.359375,
"learning_rate": 0.0002893239503928137,
"loss": 2.8559,
"step": 454
},
{
"epoch": 0.12987331553819154,
"grad_norm": 3.96875,
"learning_rate": 0.00028927354685154185,
"loss": 2.8341,
"step": 455
},
{
"epoch": 0.13015875139651722,
"grad_norm": 3.765625,
"learning_rate": 0.0002892230290197592,
"loss": 2.8267,
"step": 456
},
{
"epoch": 0.13044418725484294,
"grad_norm": 2.078125,
"learning_rate": 0.0002891723969389216,
"loss": 2.8497,
"step": 457
},
{
"epoch": 0.13072962311316863,
"grad_norm": 2.0,
"learning_rate": 0.0002891216506505787,
"loss": 2.8252,
"step": 458
},
{
"epoch": 0.13101505897149432,
"grad_norm": 3.5625,
"learning_rate": 0.0002890707901963738,
"loss": 2.8563,
"step": 459
},
{
"epoch": 0.13130049482982,
"grad_norm": 3.171875,
"learning_rate": 0.00028901981561804403,
"loss": 2.861,
"step": 460
},
{
"epoch": 0.13158593068814572,
"grad_norm": 2.734375,
"learning_rate": 0.0002889687269574201,
"loss": 2.8336,
"step": 461
},
{
"epoch": 0.1318713665464714,
"grad_norm": 2.78125,
"learning_rate": 0.0002889175242564263,
"loss": 2.8575,
"step": 462
},
{
"epoch": 0.1321568024047971,
"grad_norm": 2.265625,
"learning_rate": 0.00028886620755708045,
"loss": 2.8301,
"step": 463
},
{
"epoch": 0.1324422382631228,
"grad_norm": 2.015625,
"learning_rate": 0.0002888147769014942,
"loss": 2.8299,
"step": 464
},
{
"epoch": 0.1327276741214485,
"grad_norm": 3.1875,
"learning_rate": 0.0002887632323318723,
"loss": 2.8261,
"step": 465
},
{
"epoch": 0.1330131099797742,
"grad_norm": 2.84375,
"learning_rate": 0.0002887115738905134,
"loss": 2.8398,
"step": 466
},
{
"epoch": 0.13329854583809989,
"grad_norm": 2.828125,
"learning_rate": 0.0002886598016198093,
"loss": 2.8414,
"step": 467
},
{
"epoch": 0.13358398169642557,
"grad_norm": 2.640625,
"learning_rate": 0.00028860791556224524,
"loss": 2.8286,
"step": 468
},
{
"epoch": 0.1338694175547513,
"grad_norm": 2.8125,
"learning_rate": 0.00028855591576040004,
"loss": 2.8641,
"step": 469
},
{
"epoch": 0.13415485341307698,
"grad_norm": 2.65625,
"learning_rate": 0.0002885038022569457,
"loss": 2.8478,
"step": 470
},
{
"epoch": 0.13444028927140267,
"grad_norm": 2.703125,
"learning_rate": 0.0002884515750946474,
"loss": 2.8215,
"step": 471
},
{
"epoch": 0.13472572512972836,
"grad_norm": 2.515625,
"learning_rate": 0.0002883992343163639,
"loss": 2.8004,
"step": 472
},
{
"epoch": 0.13501116098805407,
"grad_norm": 2.75,
"learning_rate": 0.00028834677996504696,
"loss": 2.8395,
"step": 473
},
{
"epoch": 0.13529659684637976,
"grad_norm": 2.625,
"learning_rate": 0.00028829421208374166,
"loss": 2.8313,
"step": 474
},
{
"epoch": 0.13558203270470545,
"grad_norm": 2.640625,
"learning_rate": 0.0002882415307155862,
"loss": 2.841,
"step": 475
},
{
"epoch": 0.13586746856303114,
"grad_norm": 2.4375,
"learning_rate": 0.00028818873590381183,
"loss": 2.8614,
"step": 476
},
{
"epoch": 0.13615290442135686,
"grad_norm": 2.71875,
"learning_rate": 0.000288135827691743,
"loss": 2.8482,
"step": 477
},
{
"epoch": 0.13643834027968255,
"grad_norm": 2.5,
"learning_rate": 0.0002880828061227973,
"loss": 2.8532,
"step": 478
},
{
"epoch": 0.13672377613800824,
"grad_norm": 2.75,
"learning_rate": 0.0002880296712404851,
"loss": 2.8337,
"step": 479
},
{
"epoch": 0.13700921199633392,
"grad_norm": 2.578125,
"learning_rate": 0.0002879764230884099,
"loss": 2.8183,
"step": 480
},
{
"epoch": 0.13729464785465964,
"grad_norm": 2.515625,
"learning_rate": 0.00028792306171026823,
"loss": 2.8161,
"step": 481
},
{
"epoch": 0.13758008371298533,
"grad_norm": 2.390625,
"learning_rate": 0.00028786958714984936,
"loss": 2.8174,
"step": 482
},
{
"epoch": 0.13786551957131102,
"grad_norm": 2.609375,
"learning_rate": 0.0002878159994510356,
"loss": 2.8075,
"step": 483
},
{
"epoch": 0.1381509554296367,
"grad_norm": 2.5,
"learning_rate": 0.00028776229865780205,
"loss": 2.8157,
"step": 484
},
{
"epoch": 0.13843639128796242,
"grad_norm": 2.453125,
"learning_rate": 0.0002877084848142165,
"loss": 2.8291,
"step": 485
},
{
"epoch": 0.1387218271462881,
"grad_norm": 2.375,
"learning_rate": 0.0002876545579644396,
"loss": 2.8247,
"step": 486
},
{
"epoch": 0.1390072630046138,
"grad_norm": 2.671875,
"learning_rate": 0.0002876005181527249,
"loss": 2.8366,
"step": 487
},
{
"epoch": 0.1392926988629395,
"grad_norm": 2.546875,
"learning_rate": 0.0002875463654234183,
"loss": 2.8679,
"step": 488
},
{
"epoch": 0.1395781347212652,
"grad_norm": 2.5625,
"learning_rate": 0.0002874920998209587,
"loss": 2.8432,
"step": 489
},
{
"epoch": 0.1398635705795909,
"grad_norm": 2.40625,
"learning_rate": 0.00028743772138987745,
"loss": 2.8366,
"step": 490
},
{
"epoch": 0.14014900643791658,
"grad_norm": 2.546875,
"learning_rate": 0.0002873832301747985,
"loss": 2.8279,
"step": 491
},
{
"epoch": 0.14043444229624227,
"grad_norm": 2.28125,
"learning_rate": 0.00028732862622043835,
"loss": 2.7933,
"step": 492
},
{
"epoch": 0.140719878154568,
"grad_norm": 2.671875,
"learning_rate": 0.000287273909571606,
"loss": 2.8563,
"step": 493
},
{
"epoch": 0.14100531401289368,
"grad_norm": 2.546875,
"learning_rate": 0.00028721908027320314,
"loss": 2.858,
"step": 494
},
{
"epoch": 0.14129074987121937,
"grad_norm": 2.390625,
"learning_rate": 0.00028716413837022355,
"loss": 2.7946,
"step": 495
},
{
"epoch": 0.14157618572954506,
"grad_norm": 2.15625,
"learning_rate": 0.0002871090839077537,
"loss": 2.7874,
"step": 496
},
{
"epoch": 0.14186162158787077,
"grad_norm": 2.546875,
"learning_rate": 0.0002870539169309723,
"loss": 2.8255,
"step": 497
},
{
"epoch": 0.14214705744619646,
"grad_norm": 2.453125,
"learning_rate": 0.0002869986374851504,
"loss": 2.8218,
"step": 498
},
{
"epoch": 0.14243249330452215,
"grad_norm": 2.46875,
"learning_rate": 0.00028694324561565136,
"loss": 2.8197,
"step": 499
},
{
"epoch": 0.14271792916284784,
"grad_norm": 2.296875,
"learning_rate": 0.00028688774136793085,
"loss": 2.8208,
"step": 500
},
{
"epoch": 0.14271792916284784,
"eval_loss": 2.6708414554595947,
"eval_runtime": 6008.9725,
"eval_samples_per_second": 10.698,
"eval_steps_per_second": 10.698,
"step": 500
},
{
"epoch": 0.14300336502117356,
"grad_norm": 2.34375,
"learning_rate": 0.00028683212478753663,
"loss": 2.8263,
"step": 501
},
{
"epoch": 0.14328880087949925,
"grad_norm": 2.046875,
"learning_rate": 0.00028677639592010874,
"loss": 2.8395,
"step": 502
},
{
"epoch": 0.14357423673782493,
"grad_norm": 2.828125,
"learning_rate": 0.00028672055481137937,
"loss": 2.815,
"step": 503
},
{
"epoch": 0.14385967259615062,
"grad_norm": 2.53125,
"learning_rate": 0.0002866646015071728,
"loss": 2.8157,
"step": 504
},
{
"epoch": 0.1441451084544763,
"grad_norm": 2.421875,
"learning_rate": 0.0002866085360534053,
"loss": 2.8449,
"step": 505
},
{
"epoch": 0.14443054431280203,
"grad_norm": 2.203125,
"learning_rate": 0.00028655235849608533,
"loss": 2.7893,
"step": 506
},
{
"epoch": 0.14471598017112772,
"grad_norm": 2.359375,
"learning_rate": 0.00028649606888131327,
"loss": 2.8099,
"step": 507
},
{
"epoch": 0.1450014160294534,
"grad_norm": 1.9453125,
"learning_rate": 0.00028643966725528134,
"loss": 2.8032,
"step": 508
},
{
"epoch": 0.1452868518877791,
"grad_norm": 2.921875,
"learning_rate": 0.0002863831536642739,
"loss": 2.8453,
"step": 509
},
{
"epoch": 0.1455722877461048,
"grad_norm": 2.59375,
"learning_rate": 0.0002863265281546669,
"loss": 2.7995,
"step": 510
},
{
"epoch": 0.1458577236044305,
"grad_norm": 2.1875,
"learning_rate": 0.0002862697907729285,
"loss": 2.8297,
"step": 511
},
{
"epoch": 0.1461431594627562,
"grad_norm": 2.015625,
"learning_rate": 0.00028621294156561843,
"loss": 2.7948,
"step": 512
},
{
"epoch": 0.14642859532108188,
"grad_norm": 2.5,
"learning_rate": 0.0002861559805793881,
"loss": 2.8182,
"step": 513
},
{
"epoch": 0.1467140311794076,
"grad_norm": 2.203125,
"learning_rate": 0.0002860989078609809,
"loss": 2.8126,
"step": 514
},
{
"epoch": 0.14699946703773328,
"grad_norm": 2.5625,
"learning_rate": 0.00028604172345723174,
"loss": 2.8018,
"step": 515
},
{
"epoch": 0.14728490289605897,
"grad_norm": 2.421875,
"learning_rate": 0.00028598442741506724,
"loss": 2.8455,
"step": 516
},
{
"epoch": 0.14757033875438466,
"grad_norm": 2.671875,
"learning_rate": 0.0002859270197815056,
"loss": 2.82,
"step": 517
},
{
"epoch": 0.14785577461271038,
"grad_norm": 2.375,
"learning_rate": 0.0002858695006036566,
"loss": 2.8428,
"step": 518
},
{
"epoch": 0.14814121047103607,
"grad_norm": 2.828125,
"learning_rate": 0.0002858118699287216,
"loss": 2.8128,
"step": 519
},
{
"epoch": 0.14842664632936176,
"grad_norm": 2.53125,
"learning_rate": 0.00028575412780399345,
"loss": 2.8563,
"step": 520
},
{
"epoch": 0.14871208218768744,
"grad_norm": 3.28125,
"learning_rate": 0.00028569627427685627,
"loss": 2.8428,
"step": 521
},
{
"epoch": 0.14899751804601316,
"grad_norm": 2.5625,
"learning_rate": 0.000285638309394786,
"loss": 2.8274,
"step": 522
},
{
"epoch": 0.14928295390433885,
"grad_norm": 3.84375,
"learning_rate": 0.0002855802332053496,
"loss": 2.8169,
"step": 523
},
{
"epoch": 0.14956838976266454,
"grad_norm": 3.453125,
"learning_rate": 0.00028552204575620543,
"loss": 2.828,
"step": 524
},
{
"epoch": 0.14985382562099023,
"grad_norm": 2.53125,
"learning_rate": 0.0002854637470951033,
"loss": 2.8265,
"step": 525
},
{
"epoch": 0.15013926147931594,
"grad_norm": 2.484375,
"learning_rate": 0.00028540533726988414,
"loss": 2.853,
"step": 526
},
{
"epoch": 0.15042469733764163,
"grad_norm": 2.328125,
"learning_rate": 0.00028534681632848025,
"loss": 2.8193,
"step": 527
},
{
"epoch": 0.15071013319596732,
"grad_norm": 2.015625,
"learning_rate": 0.0002852881843189149,
"loss": 2.8112,
"step": 528
},
{
"epoch": 0.150995569054293,
"grad_norm": 2.71875,
"learning_rate": 0.0002852294412893027,
"loss": 2.8376,
"step": 529
},
{
"epoch": 0.15128100491261873,
"grad_norm": 2.421875,
"learning_rate": 0.00028517058728784933,
"loss": 2.8126,
"step": 530
},
{
"epoch": 0.15156644077094442,
"grad_norm": 2.5625,
"learning_rate": 0.0002851116223628514,
"loss": 2.8375,
"step": 531
},
{
"epoch": 0.1518518766292701,
"grad_norm": 2.40625,
"learning_rate": 0.00028505254656269673,
"loss": 2.8186,
"step": 532
},
{
"epoch": 0.1521373124875958,
"grad_norm": 2.3125,
"learning_rate": 0.00028499335993586403,
"loss": 2.8437,
"step": 533
},
{
"epoch": 0.1524227483459215,
"grad_norm": 1.96875,
"learning_rate": 0.0002849340625309229,
"loss": 2.7927,
"step": 534
},
{
"epoch": 0.1527081842042472,
"grad_norm": 2.578125,
"learning_rate": 0.000284874654396534,
"loss": 2.8123,
"step": 535
},
{
"epoch": 0.1529936200625729,
"grad_norm": 2.171875,
"learning_rate": 0.0002848151355814487,
"loss": 2.8459,
"step": 536
},
{
"epoch": 0.15327905592089858,
"grad_norm": 2.953125,
"learning_rate": 0.0002847555061345093,
"loss": 2.8225,
"step": 537
},
{
"epoch": 0.1535644917792243,
"grad_norm": 2.84375,
"learning_rate": 0.0002846957661046488,
"loss": 2.8028,
"step": 538
},
{
"epoch": 0.15384992763754998,
"grad_norm": 2.03125,
"learning_rate": 0.0002846359155408911,
"loss": 2.8167,
"step": 539
},
{
"epoch": 0.15413536349587567,
"grad_norm": 1.8984375,
"learning_rate": 0.0002845759544923507,
"loss": 2.83,
"step": 540
},
{
"epoch": 0.15442079935420136,
"grad_norm": 2.578125,
"learning_rate": 0.00028451588300823266,
"loss": 2.8233,
"step": 541
},
{
"epoch": 0.15470623521252708,
"grad_norm": 2.03125,
"learning_rate": 0.0002844557011378328,
"loss": 2.8076,
"step": 542
},
{
"epoch": 0.15499167107085277,
"grad_norm": 2.734375,
"learning_rate": 0.00028439540893053766,
"loss": 2.8473,
"step": 543
},
{
"epoch": 0.15527710692917845,
"grad_norm": 2.5625,
"learning_rate": 0.000284335006435824,
"loss": 2.8175,
"step": 544
},
{
"epoch": 0.15556254278750414,
"grad_norm": 2.28125,
"learning_rate": 0.00028427449370325937,
"loss": 2.8237,
"step": 545
},
{
"epoch": 0.15584797864582986,
"grad_norm": 1.9921875,
"learning_rate": 0.0002842138707825015,
"loss": 2.8176,
"step": 546
},
{
"epoch": 0.15613341450415555,
"grad_norm": 2.421875,
"learning_rate": 0.0002841531377232989,
"loss": 2.8295,
"step": 547
},
{
"epoch": 0.15641885036248124,
"grad_norm": 1.9453125,
"learning_rate": 0.0002840922945754901,
"loss": 2.8035,
"step": 548
},
{
"epoch": 0.15670428622080693,
"grad_norm": 2.921875,
"learning_rate": 0.00028403134138900427,
"loss": 2.8217,
"step": 549
},
{
"epoch": 0.15698972207913264,
"grad_norm": 2.546875,
"learning_rate": 0.0002839702782138607,
"loss": 2.8093,
"step": 550
},
{
"epoch": 0.15727515793745833,
"grad_norm": 2.296875,
"learning_rate": 0.00028390910510016896,
"loss": 2.8026,
"step": 551
},
{
"epoch": 0.15756059379578402,
"grad_norm": 2.34375,
"learning_rate": 0.00028384782209812893,
"loss": 2.8124,
"step": 552
},
{
"epoch": 0.1578460296541097,
"grad_norm": 1.84375,
"learning_rate": 0.0002837864292580305,
"loss": 2.8342,
"step": 553
},
{
"epoch": 0.15813146551243543,
"grad_norm": 1.75,
"learning_rate": 0.00028372492663025393,
"loss": 2.7897,
"step": 554
},
{
"epoch": 0.15841690137076112,
"grad_norm": 1.703125,
"learning_rate": 0.0002836633142652693,
"loss": 2.8149,
"step": 555
},
{
"epoch": 0.1587023372290868,
"grad_norm": 1.390625,
"learning_rate": 0.00028360159221363704,
"loss": 2.8298,
"step": 556
},
{
"epoch": 0.1589877730874125,
"grad_norm": 2.453125,
"learning_rate": 0.00028353976052600727,
"loss": 2.8108,
"step": 557
},
{
"epoch": 0.1592732089457382,
"grad_norm": 1.8125,
"learning_rate": 0.0002834778192531204,
"loss": 2.7943,
"step": 558
},
{
"epoch": 0.1595586448040639,
"grad_norm": 2.890625,
"learning_rate": 0.00028341576844580647,
"loss": 2.8394,
"step": 559
},
{
"epoch": 0.1598440806623896,
"grad_norm": 2.796875,
"learning_rate": 0.00028335360815498565,
"loss": 2.8056,
"step": 560
},
{
"epoch": 0.16012951652071528,
"grad_norm": 1.8515625,
"learning_rate": 0.00028329133843166786,
"loss": 2.8123,
"step": 561
},
{
"epoch": 0.160414952379041,
"grad_norm": 2.515625,
"learning_rate": 0.0002832289593269527,
"loss": 2.8239,
"step": 562
},
{
"epoch": 0.16070038823736668,
"grad_norm": 1.8203125,
"learning_rate": 0.00028316647089202975,
"loss": 2.8298,
"step": 563
},
{
"epoch": 0.16098582409569237,
"grad_norm": 2.875,
"learning_rate": 0.0002831038731781782,
"loss": 2.839,
"step": 564
},
{
"epoch": 0.16127125995401806,
"grad_norm": 2.65625,
"learning_rate": 0.00028304116623676685,
"loss": 2.8498,
"step": 565
},
{
"epoch": 0.16155669581234378,
"grad_norm": 2.21875,
"learning_rate": 0.0002829783501192542,
"loss": 2.8228,
"step": 566
},
{
"epoch": 0.16184213167066946,
"grad_norm": 2.109375,
"learning_rate": 0.0002829154248771885,
"loss": 2.8171,
"step": 567
},
{
"epoch": 0.16212756752899515,
"grad_norm": 1.9453125,
"learning_rate": 0.00028285239056220724,
"loss": 2.7826,
"step": 568
},
{
"epoch": 0.16241300338732084,
"grad_norm": 1.5078125,
"learning_rate": 0.0002827892472260376,
"loss": 2.8087,
"step": 569
},
{
"epoch": 0.16269843924564656,
"grad_norm": 2.046875,
"learning_rate": 0.00028272599492049625,
"loss": 2.7997,
"step": 570
},
{
"epoch": 0.16298387510397225,
"grad_norm": 1.609375,
"learning_rate": 0.00028266263369748916,
"loss": 2.8093,
"step": 571
},
{
"epoch": 0.16326931096229794,
"grad_norm": 1.765625,
"learning_rate": 0.0002825991636090118,
"loss": 2.7765,
"step": 572
},
{
"epoch": 0.16355474682062363,
"grad_norm": 1.3984375,
"learning_rate": 0.0002825355847071489,
"loss": 2.8033,
"step": 573
},
{
"epoch": 0.16384018267894934,
"grad_norm": 50.75,
"learning_rate": 0.00028247189704407456,
"loss": 2.8378,
"step": 574
},
{
"epoch": 0.16412561853727503,
"grad_norm": 4.03125,
"learning_rate": 0.000282408100672052,
"loss": 2.8366,
"step": 575
},
{
"epoch": 0.16441105439560072,
"grad_norm": 2.625,
"learning_rate": 0.0002823441956434338,
"loss": 2.8565,
"step": 576
},
{
"epoch": 0.1646964902539264,
"grad_norm": 3.15625,
"learning_rate": 0.0002822801820106617,
"loss": 2.8216,
"step": 577
},
{
"epoch": 0.1649819261122521,
"grad_norm": 2.84375,
"learning_rate": 0.0002822160598262663,
"loss": 2.8249,
"step": 578
},
{
"epoch": 0.16526736197057781,
"grad_norm": 2.25,
"learning_rate": 0.00028215182914286766,
"loss": 2.8343,
"step": 579
},
{
"epoch": 0.1655527978289035,
"grad_norm": 2.53125,
"learning_rate": 0.0002820874900131746,
"loss": 2.8027,
"step": 580
},
{
"epoch": 0.1658382336872292,
"grad_norm": 2.21875,
"learning_rate": 0.00028202304248998506,
"loss": 2.8204,
"step": 581
},
{
"epoch": 0.16612366954555488,
"grad_norm": 2.078125,
"learning_rate": 0.0002819584866261859,
"loss": 2.8122,
"step": 582
},
{
"epoch": 0.1664091054038806,
"grad_norm": 1.828125,
"learning_rate": 0.0002818938224747529,
"loss": 2.816,
"step": 583
},
{
"epoch": 0.16669454126220629,
"grad_norm": 1.5390625,
"learning_rate": 0.0002818290500887506,
"loss": 2.8286,
"step": 584
},
{
"epoch": 0.16697997712053197,
"grad_norm": 2.0625,
"learning_rate": 0.0002817641695213327,
"loss": 2.8046,
"step": 585
},
{
"epoch": 0.16726541297885766,
"grad_norm": 1.3359375,
"learning_rate": 0.00028169918082574105,
"loss": 2.8249,
"step": 586
},
{
"epoch": 0.16755084883718338,
"grad_norm": 1.9921875,
"learning_rate": 0.0002816340840553069,
"loss": 2.8051,
"step": 587
},
{
"epoch": 0.16783628469550907,
"grad_norm": 1.4140625,
"learning_rate": 0.00028156887926344975,
"loss": 2.8328,
"step": 588
},
{
"epoch": 0.16812172055383476,
"grad_norm": 2.1875,
"learning_rate": 0.00028150356650367796,
"loss": 2.8087,
"step": 589
},
{
"epoch": 0.16840715641216045,
"grad_norm": 1.7421875,
"learning_rate": 0.00028143814582958827,
"loss": 2.7976,
"step": 590
},
{
"epoch": 0.16869259227048616,
"grad_norm": 2.671875,
"learning_rate": 0.0002813726172948664,
"loss": 2.8238,
"step": 591
},
{
"epoch": 0.16897802812881185,
"grad_norm": 2.0625,
"learning_rate": 0.000281306980953286,
"loss": 2.8243,
"step": 592
},
{
"epoch": 0.16926346398713754,
"grad_norm": 2.734375,
"learning_rate": 0.0002812412368587097,
"loss": 2.8078,
"step": 593
},
{
"epoch": 0.16954889984546323,
"grad_norm": 2.328125,
"learning_rate": 0.0002811753850650883,
"loss": 2.7899,
"step": 594
},
{
"epoch": 0.16983433570378895,
"grad_norm": 3.09375,
"learning_rate": 0.000281109425626461,
"loss": 2.8176,
"step": 595
},
{
"epoch": 0.17011977156211464,
"grad_norm": 3.015625,
"learning_rate": 0.00028104335859695543,
"loss": 2.8235,
"step": 596
},
{
"epoch": 0.17040520742044032,
"grad_norm": 2.125,
"learning_rate": 0.0002809771840307873,
"loss": 2.7986,
"step": 597
},
{
"epoch": 0.170690643278766,
"grad_norm": 1.8984375,
"learning_rate": 0.0002809109019822609,
"loss": 2.7848,
"step": 598
},
{
"epoch": 0.17097607913709173,
"grad_norm": 2.203125,
"learning_rate": 0.00028084451250576844,
"loss": 2.7914,
"step": 599
},
{
"epoch": 0.17126151499541742,
"grad_norm": 1.625,
"learning_rate": 0.00028077801565579033,
"loss": 2.8036,
"step": 600
},
{
"epoch": 0.1715469508537431,
"grad_norm": 2.84375,
"learning_rate": 0.0002807114114868953,
"loss": 2.8006,
"step": 601
},
{
"epoch": 0.1718323867120688,
"grad_norm": 2.40625,
"learning_rate": 0.0002806447000537398,
"loss": 2.7898,
"step": 602
},
{
"epoch": 0.1721178225703945,
"grad_norm": 2.78125,
"learning_rate": 0.00028057788141106865,
"loss": 2.7905,
"step": 603
},
{
"epoch": 0.1724032584287202,
"grad_norm": 2.5625,
"learning_rate": 0.0002805109556137144,
"loss": 2.8129,
"step": 604
},
{
"epoch": 0.1726886942870459,
"grad_norm": 2.375,
"learning_rate": 0.0002804439227165977,
"loss": 2.8151,
"step": 605
},
{
"epoch": 0.17297413014537158,
"grad_norm": 2.140625,
"learning_rate": 0.00028037678277472697,
"loss": 2.7888,
"step": 606
},
{
"epoch": 0.1732595660036973,
"grad_norm": 2.515625,
"learning_rate": 0.0002803095358431985,
"loss": 2.7996,
"step": 607
},
{
"epoch": 0.17354500186202299,
"grad_norm": 2.234375,
"learning_rate": 0.00028024218197719643,
"loss": 2.7932,
"step": 608
},
{
"epoch": 0.17383043772034867,
"grad_norm": 2.609375,
"learning_rate": 0.0002801747212319926,
"loss": 2.7972,
"step": 609
},
{
"epoch": 0.17411587357867436,
"grad_norm": 2.375,
"learning_rate": 0.0002801071536629466,
"loss": 2.8141,
"step": 610
},
{
"epoch": 0.17440130943700008,
"grad_norm": 2.5,
"learning_rate": 0.0002800394793255056,
"loss": 2.8014,
"step": 611
},
{
"epoch": 0.17468674529532577,
"grad_norm": 2.40625,
"learning_rate": 0.00027997169827520454,
"loss": 2.8036,
"step": 612
},
{
"epoch": 0.17497218115365146,
"grad_norm": 2.359375,
"learning_rate": 0.0002799038105676658,
"loss": 2.8235,
"step": 613
},
{
"epoch": 0.17525761701197715,
"grad_norm": 2.109375,
"learning_rate": 0.00027983581625859927,
"loss": 2.7849,
"step": 614
},
{
"epoch": 0.17554305287030286,
"grad_norm": 2.40625,
"learning_rate": 0.0002797677154038024,
"loss": 2.7964,
"step": 615
},
{
"epoch": 0.17582848872862855,
"grad_norm": 2.15625,
"learning_rate": 0.00027969950805916,
"loss": 2.8027,
"step": 616
},
{
"epoch": 0.17611392458695424,
"grad_norm": 2.5,
"learning_rate": 0.0002796311942806444,
"loss": 2.783,
"step": 617
},
{
"epoch": 0.17639936044527993,
"grad_norm": 2.25,
"learning_rate": 0.00027956277412431507,
"loss": 2.7981,
"step": 618
},
{
"epoch": 0.17668479630360565,
"grad_norm": 2.46875,
"learning_rate": 0.00027949424764631896,
"loss": 2.8145,
"step": 619
},
{
"epoch": 0.17697023216193133,
"grad_norm": 2.265625,
"learning_rate": 0.0002794256149028902,
"loss": 2.83,
"step": 620
},
{
"epoch": 0.17725566802025702,
"grad_norm": 2.375,
"learning_rate": 0.00027935687595035015,
"loss": 2.811,
"step": 621
},
{
"epoch": 0.1775411038785827,
"grad_norm": 2.09375,
"learning_rate": 0.00027928803084510716,
"loss": 2.8016,
"step": 622
},
{
"epoch": 0.17782653973690843,
"grad_norm": 2.421875,
"learning_rate": 0.000279219079643657,
"loss": 2.7996,
"step": 623
},
{
"epoch": 0.17811197559523412,
"grad_norm": 2.203125,
"learning_rate": 0.0002791500224025822,
"loss": 2.817,
"step": 624
},
{
"epoch": 0.1783974114535598,
"grad_norm": 2.40625,
"learning_rate": 0.00027908085917855243,
"loss": 2.8096,
"step": 625
},
{
"epoch": 0.1786828473118855,
"grad_norm": 2.09375,
"learning_rate": 0.0002790115900283245,
"loss": 2.7852,
"step": 626
},
{
"epoch": 0.1789682831702112,
"grad_norm": 2.28125,
"learning_rate": 0.00027894221500874184,
"loss": 2.8088,
"step": 627
},
{
"epoch": 0.1792537190285369,
"grad_norm": 2.046875,
"learning_rate": 0.0002788727341767349,
"loss": 2.767,
"step": 628
},
{
"epoch": 0.1795391548868626,
"grad_norm": 2.4375,
"learning_rate": 0.0002788031475893211,
"loss": 2.7955,
"step": 629
},
{
"epoch": 0.17982459074518828,
"grad_norm": 2.125,
"learning_rate": 0.00027873345530360436,
"loss": 2.8143,
"step": 630
},
{
"epoch": 0.180110026603514,
"grad_norm": 2.625,
"learning_rate": 0.00027866365737677564,
"loss": 2.777,
"step": 631
},
{
"epoch": 0.18039546246183968,
"grad_norm": 2.234375,
"learning_rate": 0.00027859375386611227,
"loss": 2.8,
"step": 632
},
{
"epoch": 0.18068089832016537,
"grad_norm": 2.65625,
"learning_rate": 0.0002785237448289786,
"loss": 2.7796,
"step": 633
},
{
"epoch": 0.18096633417849106,
"grad_norm": 2.421875,
"learning_rate": 0.00027845363032282514,
"loss": 2.8042,
"step": 634
},
{
"epoch": 0.18125177003681678,
"grad_norm": 2.171875,
"learning_rate": 0.0002783834104051893,
"loss": 2.8206,
"step": 635
},
{
"epoch": 0.18153720589514247,
"grad_norm": 2.171875,
"learning_rate": 0.00027831308513369494,
"loss": 2.812,
"step": 636
},
{
"epoch": 0.18182264175346816,
"grad_norm": 1.953125,
"learning_rate": 0.00027824265456605224,
"loss": 2.7804,
"step": 637
},
{
"epoch": 0.18210807761179384,
"grad_norm": 1.859375,
"learning_rate": 0.00027817211876005786,
"loss": 2.7941,
"step": 638
},
{
"epoch": 0.18239351347011956,
"grad_norm": 1.734375,
"learning_rate": 0.0002781014777735948,
"loss": 2.7842,
"step": 639
},
{
"epoch": 0.18267894932844525,
"grad_norm": 1.671875,
"learning_rate": 0.00027803073166463244,
"loss": 2.7955,
"step": 640
},
{
"epoch": 0.18296438518677094,
"grad_norm": 1.7578125,
"learning_rate": 0.00027795988049122625,
"loss": 2.7597,
"step": 641
},
{
"epoch": 0.18324982104509663,
"grad_norm": 1.4453125,
"learning_rate": 0.0002778889243115183,
"loss": 2.811,
"step": 642
},
{
"epoch": 0.18353525690342234,
"grad_norm": 1.7734375,
"learning_rate": 0.00027781786318373627,
"loss": 2.7948,
"step": 643
},
{
"epoch": 0.18382069276174803,
"grad_norm": 1.4296875,
"learning_rate": 0.0002777466971661945,
"loss": 2.7811,
"step": 644
},
{
"epoch": 0.18410612862007372,
"grad_norm": 2.0625,
"learning_rate": 0.00027767542631729306,
"loss": 2.7838,
"step": 645
},
{
"epoch": 0.1843915644783994,
"grad_norm": 1.65625,
"learning_rate": 0.0002776040506955182,
"loss": 2.7958,
"step": 646
},
{
"epoch": 0.18467700033672513,
"grad_norm": 2.1875,
"learning_rate": 0.0002775325703594421,
"loss": 2.7798,
"step": 647
},
{
"epoch": 0.18496243619505082,
"grad_norm": 1.8984375,
"learning_rate": 0.0002774609853677229,
"loss": 2.7891,
"step": 648
},
{
"epoch": 0.1852478720533765,
"grad_norm": 2.25,
"learning_rate": 0.0002773892957791045,
"loss": 2.8067,
"step": 649
},
{
"epoch": 0.1855333079117022,
"grad_norm": 1.9140625,
"learning_rate": 0.0002773175016524169,
"loss": 2.7842,
"step": 650
},
{
"epoch": 0.18581874377002788,
"grad_norm": 2.265625,
"learning_rate": 0.00027724560304657553,
"loss": 2.7706,
"step": 651
},
{
"epoch": 0.1861041796283536,
"grad_norm": 2.03125,
"learning_rate": 0.0002771736000205819,
"loss": 2.7912,
"step": 652
},
{
"epoch": 0.1863896154866793,
"grad_norm": 2.40625,
"learning_rate": 0.000277101492633523,
"loss": 2.7859,
"step": 653
},
{
"epoch": 0.18667505134500498,
"grad_norm": 2.140625,
"learning_rate": 0.0002770292809445715,
"loss": 2.7637,
"step": 654
},
{
"epoch": 0.18696048720333067,
"grad_norm": 2.359375,
"learning_rate": 0.0002769569650129857,
"loss": 2.7884,
"step": 655
},
{
"epoch": 0.18724592306165638,
"grad_norm": 2.234375,
"learning_rate": 0.00027688454489810946,
"loss": 2.7858,
"step": 656
},
{
"epoch": 0.18753135891998207,
"grad_norm": 1.9921875,
"learning_rate": 0.00027681202065937203,
"loss": 2.7677,
"step": 657
},
{
"epoch": 0.18781679477830776,
"grad_norm": 1.796875,
"learning_rate": 0.00027673939235628827,
"loss": 2.7883,
"step": 658
},
{
"epoch": 0.18810223063663345,
"grad_norm": 2.21875,
"learning_rate": 0.00027666666004845823,
"loss": 2.7624,
"step": 659
},
{
"epoch": 0.18838766649495917,
"grad_norm": 1.9609375,
"learning_rate": 0.0002765938237955674,
"loss": 2.8089,
"step": 660
},
{
"epoch": 0.18867310235328486,
"grad_norm": 2.328125,
"learning_rate": 0.0002765208836573868,
"loss": 2.7795,
"step": 661
},
{
"epoch": 0.18895853821161054,
"grad_norm": 2.140625,
"learning_rate": 0.0002764478396937722,
"loss": 2.7722,
"step": 662
},
{
"epoch": 0.18924397406993623,
"grad_norm": 2.171875,
"learning_rate": 0.00027637469196466506,
"loss": 2.7653,
"step": 663
},
{
"epoch": 0.18952940992826195,
"grad_norm": 1.984375,
"learning_rate": 0.00027630144053009174,
"loss": 2.7717,
"step": 664
},
{
"epoch": 0.18981484578658764,
"grad_norm": 2.15625,
"learning_rate": 0.0002762280854501638,
"loss": 2.762,
"step": 665
},
{
"epoch": 0.19010028164491333,
"grad_norm": 2.03125,
"learning_rate": 0.00027615462678507775,
"loss": 2.7989,
"step": 666
},
{
"epoch": 0.19038571750323902,
"grad_norm": 2.203125,
"learning_rate": 0.00027608106459511513,
"loss": 2.7851,
"step": 667
},
{
"epoch": 0.19067115336156473,
"grad_norm": 2.15625,
"learning_rate": 0.0002760073989406425,
"loss": 2.7428,
"step": 668
},
{
"epoch": 0.19095658921989042,
"grad_norm": 1.9921875,
"learning_rate": 0.00027593362988211133,
"loss": 2.7699,
"step": 669
},
{
"epoch": 0.1912420250782161,
"grad_norm": 1.875,
"learning_rate": 0.00027585975748005783,
"loss": 2.7797,
"step": 670
},
{
"epoch": 0.1915274609365418,
"grad_norm": 2.109375,
"learning_rate": 0.0002757857817951032,
"loss": 2.7656,
"step": 671
},
{
"epoch": 0.19181289679486752,
"grad_norm": 2.078125,
"learning_rate": 0.00027571170288795323,
"loss": 2.7674,
"step": 672
},
{
"epoch": 0.1920983326531932,
"grad_norm": 1.9765625,
"learning_rate": 0.0002756375208193985,
"loss": 2.7576,
"step": 673
},
{
"epoch": 0.1923837685115189,
"grad_norm": 1.8984375,
"learning_rate": 0.0002755632356503141,
"loss": 2.7844,
"step": 674
},
{
"epoch": 0.19266920436984458,
"grad_norm": 2.03125,
"learning_rate": 0.00027548884744166,
"loss": 2.7817,
"step": 675
},
{
"epoch": 0.1929546402281703,
"grad_norm": 1.8671875,
"learning_rate": 0.0002754143562544805,
"loss": 2.7589,
"step": 676
},
{
"epoch": 0.193240076086496,
"grad_norm": 2.125,
"learning_rate": 0.0002753397621499045,
"loss": 2.7841,
"step": 677
},
{
"epoch": 0.19352551194482168,
"grad_norm": 1.9453125,
"learning_rate": 0.00027526506518914533,
"loss": 2.7945,
"step": 678
},
{
"epoch": 0.19381094780314737,
"grad_norm": 2.140625,
"learning_rate": 0.00027519026543350067,
"loss": 2.7896,
"step": 679
},
{
"epoch": 0.19409638366147308,
"grad_norm": 1.9609375,
"learning_rate": 0.0002751153629443528,
"loss": 2.7839,
"step": 680
},
{
"epoch": 0.19438181951979877,
"grad_norm": 2.078125,
"learning_rate": 0.0002750403577831679,
"loss": 2.7684,
"step": 681
},
{
"epoch": 0.19466725537812446,
"grad_norm": 1.828125,
"learning_rate": 0.00027496525001149676,
"loss": 2.7598,
"step": 682
},
{
"epoch": 0.19495269123645015,
"grad_norm": 2.015625,
"learning_rate": 0.00027489003969097416,
"loss": 2.7652,
"step": 683
},
{
"epoch": 0.19523812709477587,
"grad_norm": 1.9765625,
"learning_rate": 0.00027481472688331923,
"loss": 2.7909,
"step": 684
},
{
"epoch": 0.19552356295310155,
"grad_norm": 1.984375,
"learning_rate": 0.00027473931165033496,
"loss": 2.7535,
"step": 685
},
{
"epoch": 0.19580899881142724,
"grad_norm": 1.8671875,
"learning_rate": 0.00027466379405390864,
"loss": 2.763,
"step": 686
},
{
"epoch": 0.19609443466975293,
"grad_norm": 1.96875,
"learning_rate": 0.0002745881741560113,
"loss": 2.8034,
"step": 687
},
{
"epoch": 0.19637987052807865,
"grad_norm": 1.671875,
"learning_rate": 0.0002745124520186981,
"loss": 2.7538,
"step": 688
},
{
"epoch": 0.19666530638640434,
"grad_norm": 2.109375,
"learning_rate": 0.0002744366277041082,
"loss": 2.7494,
"step": 689
},
{
"epoch": 0.19695074224473003,
"grad_norm": 1.8125,
"learning_rate": 0.0002743607012744643,
"loss": 2.7578,
"step": 690
},
{
"epoch": 0.19723617810305571,
"grad_norm": 1.984375,
"learning_rate": 0.00027428467279207316,
"loss": 2.7845,
"step": 691
},
{
"epoch": 0.19752161396138143,
"grad_norm": 1.7890625,
"learning_rate": 0.00027420854231932515,
"loss": 2.7833,
"step": 692
},
{
"epoch": 0.19780704981970712,
"grad_norm": 1.8671875,
"learning_rate": 0.0002741323099186944,
"loss": 2.7835,
"step": 693
},
{
"epoch": 0.1980924856780328,
"grad_norm": 1.6640625,
"learning_rate": 0.00027405597565273866,
"loss": 2.7663,
"step": 694
},
{
"epoch": 0.1983779215363585,
"grad_norm": 2.0,
"learning_rate": 0.00027397953958409923,
"loss": 2.7737,
"step": 695
},
{
"epoch": 0.19866335739468421,
"grad_norm": 1.71875,
"learning_rate": 0.00027390300177550106,
"loss": 2.7501,
"step": 696
},
{
"epoch": 0.1989487932530099,
"grad_norm": 1.9296875,
"learning_rate": 0.0002738263622897525,
"loss": 2.7862,
"step": 697
},
{
"epoch": 0.1992342291113356,
"grad_norm": 1.71875,
"learning_rate": 0.0002737496211897453,
"loss": 2.7629,
"step": 698
},
{
"epoch": 0.19951966496966128,
"grad_norm": 1.8515625,
"learning_rate": 0.0002736727785384548,
"loss": 2.7394,
"step": 699
},
{
"epoch": 0.199805100827987,
"grad_norm": 1.7421875,
"learning_rate": 0.00027359583439893944,
"loss": 2.7867,
"step": 700
},
{
"epoch": 0.2000905366863127,
"grad_norm": 1.765625,
"learning_rate": 0.00027351878883434105,
"loss": 2.7564,
"step": 701
},
{
"epoch": 0.20037597254463838,
"grad_norm": 1.53125,
"learning_rate": 0.0002734416419078847,
"loss": 2.7623,
"step": 702
},
{
"epoch": 0.20066140840296406,
"grad_norm": 2.0625,
"learning_rate": 0.00027336439368287857,
"loss": 2.7678,
"step": 703
},
{
"epoch": 0.20094684426128978,
"grad_norm": 1.9375,
"learning_rate": 0.0002732870442227141,
"loss": 2.7727,
"step": 704
},
{
"epoch": 0.20123228011961547,
"grad_norm": 1.828125,
"learning_rate": 0.00027320959359086565,
"loss": 2.7808,
"step": 705
},
{
"epoch": 0.20151771597794116,
"grad_norm": 1.703125,
"learning_rate": 0.0002731320418508907,
"loss": 2.7509,
"step": 706
},
{
"epoch": 0.20180315183626685,
"grad_norm": 1.8125,
"learning_rate": 0.0002730543890664297,
"loss": 2.7839,
"step": 707
},
{
"epoch": 0.20208858769459256,
"grad_norm": 1.546875,
"learning_rate": 0.0002729766353012059,
"loss": 2.7573,
"step": 708
},
{
"epoch": 0.20237402355291825,
"grad_norm": 2.296875,
"learning_rate": 0.0002728987806190257,
"loss": 2.7872,
"step": 709
},
{
"epoch": 0.20265945941124394,
"grad_norm": 1.9609375,
"learning_rate": 0.00027282082508377795,
"loss": 2.7727,
"step": 710
},
{
"epoch": 0.20294489526956963,
"grad_norm": 2.203125,
"learning_rate": 0.0002727427687594345,
"loss": 2.7632,
"step": 711
},
{
"epoch": 0.20323033112789535,
"grad_norm": 2.03125,
"learning_rate": 0.00027266461171004985,
"loss": 2.7631,
"step": 712
},
{
"epoch": 0.20351576698622104,
"grad_norm": 2.046875,
"learning_rate": 0.00027258635399976115,
"loss": 2.768,
"step": 713
},
{
"epoch": 0.20380120284454672,
"grad_norm": 1.7890625,
"learning_rate": 0.00027250799569278816,
"loss": 2.7666,
"step": 714
},
{
"epoch": 0.2040866387028724,
"grad_norm": 2.25,
"learning_rate": 0.00027242953685343327,
"loss": 2.7794,
"step": 715
},
{
"epoch": 0.20437207456119813,
"grad_norm": 1.890625,
"learning_rate": 0.0002723509775460811,
"loss": 2.7449,
"step": 716
},
{
"epoch": 0.20465751041952382,
"grad_norm": 2.140625,
"learning_rate": 0.00027227231783519913,
"loss": 2.7529,
"step": 717
},
{
"epoch": 0.2049429462778495,
"grad_norm": 1.8984375,
"learning_rate": 0.0002721935577853368,
"loss": 2.7785,
"step": 718
},
{
"epoch": 0.2052283821361752,
"grad_norm": 1.9765625,
"learning_rate": 0.00027211469746112624,
"loss": 2.7653,
"step": 719
},
{
"epoch": 0.2055138179945009,
"grad_norm": 1.734375,
"learning_rate": 0.00027203573692728174,
"loss": 2.7664,
"step": 720
},
{
"epoch": 0.2057992538528266,
"grad_norm": 2.0625,
"learning_rate": 0.0002719566762485997,
"loss": 2.7677,
"step": 721
},
{
"epoch": 0.2060846897111523,
"grad_norm": 1.7578125,
"learning_rate": 0.0002718775154899589,
"loss": 2.7667,
"step": 722
},
{
"epoch": 0.20637012556947798,
"grad_norm": 2.078125,
"learning_rate": 0.0002717982547163201,
"loss": 2.7674,
"step": 723
},
{
"epoch": 0.20665556142780367,
"grad_norm": 1.84375,
"learning_rate": 0.0002717188939927262,
"loss": 2.7747,
"step": 724
},
{
"epoch": 0.20694099728612939,
"grad_norm": 1.984375,
"learning_rate": 0.00027163943338430214,
"loss": 2.7299,
"step": 725
},
{
"epoch": 0.20722643314445507,
"grad_norm": 1.8359375,
"learning_rate": 0.0002715598729562548,
"loss": 2.7672,
"step": 726
},
{
"epoch": 0.20751186900278076,
"grad_norm": 1.875,
"learning_rate": 0.000271480212773873,
"loss": 2.7847,
"step": 727
},
{
"epoch": 0.20779730486110645,
"grad_norm": 1.609375,
"learning_rate": 0.0002714004529025273,
"loss": 2.7886,
"step": 728
},
{
"epoch": 0.20808274071943217,
"grad_norm": 1.890625,
"learning_rate": 0.00027132059340767025,
"loss": 2.7586,
"step": 729
},
{
"epoch": 0.20836817657775786,
"grad_norm": 1.5859375,
"learning_rate": 0.00027124063435483603,
"loss": 2.779,
"step": 730
},
{
"epoch": 0.20865361243608355,
"grad_norm": 2.0625,
"learning_rate": 0.0002711605758096406,
"loss": 2.7593,
"step": 731
},
{
"epoch": 0.20893904829440924,
"grad_norm": 1.7109375,
"learning_rate": 0.0002710804178377814,
"loss": 2.7684,
"step": 732
},
{
"epoch": 0.20922448415273495,
"grad_norm": 2.03125,
"learning_rate": 0.0002710001605050377,
"loss": 2.7542,
"step": 733
},
{
"epoch": 0.20950992001106064,
"grad_norm": 1.7578125,
"learning_rate": 0.00027091980387727014,
"loss": 2.7644,
"step": 734
},
{
"epoch": 0.20979535586938633,
"grad_norm": 2.15625,
"learning_rate": 0.00027083934802042084,
"loss": 2.7772,
"step": 735
},
{
"epoch": 0.21008079172771202,
"grad_norm": 1.8359375,
"learning_rate": 0.0002707587930005136,
"loss": 2.7419,
"step": 736
},
{
"epoch": 0.21036622758603774,
"grad_norm": 2.09375,
"learning_rate": 0.0002706781388836531,
"loss": 2.7889,
"step": 737
},
{
"epoch": 0.21065166344436342,
"grad_norm": 1.765625,
"learning_rate": 0.00027059738573602583,
"loss": 2.768,
"step": 738
},
{
"epoch": 0.2109370993026891,
"grad_norm": 2.25,
"learning_rate": 0.00027051653362389935,
"loss": 2.8016,
"step": 739
},
{
"epoch": 0.2112225351610148,
"grad_norm": 1.8046875,
"learning_rate": 0.0002704355826136224,
"loss": 2.758,
"step": 740
},
{
"epoch": 0.21150797101934052,
"grad_norm": 2.203125,
"learning_rate": 0.0002703545327716249,
"loss": 2.7658,
"step": 741
},
{
"epoch": 0.2117934068776662,
"grad_norm": 1.859375,
"learning_rate": 0.00027027338416441785,
"loss": 2.7693,
"step": 742
},
{
"epoch": 0.2120788427359919,
"grad_norm": 2.40625,
"learning_rate": 0.0002701921368585934,
"loss": 2.7948,
"step": 743
},
{
"epoch": 0.21236427859431758,
"grad_norm": 2.0,
"learning_rate": 0.0002701107909208246,
"loss": 2.7832,
"step": 744
},
{
"epoch": 0.2126497144526433,
"grad_norm": 2.25,
"learning_rate": 0.00027002934641786545,
"loss": 2.7851,
"step": 745
},
{
"epoch": 0.212935150310969,
"grad_norm": 2.0625,
"learning_rate": 0.00026994780341655093,
"loss": 2.7461,
"step": 746
},
{
"epoch": 0.21322058616929468,
"grad_norm": 2.015625,
"learning_rate": 0.0002698661619837967,
"loss": 2.7511,
"step": 747
},
{
"epoch": 0.21350602202762037,
"grad_norm": 1.8125,
"learning_rate": 0.0002697844221865993,
"loss": 2.7562,
"step": 748
},
{
"epoch": 0.21379145788594608,
"grad_norm": 2.078125,
"learning_rate": 0.00026970258409203594,
"loss": 2.729,
"step": 749
},
{
"epoch": 0.21407689374427177,
"grad_norm": 1.7421875,
"learning_rate": 0.00026962064776726445,
"loss": 2.7467,
"step": 750
},
{
"epoch": 0.21407689374427177,
"eval_loss": 2.6212494373321533,
"eval_runtime": 5936.0633,
"eval_samples_per_second": 10.83,
"eval_steps_per_second": 10.83,
"step": 750
},
{
"epoch": 0.21436232960259746,
"grad_norm": 2.0625,
"learning_rate": 0.0002695386132795234,
"loss": 2.7875,
"step": 751
},
{
"epoch": 0.21464776546092315,
"grad_norm": 1.8828125,
"learning_rate": 0.0002694564806961319,
"loss": 2.7879,
"step": 752
},
{
"epoch": 0.21493320131924887,
"grad_norm": 1.9140625,
"learning_rate": 0.00026937425008448937,
"loss": 2.7634,
"step": 753
},
{
"epoch": 0.21521863717757456,
"grad_norm": 1.65625,
"learning_rate": 0.0002692919215120759,
"loss": 2.7563,
"step": 754
},
{
"epoch": 0.21550407303590025,
"grad_norm": 1.953125,
"learning_rate": 0.0002692094950464519,
"loss": 2.7836,
"step": 755
},
{
"epoch": 0.21578950889422593,
"grad_norm": 1.59375,
"learning_rate": 0.000269126970755258,
"loss": 2.7366,
"step": 756
},
{
"epoch": 0.21607494475255165,
"grad_norm": 1.828125,
"learning_rate": 0.00026904434870621524,
"loss": 2.7813,
"step": 757
},
{
"epoch": 0.21636038061087734,
"grad_norm": 1.53125,
"learning_rate": 0.00026896162896712476,
"loss": 2.7718,
"step": 758
},
{
"epoch": 0.21664581646920303,
"grad_norm": 2.0,
"learning_rate": 0.00026887881160586813,
"loss": 2.7536,
"step": 759
},
{
"epoch": 0.21693125232752872,
"grad_norm": 1.6953125,
"learning_rate": 0.0002687958966904067,
"loss": 2.7619,
"step": 760
},
{
"epoch": 0.21721668818585443,
"grad_norm": 1.84375,
"learning_rate": 0.00026871288428878206,
"loss": 2.7672,
"step": 761
},
{
"epoch": 0.21750212404418012,
"grad_norm": 1.6328125,
"learning_rate": 0.0002686297744691158,
"loss": 2.7571,
"step": 762
},
{
"epoch": 0.2177875599025058,
"grad_norm": 2.015625,
"learning_rate": 0.0002685465672996093,
"loss": 2.7652,
"step": 763
},
{
"epoch": 0.2180729957608315,
"grad_norm": 1.6953125,
"learning_rate": 0.000268463262848544,
"loss": 2.7748,
"step": 764
},
{
"epoch": 0.21835843161915722,
"grad_norm": 1.7109375,
"learning_rate": 0.0002683798611842812,
"loss": 2.7583,
"step": 765
},
{
"epoch": 0.2186438674774829,
"grad_norm": 1.59375,
"learning_rate": 0.0002682963623752617,
"loss": 2.7586,
"step": 766
},
{
"epoch": 0.2189293033358086,
"grad_norm": 1.828125,
"learning_rate": 0.0002682127664900064,
"loss": 2.7338,
"step": 767
},
{
"epoch": 0.21921473919413428,
"grad_norm": 1.40625,
"learning_rate": 0.0002681290735971156,
"loss": 2.752,
"step": 768
},
{
"epoch": 0.21950017505246,
"grad_norm": 1.796875,
"learning_rate": 0.0002680452837652691,
"loss": 2.7629,
"step": 769
},
{
"epoch": 0.2197856109107857,
"grad_norm": 1.5625,
"learning_rate": 0.0002679613970632267,
"loss": 2.7652,
"step": 770
},
{
"epoch": 0.22007104676911138,
"grad_norm": 2.109375,
"learning_rate": 0.0002678774135598272,
"loss": 2.7537,
"step": 771
},
{
"epoch": 0.22035648262743707,
"grad_norm": 1.6875,
"learning_rate": 0.00026779333332398923,
"loss": 2.7141,
"step": 772
},
{
"epoch": 0.22064191848576278,
"grad_norm": 1.90625,
"learning_rate": 0.0002677091564247105,
"loss": 2.757,
"step": 773
},
{
"epoch": 0.22092735434408847,
"grad_norm": 1.6875,
"learning_rate": 0.0002676248829310682,
"loss": 2.7454,
"step": 774
},
{
"epoch": 0.22121279020241416,
"grad_norm": 1.8671875,
"learning_rate": 0.0002675405129122188,
"loss": 2.7545,
"step": 775
},
{
"epoch": 0.22149822606073985,
"grad_norm": 1.4765625,
"learning_rate": 0.0002674560464373979,
"loss": 2.7331,
"step": 776
},
{
"epoch": 0.22178366191906557,
"grad_norm": 1.9609375,
"learning_rate": 0.0002673714835759202,
"loss": 2.7603,
"step": 777
},
{
"epoch": 0.22206909777739126,
"grad_norm": 1.515625,
"learning_rate": 0.00026728682439717974,
"loss": 2.7551,
"step": 778
},
{
"epoch": 0.22235453363571694,
"grad_norm": 2.234375,
"learning_rate": 0.0002672020689706493,
"loss": 2.7814,
"step": 779
},
{
"epoch": 0.22263996949404263,
"grad_norm": 1.765625,
"learning_rate": 0.00026711721736588103,
"loss": 2.7604,
"step": 780
},
{
"epoch": 0.22292540535236835,
"grad_norm": 2.0625,
"learning_rate": 0.00026703226965250546,
"loss": 2.7551,
"step": 781
},
{
"epoch": 0.22321084121069404,
"grad_norm": 1.890625,
"learning_rate": 0.00026694722590023246,
"loss": 2.7357,
"step": 782
},
{
"epoch": 0.22349627706901973,
"grad_norm": 1.875,
"learning_rate": 0.00026686208617885055,
"loss": 2.7532,
"step": 783
},
{
"epoch": 0.22378171292734542,
"grad_norm": 1.6875,
"learning_rate": 0.0002667768505582269,
"loss": 2.7388,
"step": 784
},
{
"epoch": 0.22406714878567113,
"grad_norm": 1.6484375,
"learning_rate": 0.0002666915191083076,
"loss": 2.7594,
"step": 785
},
{
"epoch": 0.22435258464399682,
"grad_norm": 1.4296875,
"learning_rate": 0.00026660609189911724,
"loss": 2.7504,
"step": 786
},
{
"epoch": 0.2246380205023225,
"grad_norm": 1.515625,
"learning_rate": 0.00026652056900075885,
"loss": 2.7631,
"step": 787
},
{
"epoch": 0.2249234563606482,
"grad_norm": 1.2578125,
"learning_rate": 0.0002664349504834143,
"loss": 2.7534,
"step": 788
},
{
"epoch": 0.22520889221897392,
"grad_norm": 1.6875,
"learning_rate": 0.00026634923641734374,
"loss": 2.7584,
"step": 789
},
{
"epoch": 0.2254943280772996,
"grad_norm": 1.3125,
"learning_rate": 0.00026626342687288576,
"loss": 2.7519,
"step": 790
},
{
"epoch": 0.2257797639356253,
"grad_norm": 2.0625,
"learning_rate": 0.0002661775219204572,
"loss": 2.7477,
"step": 791
},
{
"epoch": 0.22606519979395098,
"grad_norm": 1.7578125,
"learning_rate": 0.0002660915216305534,
"loss": 2.7484,
"step": 792
},
{
"epoch": 0.22635063565227667,
"grad_norm": 1.734375,
"learning_rate": 0.0002660054260737478,
"loss": 2.7718,
"step": 793
},
{
"epoch": 0.2266360715106024,
"grad_norm": 1.6015625,
"learning_rate": 0.000265919235320692,
"loss": 2.7437,
"step": 794
},
{
"epoch": 0.22692150736892808,
"grad_norm": 1.6953125,
"learning_rate": 0.00026583294944211583,
"loss": 2.7564,
"step": 795
},
{
"epoch": 0.22720694322725377,
"grad_norm": 1.3046875,
"learning_rate": 0.00026574656850882706,
"loss": 2.7322,
"step": 796
},
{
"epoch": 0.22749237908557945,
"grad_norm": 1.8828125,
"learning_rate": 0.0002656600925917116,
"loss": 2.7623,
"step": 797
},
{
"epoch": 0.22777781494390517,
"grad_norm": 1.46875,
"learning_rate": 0.00026557352176173317,
"loss": 2.7294,
"step": 798
},
{
"epoch": 0.22806325080223086,
"grad_norm": 2.1875,
"learning_rate": 0.00026548685608993337,
"loss": 2.7457,
"step": 799
},
{
"epoch": 0.22834868666055655,
"grad_norm": 2.015625,
"learning_rate": 0.0002654000956474318,
"loss": 2.7512,
"step": 800
},
{
"epoch": 0.22863412251888224,
"grad_norm": 1.6953125,
"learning_rate": 0.0002653132405054257,
"loss": 2.7251,
"step": 801
},
{
"epoch": 0.22891955837720795,
"grad_norm": 1.59375,
"learning_rate": 0.00026522629073519,
"loss": 2.7645,
"step": 802
},
{
"epoch": 0.22920499423553364,
"grad_norm": 1.6328125,
"learning_rate": 0.00026513924640807733,
"loss": 2.7856,
"step": 803
},
{
"epoch": 0.22949043009385933,
"grad_norm": 1.3203125,
"learning_rate": 0.000265052107595518,
"loss": 2.7234,
"step": 804
},
{
"epoch": 0.22977586595218502,
"grad_norm": 1.84375,
"learning_rate": 0.00026496487436901964,
"loss": 2.7626,
"step": 805
},
{
"epoch": 0.23006130181051074,
"grad_norm": 1.578125,
"learning_rate": 0.00026487754680016765,
"loss": 2.7252,
"step": 806
},
{
"epoch": 0.23034673766883643,
"grad_norm": 1.890625,
"learning_rate": 0.0002647901249606245,
"loss": 2.7371,
"step": 807
},
{
"epoch": 0.23063217352716212,
"grad_norm": 1.7109375,
"learning_rate": 0.00026470260892213034,
"loss": 2.7533,
"step": 808
},
{
"epoch": 0.2309176093854878,
"grad_norm": 1.734375,
"learning_rate": 0.00026461499875650245,
"loss": 2.7512,
"step": 809
},
{
"epoch": 0.23120304524381352,
"grad_norm": 1.59375,
"learning_rate": 0.0002645272945356354,
"loss": 2.7423,
"step": 810
},
{
"epoch": 0.2314884811021392,
"grad_norm": 1.890625,
"learning_rate": 0.0002644394963315009,
"loss": 2.7495,
"step": 811
},
{
"epoch": 0.2317739169604649,
"grad_norm": 1.6484375,
"learning_rate": 0.00026435160421614784,
"loss": 2.7378,
"step": 812
},
{
"epoch": 0.2320593528187906,
"grad_norm": 1.859375,
"learning_rate": 0.0002642636182617022,
"loss": 2.7887,
"step": 813
},
{
"epoch": 0.2323447886771163,
"grad_norm": 1.5390625,
"learning_rate": 0.0002641755385403669,
"loss": 2.7452,
"step": 814
},
{
"epoch": 0.232630224535442,
"grad_norm": 2.0625,
"learning_rate": 0.0002640873651244217,
"loss": 2.7407,
"step": 815
},
{
"epoch": 0.23291566039376768,
"grad_norm": 1.671875,
"learning_rate": 0.0002639990980862236,
"loss": 2.7571,
"step": 816
},
{
"epoch": 0.23320109625209337,
"grad_norm": 2.15625,
"learning_rate": 0.00026391073749820607,
"loss": 2.7219,
"step": 817
},
{
"epoch": 0.2334865321104191,
"grad_norm": 1.953125,
"learning_rate": 0.00026382228343287947,
"loss": 2.7314,
"step": 818
},
{
"epoch": 0.23377196796874478,
"grad_norm": 2.15625,
"learning_rate": 0.0002637337359628309,
"loss": 2.7363,
"step": 819
},
{
"epoch": 0.23405740382707046,
"grad_norm": 1.953125,
"learning_rate": 0.00026364509516072415,
"loss": 2.7455,
"step": 820
},
{
"epoch": 0.23434283968539615,
"grad_norm": 1.921875,
"learning_rate": 0.00026355636109929946,
"loss": 2.7301,
"step": 821
},
{
"epoch": 0.23462827554372187,
"grad_norm": 1.7578125,
"learning_rate": 0.0002634675338513738,
"loss": 2.733,
"step": 822
},
{
"epoch": 0.23491371140204756,
"grad_norm": 1.71875,
"learning_rate": 0.00026337861348984024,
"loss": 2.7564,
"step": 823
},
{
"epoch": 0.23519914726037325,
"grad_norm": 1.4609375,
"learning_rate": 0.00026328960008766884,
"loss": 2.7489,
"step": 824
},
{
"epoch": 0.23548458311869894,
"grad_norm": 1.9140625,
"learning_rate": 0.0002632004937179055,
"loss": 2.7493,
"step": 825
},
{
"epoch": 0.23577001897702465,
"grad_norm": 1.6328125,
"learning_rate": 0.00026311129445367255,
"loss": 2.7289,
"step": 826
},
{
"epoch": 0.23605545483535034,
"grad_norm": 2.15625,
"learning_rate": 0.0002630220023681687,
"loss": 2.7193,
"step": 827
},
{
"epoch": 0.23634089069367603,
"grad_norm": 2.0,
"learning_rate": 0.0002629326175346687,
"loss": 2.738,
"step": 828
},
{
"epoch": 0.23662632655200172,
"grad_norm": 1.921875,
"learning_rate": 0.0002628431400265235,
"loss": 2.7497,
"step": 829
},
{
"epoch": 0.23691176241032744,
"grad_norm": 1.8203125,
"learning_rate": 0.00026275356991715986,
"loss": 2.7239,
"step": 830
},
{
"epoch": 0.23719719826865313,
"grad_norm": 1.71875,
"learning_rate": 0.0002626639072800809,
"loss": 2.7372,
"step": 831
},
{
"epoch": 0.23748263412697881,
"grad_norm": 1.4921875,
"learning_rate": 0.00026257415218886536,
"loss": 2.7284,
"step": 832
},
{
"epoch": 0.2377680699853045,
"grad_norm": 2.046875,
"learning_rate": 0.00026248430471716795,
"loss": 2.7515,
"step": 833
},
{
"epoch": 0.23805350584363022,
"grad_norm": 1.8984375,
"learning_rate": 0.0002623943649387194,
"loss": 2.7412,
"step": 834
},
{
"epoch": 0.2383389417019559,
"grad_norm": 1.84375,
"learning_rate": 0.0002623043329273257,
"loss": 2.7339,
"step": 835
},
{
"epoch": 0.2386243775602816,
"grad_norm": 1.6953125,
"learning_rate": 0.0002622142087568691,
"loss": 2.7482,
"step": 836
},
{
"epoch": 0.2389098134186073,
"grad_norm": 1.7890625,
"learning_rate": 0.00026212399250130706,
"loss": 2.7411,
"step": 837
},
{
"epoch": 0.239195249276933,
"grad_norm": 1.5234375,
"learning_rate": 0.0002620336842346728,
"loss": 2.7394,
"step": 838
},
{
"epoch": 0.2394806851352587,
"grad_norm": 1.9375,
"learning_rate": 0.0002619432840310749,
"loss": 2.6938,
"step": 839
},
{
"epoch": 0.23976612099358438,
"grad_norm": 1.7265625,
"learning_rate": 0.00026185279196469757,
"loss": 2.7298,
"step": 840
},
{
"epoch": 0.24005155685191007,
"grad_norm": 1.8828125,
"learning_rate": 0.00026176220810980035,
"loss": 2.7237,
"step": 841
},
{
"epoch": 0.2403369927102358,
"grad_norm": 1.7890625,
"learning_rate": 0.00026167153254071795,
"loss": 2.742,
"step": 842
},
{
"epoch": 0.24062242856856147,
"grad_norm": 1.6015625,
"learning_rate": 0.0002615807653318605,
"loss": 2.7514,
"step": 843
},
{
"epoch": 0.24090786442688716,
"grad_norm": 1.4453125,
"learning_rate": 0.0002614899065577133,
"loss": 2.7606,
"step": 844
},
{
"epoch": 0.24119330028521285,
"grad_norm": 1.8125,
"learning_rate": 0.0002613989562928369,
"loss": 2.7474,
"step": 845
},
{
"epoch": 0.24147873614353857,
"grad_norm": 1.53125,
"learning_rate": 0.00026130791461186656,
"loss": 2.7309,
"step": 846
},
{
"epoch": 0.24176417200186426,
"grad_norm": 1.984375,
"learning_rate": 0.000261216781589513,
"loss": 2.726,
"step": 847
},
{
"epoch": 0.24204960786018995,
"grad_norm": 1.703125,
"learning_rate": 0.0002611255573005617,
"loss": 2.7471,
"step": 848
},
{
"epoch": 0.24233504371851564,
"grad_norm": 1.90625,
"learning_rate": 0.00026103424181987293,
"loss": 2.7328,
"step": 849
},
{
"epoch": 0.24262047957684135,
"grad_norm": 1.7578125,
"learning_rate": 0.00026094283522238204,
"loss": 2.755,
"step": 850
},
{
"epoch": 0.24290591543516704,
"grad_norm": 1.7890625,
"learning_rate": 0.00026085133758309883,
"loss": 2.7581,
"step": 851
},
{
"epoch": 0.24319135129349273,
"grad_norm": 1.59375,
"learning_rate": 0.00026075974897710815,
"loss": 2.7312,
"step": 852
},
{
"epoch": 0.24347678715181842,
"grad_norm": 1.9296875,
"learning_rate": 0.0002606680694795693,
"loss": 2.7274,
"step": 853
},
{
"epoch": 0.24376222301014414,
"grad_norm": 1.5,
"learning_rate": 0.0002605762991657163,
"loss": 2.7208,
"step": 854
},
{
"epoch": 0.24404765886846982,
"grad_norm": 2.296875,
"learning_rate": 0.00026048443811085744,
"loss": 2.7326,
"step": 855
},
{
"epoch": 0.2443330947267955,
"grad_norm": 1.9765625,
"learning_rate": 0.00026039248639037575,
"loss": 2.7559,
"step": 856
},
{
"epoch": 0.2446185305851212,
"grad_norm": 2.1875,
"learning_rate": 0.00026030044407972854,
"loss": 2.7389,
"step": 857
},
{
"epoch": 0.24490396644344692,
"grad_norm": 2.03125,
"learning_rate": 0.00026020831125444745,
"loss": 2.7434,
"step": 858
},
{
"epoch": 0.2451894023017726,
"grad_norm": 1.8828125,
"learning_rate": 0.0002601160879901384,
"loss": 2.745,
"step": 859
},
{
"epoch": 0.2454748381600983,
"grad_norm": 1.6640625,
"learning_rate": 0.0002600237743624816,
"loss": 2.74,
"step": 860
},
{
"epoch": 0.24576027401842399,
"grad_norm": 2.125,
"learning_rate": 0.00025993137044723135,
"loss": 2.736,
"step": 861
},
{
"epoch": 0.2460457098767497,
"grad_norm": 1.7734375,
"learning_rate": 0.0002598388763202161,
"loss": 2.7447,
"step": 862
},
{
"epoch": 0.2463311457350754,
"grad_norm": 2.1875,
"learning_rate": 0.0002597462920573381,
"loss": 2.7457,
"step": 863
},
{
"epoch": 0.24661658159340108,
"grad_norm": 2.0,
"learning_rate": 0.000259653617734574,
"loss": 2.7256,
"step": 864
},
{
"epoch": 0.24690201745172677,
"grad_norm": 1.953125,
"learning_rate": 0.00025956085342797395,
"loss": 2.7233,
"step": 865
},
{
"epoch": 0.24718745331005246,
"grad_norm": 1.7265625,
"learning_rate": 0.00025946799921366205,
"loss": 2.7471,
"step": 866
},
{
"epoch": 0.24747288916837817,
"grad_norm": 1.9296875,
"learning_rate": 0.0002593750551678364,
"loss": 2.7426,
"step": 867
},
{
"epoch": 0.24775832502670386,
"grad_norm": 1.4921875,
"learning_rate": 0.00025928202136676855,
"loss": 2.6968,
"step": 868
},
{
"epoch": 0.24804376088502955,
"grad_norm": 2.09375,
"learning_rate": 0.0002591888978868038,
"loss": 2.7192,
"step": 869
},
{
"epoch": 0.24832919674335524,
"grad_norm": 1.828125,
"learning_rate": 0.000259095684804361,
"loss": 2.7436,
"step": 870
},
{
"epoch": 0.24861463260168096,
"grad_norm": 1.9609375,
"learning_rate": 0.0002590023821959326,
"loss": 2.7627,
"step": 871
},
{
"epoch": 0.24890006846000665,
"grad_norm": 1.7421875,
"learning_rate": 0.00025890899013808455,
"loss": 2.7603,
"step": 872
},
{
"epoch": 0.24918550431833233,
"grad_norm": 1.6640625,
"learning_rate": 0.0002588155087074561,
"loss": 2.7315,
"step": 873
},
{
"epoch": 0.24947094017665802,
"grad_norm": 1.515625,
"learning_rate": 0.00025872193798075985,
"loss": 2.7302,
"step": 874
},
{
"epoch": 0.24975637603498374,
"grad_norm": 1.453125,
"learning_rate": 0.0002586282780347818,
"loss": 2.7236,
"step": 875
},
{
"epoch": 0.25004181189330943,
"grad_norm": 1.25,
"learning_rate": 0.00025853452894638093,
"loss": 2.7152,
"step": 876
},
{
"epoch": 0.2503272477516351,
"grad_norm": 1.3515625,
"learning_rate": 0.00025844069079248964,
"loss": 2.7169,
"step": 877
},
{
"epoch": 0.2506126836099608,
"grad_norm": 1.125,
"learning_rate": 0.00025834676365011326,
"loss": 2.7202,
"step": 878
},
{
"epoch": 0.2508981194682865,
"grad_norm": 1.6484375,
"learning_rate": 0.00025825274759633016,
"loss": 2.7239,
"step": 879
},
{
"epoch": 0.2511835553266122,
"grad_norm": 1.234375,
"learning_rate": 0.0002581586427082918,
"loss": 2.7023,
"step": 880
},
{
"epoch": 0.25146899118493793,
"grad_norm": 1.90625,
"learning_rate": 0.0002580644490632222,
"loss": 2.7203,
"step": 881
},
{
"epoch": 0.2517544270432636,
"grad_norm": 1.5234375,
"learning_rate": 0.0002579701667384187,
"loss": 2.7288,
"step": 882
},
{
"epoch": 0.2520398629015893,
"grad_norm": 1.90625,
"learning_rate": 0.00025787579581125107,
"loss": 2.7284,
"step": 883
},
{
"epoch": 0.252325298759915,
"grad_norm": 1.7265625,
"learning_rate": 0.00025778133635916183,
"loss": 2.7377,
"step": 884
},
{
"epoch": 0.2526107346182407,
"grad_norm": 1.75,
"learning_rate": 0.0002576867884596663,
"loss": 2.7267,
"step": 885
},
{
"epoch": 0.2528961704765664,
"grad_norm": 1.5859375,
"learning_rate": 0.00025759215219035213,
"loss": 2.723,
"step": 886
},
{
"epoch": 0.25318160633489206,
"grad_norm": 1.7109375,
"learning_rate": 0.00025749742762887977,
"loss": 2.7178,
"step": 887
},
{
"epoch": 0.25346704219321775,
"grad_norm": 1.3828125,
"learning_rate": 0.00025740261485298195,
"loss": 2.7387,
"step": 888
},
{
"epoch": 0.2537524780515435,
"grad_norm": 1.984375,
"learning_rate": 0.0002573077139404638,
"loss": 2.7513,
"step": 889
},
{
"epoch": 0.2540379139098692,
"grad_norm": 1.7265625,
"learning_rate": 0.0002572127249692028,
"loss": 2.7288,
"step": 890
},
{
"epoch": 0.2543233497681949,
"grad_norm": 1.734375,
"learning_rate": 0.00025711764801714874,
"loss": 2.7322,
"step": 891
},
{
"epoch": 0.25460878562652056,
"grad_norm": 1.6015625,
"learning_rate": 0.00025702248316232355,
"loss": 2.7598,
"step": 892
},
{
"epoch": 0.25489422148484625,
"grad_norm": 1.671875,
"learning_rate": 0.0002569272304828213,
"loss": 2.7304,
"step": 893
},
{
"epoch": 0.25517965734317194,
"grad_norm": 1.421875,
"learning_rate": 0.00025683189005680827,
"loss": 2.7288,
"step": 894
},
{
"epoch": 0.25546509320149763,
"grad_norm": 1.8203125,
"learning_rate": 0.0002567364619625224,
"loss": 2.753,
"step": 895
},
{
"epoch": 0.2557505290598233,
"grad_norm": 1.5390625,
"learning_rate": 0.00025664094627827393,
"loss": 2.7233,
"step": 896
},
{
"epoch": 0.25603596491814906,
"grad_norm": 1.8046875,
"learning_rate": 0.00025654534308244484,
"loss": 2.731,
"step": 897
},
{
"epoch": 0.25632140077647475,
"grad_norm": 1.6328125,
"learning_rate": 0.0002564496524534888,
"loss": 2.7177,
"step": 898
},
{
"epoch": 0.25660683663480044,
"grad_norm": 2.015625,
"learning_rate": 0.00025635387446993154,
"loss": 2.7327,
"step": 899
},
{
"epoch": 0.25689227249312613,
"grad_norm": 1.7890625,
"learning_rate": 0.0002562580092103702,
"loss": 2.7251,
"step": 900
},
{
"epoch": 0.2571777083514518,
"grad_norm": 1.90625,
"learning_rate": 0.00025616205675347355,
"loss": 2.7005,
"step": 901
},
{
"epoch": 0.2574631442097775,
"grad_norm": 1.8125,
"learning_rate": 0.00025606601717798207,
"loss": 2.7263,
"step": 902
},
{
"epoch": 0.2577485800681032,
"grad_norm": 1.6953125,
"learning_rate": 0.0002559698905627077,
"loss": 2.6863,
"step": 903
},
{
"epoch": 0.2580340159264289,
"grad_norm": 1.5703125,
"learning_rate": 0.00025587367698653367,
"loss": 2.718,
"step": 904
},
{
"epoch": 0.25831945178475463,
"grad_norm": 1.6640625,
"learning_rate": 0.0002557773765284148,
"loss": 2.7263,
"step": 905
},
{
"epoch": 0.2586048876430803,
"grad_norm": 1.546875,
"learning_rate": 0.0002556809892673769,
"loss": 2.7485,
"step": 906
},
{
"epoch": 0.258890323501406,
"grad_norm": 1.8359375,
"learning_rate": 0.0002555845152825173,
"loss": 2.6922,
"step": 907
},
{
"epoch": 0.2591757593597317,
"grad_norm": 1.71875,
"learning_rate": 0.00025548795465300426,
"loss": 2.7269,
"step": 908
},
{
"epoch": 0.2594611952180574,
"grad_norm": 1.6875,
"learning_rate": 0.0002553913074580774,
"loss": 2.7466,
"step": 909
},
{
"epoch": 0.25974663107638307,
"grad_norm": 1.6015625,
"learning_rate": 0.00025529457377704713,
"loss": 2.728,
"step": 910
},
{
"epoch": 0.26003206693470876,
"grad_norm": 1.703125,
"learning_rate": 0.0002551977536892951,
"loss": 2.7171,
"step": 911
},
{
"epoch": 0.26031750279303445,
"grad_norm": 1.4921875,
"learning_rate": 0.0002551008472742735,
"loss": 2.7028,
"step": 912
},
{
"epoch": 0.2606029386513602,
"grad_norm": 1.7578125,
"learning_rate": 0.00025500385461150565,
"loss": 2.7107,
"step": 913
},
{
"epoch": 0.2608883745096859,
"grad_norm": 1.65625,
"learning_rate": 0.0002549067757805856,
"loss": 2.7452,
"step": 914
},
{
"epoch": 0.26117381036801157,
"grad_norm": 1.515625,
"learning_rate": 0.00025480961086117815,
"loss": 2.7045,
"step": 915
},
{
"epoch": 0.26145924622633726,
"grad_norm": 1.4453125,
"learning_rate": 0.0002547123599330185,
"loss": 2.72,
"step": 916
},
{
"epoch": 0.26174468208466295,
"grad_norm": 1.640625,
"learning_rate": 0.00025461502307591274,
"loss": 2.7136,
"step": 917
},
{
"epoch": 0.26203011794298864,
"grad_norm": 1.515625,
"learning_rate": 0.0002545176003697372,
"loss": 2.7097,
"step": 918
},
{
"epoch": 0.2623155538013143,
"grad_norm": 1.6328125,
"learning_rate": 0.000254420091894439,
"loss": 2.7218,
"step": 919
},
{
"epoch": 0.26260098965964,
"grad_norm": 1.4921875,
"learning_rate": 0.0002543224977300352,
"loss": 2.6923,
"step": 920
},
{
"epoch": 0.26288642551796576,
"grad_norm": 1.6015625,
"learning_rate": 0.0002542248179566137,
"loss": 2.735,
"step": 921
},
{
"epoch": 0.26317186137629145,
"grad_norm": 1.375,
"learning_rate": 0.0002541270526543321,
"loss": 2.7211,
"step": 922
},
{
"epoch": 0.26345729723461714,
"grad_norm": 1.8203125,
"learning_rate": 0.00025402920190341864,
"loss": 2.73,
"step": 923
},
{
"epoch": 0.2637427330929428,
"grad_norm": 1.625,
"learning_rate": 0.0002539312657841714,
"loss": 2.7038,
"step": 924
},
{
"epoch": 0.2640281689512685,
"grad_norm": 1.65625,
"learning_rate": 0.0002538332443769587,
"loss": 2.7209,
"step": 925
},
{
"epoch": 0.2643136048095942,
"grad_norm": 2.875,
"learning_rate": 0.0002537351377622187,
"loss": 2.7053,
"step": 926
},
{
"epoch": 0.2645990406679199,
"grad_norm": 0.88671875,
"learning_rate": 0.00025363694602045957,
"loss": 2.7378,
"step": 927
},
{
"epoch": 0.2648844765262456,
"grad_norm": 2.1875,
"learning_rate": 0.0002535386692322593,
"loss": 2.7339,
"step": 928
},
{
"epoch": 0.2651699123845713,
"grad_norm": 1.875,
"learning_rate": 0.0002534403074782657,
"loss": 2.7474,
"step": 929
},
{
"epoch": 0.265455348242897,
"grad_norm": 1.7734375,
"learning_rate": 0.00025334186083919623,
"loss": 2.7283,
"step": 930
},
{
"epoch": 0.2657407841012227,
"grad_norm": 1.71875,
"learning_rate": 0.00025324332939583813,
"loss": 2.7195,
"step": 931
},
{
"epoch": 0.2660262199595484,
"grad_norm": 1.2578125,
"learning_rate": 0.0002531447132290482,
"loss": 2.7133,
"step": 932
},
{
"epoch": 0.2663116558178741,
"grad_norm": 1.484375,
"learning_rate": 0.00025304601241975266,
"loss": 2.737,
"step": 933
},
{
"epoch": 0.26659709167619977,
"grad_norm": 1.1171875,
"learning_rate": 0.0002529472270489473,
"loss": 2.7129,
"step": 934
},
{
"epoch": 0.26688252753452546,
"grad_norm": 1.828125,
"learning_rate": 0.0002528483571976973,
"loss": 2.7195,
"step": 935
},
{
"epoch": 0.26716796339285115,
"grad_norm": 1.28125,
"learning_rate": 0.00025274940294713706,
"loss": 2.694,
"step": 936
},
{
"epoch": 0.2674533992511769,
"grad_norm": 2.171875,
"learning_rate": 0.00025265036437847036,
"loss": 2.739,
"step": 937
},
{
"epoch": 0.2677388351095026,
"grad_norm": 1.8046875,
"learning_rate": 0.0002525512415729701,
"loss": 2.7,
"step": 938
},
{
"epoch": 0.26802427096782827,
"grad_norm": 1.4453125,
"learning_rate": 0.00025245203461197834,
"loss": 2.7329,
"step": 939
},
{
"epoch": 0.26830970682615396,
"grad_norm": 1.5390625,
"learning_rate": 0.0002523527435769062,
"loss": 2.7321,
"step": 940
},
{
"epoch": 0.26859514268447965,
"grad_norm": 1.25,
"learning_rate": 0.0002522533685492338,
"loss": 2.7274,
"step": 941
},
{
"epoch": 0.26888057854280534,
"grad_norm": 1.5703125,
"learning_rate": 0.0002521539096105101,
"loss": 2.719,
"step": 942
},
{
"epoch": 0.269166014401131,
"grad_norm": 1.203125,
"learning_rate": 0.00025205436684235313,
"loss": 2.7257,
"step": 943
},
{
"epoch": 0.2694514502594567,
"grad_norm": 1.703125,
"learning_rate": 0.0002519547403264494,
"loss": 2.7126,
"step": 944
},
{
"epoch": 0.2697368861177824,
"grad_norm": 1.359375,
"learning_rate": 0.00025185503014455443,
"loss": 2.7297,
"step": 945
},
{
"epoch": 0.27002232197610815,
"grad_norm": 1.8203125,
"learning_rate": 0.00025175523637849224,
"loss": 2.7324,
"step": 946
},
{
"epoch": 0.27030775783443384,
"grad_norm": 1.296875,
"learning_rate": 0.0002516553591101555,
"loss": 2.7367,
"step": 947
},
{
"epoch": 0.2705931936927595,
"grad_norm": 1.890625,
"learning_rate": 0.00025155539842150535,
"loss": 2.6977,
"step": 948
},
{
"epoch": 0.2708786295510852,
"grad_norm": 1.3125,
"learning_rate": 0.0002514553543945715,
"loss": 2.6864,
"step": 949
},
{
"epoch": 0.2711640654094109,
"grad_norm": 2.015625,
"learning_rate": 0.00025135522711145197,
"loss": 2.7111,
"step": 950
},
{
"epoch": 0.2714495012677366,
"grad_norm": 1.7265625,
"learning_rate": 0.000251255016654313,
"loss": 2.7124,
"step": 951
},
{
"epoch": 0.2717349371260623,
"grad_norm": 1.875,
"learning_rate": 0.0002511547231053893,
"loss": 2.6945,
"step": 952
},
{
"epoch": 0.27202037298438797,
"grad_norm": 1.765625,
"learning_rate": 0.00025105434654698356,
"loss": 2.7364,
"step": 953
},
{
"epoch": 0.2723058088427137,
"grad_norm": 1.53125,
"learning_rate": 0.00025095388706146676,
"loss": 2.7086,
"step": 954
},
{
"epoch": 0.2725912447010394,
"grad_norm": 1.4765625,
"learning_rate": 0.00025085334473127786,
"loss": 2.7037,
"step": 955
},
{
"epoch": 0.2728766805593651,
"grad_norm": 1.359375,
"learning_rate": 0.0002507527196389238,
"loss": 2.7295,
"step": 956
},
{
"epoch": 0.2731621164176908,
"grad_norm": 1.25,
"learning_rate": 0.0002506520118669794,
"loss": 2.6829,
"step": 957
},
{
"epoch": 0.27344755227601647,
"grad_norm": 1.3671875,
"learning_rate": 0.0002505512214980873,
"loss": 2.6869,
"step": 958
},
{
"epoch": 0.27373298813434216,
"grad_norm": 1.0390625,
"learning_rate": 0.0002504503486149581,
"loss": 2.6919,
"step": 959
},
{
"epoch": 0.27401842399266785,
"grad_norm": 1.65625,
"learning_rate": 0.00025034939330037,
"loss": 2.6851,
"step": 960
},
{
"epoch": 0.27430385985099354,
"grad_norm": 1.34375,
"learning_rate": 0.0002502483556371688,
"loss": 2.7326,
"step": 961
},
{
"epoch": 0.2745892957093193,
"grad_norm": 1.8515625,
"learning_rate": 0.00025014723570826794,
"loss": 2.7369,
"step": 962
},
{
"epoch": 0.27487473156764497,
"grad_norm": 1.6640625,
"learning_rate": 0.00025004603359664833,
"loss": 2.7398,
"step": 963
},
{
"epoch": 0.27516016742597066,
"grad_norm": 1.6875,
"learning_rate": 0.0002499447493853583,
"loss": 2.7145,
"step": 964
},
{
"epoch": 0.27544560328429635,
"grad_norm": 1.4921875,
"learning_rate": 0.00024984338315751366,
"loss": 2.733,
"step": 965
},
{
"epoch": 0.27573103914262204,
"grad_norm": 1.7578125,
"learning_rate": 0.00024974193499629745,
"loss": 2.707,
"step": 966
},
{
"epoch": 0.2760164750009477,
"grad_norm": 1.6171875,
"learning_rate": 0.00024964040498496,
"loss": 2.7282,
"step": 967
},
{
"epoch": 0.2763019108592734,
"grad_norm": 1.765625,
"learning_rate": 0.00024953879320681853,
"loss": 2.7208,
"step": 968
},
{
"epoch": 0.2765873467175991,
"grad_norm": 1.5625,
"learning_rate": 0.00024943709974525793,
"loss": 2.7021,
"step": 969
},
{
"epoch": 0.27687278257592485,
"grad_norm": 1.609375,
"learning_rate": 0.00024933532468372955,
"loss": 2.7056,
"step": 970
},
{
"epoch": 0.27715821843425054,
"grad_norm": 1.515625,
"learning_rate": 0.00024923346810575193,
"loss": 2.7342,
"step": 971
},
{
"epoch": 0.2774436542925762,
"grad_norm": 1.5703125,
"learning_rate": 0.0002491315300949106,
"loss": 2.7258,
"step": 972
},
{
"epoch": 0.2777290901509019,
"grad_norm": 1.40625,
"learning_rate": 0.00024902951073485784,
"loss": 2.7053,
"step": 973
},
{
"epoch": 0.2780145260092276,
"grad_norm": 1.609375,
"learning_rate": 0.00024892741010931264,
"loss": 2.7111,
"step": 974
},
{
"epoch": 0.2782999618675533,
"grad_norm": 1.390625,
"learning_rate": 0.0002488252283020606,
"loss": 2.6961,
"step": 975
},
{
"epoch": 0.278585397725879,
"grad_norm": 1.7421875,
"learning_rate": 0.00024872296539695427,
"loss": 2.7148,
"step": 976
},
{
"epoch": 0.27887083358420467,
"grad_norm": 1.5078125,
"learning_rate": 0.00024862062147791233,
"loss": 2.7192,
"step": 977
},
{
"epoch": 0.2791562694425304,
"grad_norm": 1.671875,
"learning_rate": 0.00024851819662892016,
"loss": 2.725,
"step": 978
},
{
"epoch": 0.2794417053008561,
"grad_norm": 1.546875,
"learning_rate": 0.0002484156909340296,
"loss": 2.7303,
"step": 979
},
{
"epoch": 0.2797271411591818,
"grad_norm": 1.703125,
"learning_rate": 0.00024831310447735874,
"loss": 2.6735,
"step": 980
},
{
"epoch": 0.2800125770175075,
"grad_norm": 1.4921875,
"learning_rate": 0.00024821043734309204,
"loss": 2.6935,
"step": 981
},
{
"epoch": 0.28029801287583317,
"grad_norm": 1.7890625,
"learning_rate": 0.0002481076896154799,
"loss": 2.7103,
"step": 982
},
{
"epoch": 0.28058344873415886,
"grad_norm": 1.6328125,
"learning_rate": 0.00024800486137883926,
"loss": 2.7239,
"step": 983
},
{
"epoch": 0.28086888459248455,
"grad_norm": 1.6875,
"learning_rate": 0.00024790195271755277,
"loss": 2.7289,
"step": 984
},
{
"epoch": 0.28115432045081024,
"grad_norm": 1.5078125,
"learning_rate": 0.0002477989637160694,
"loss": 2.7095,
"step": 985
},
{
"epoch": 0.281439756309136,
"grad_norm": 1.7578125,
"learning_rate": 0.0002476958944589037,
"loss": 2.6648,
"step": 986
},
{
"epoch": 0.28172519216746167,
"grad_norm": 1.546875,
"learning_rate": 0.0002475927450306363,
"loss": 2.666,
"step": 987
},
{
"epoch": 0.28201062802578736,
"grad_norm": 1.796875,
"learning_rate": 0.00024748951551591364,
"loss": 2.7152,
"step": 988
},
{
"epoch": 0.28229606388411305,
"grad_norm": 1.578125,
"learning_rate": 0.00024738620599944774,
"loss": 2.7102,
"step": 989
},
{
"epoch": 0.28258149974243874,
"grad_norm": 1.703125,
"learning_rate": 0.0002472828165660164,
"loss": 2.7055,
"step": 990
},
{
"epoch": 0.2828669356007644,
"grad_norm": 1.5625,
"learning_rate": 0.0002471793473004629,
"loss": 2.7004,
"step": 991
},
{
"epoch": 0.2831523714590901,
"grad_norm": 1.734375,
"learning_rate": 0.0002470757982876961,
"loss": 2.6998,
"step": 992
},
{
"epoch": 0.2834378073174158,
"grad_norm": 1.5546875,
"learning_rate": 0.00024697216961269035,
"loss": 2.7259,
"step": 993
},
{
"epoch": 0.28372324317574155,
"grad_norm": 1.6953125,
"learning_rate": 0.0002468684613604852,
"loss": 2.6939,
"step": 994
},
{
"epoch": 0.28400867903406724,
"grad_norm": 1.6015625,
"learning_rate": 0.00024676467361618563,
"loss": 2.7005,
"step": 995
},
{
"epoch": 0.2842941148923929,
"grad_norm": 1.6015625,
"learning_rate": 0.00024666080646496187,
"loss": 2.7153,
"step": 996
},
{
"epoch": 0.2845795507507186,
"grad_norm": 1.4296875,
"learning_rate": 0.0002465568599920493,
"loss": 2.7052,
"step": 997
},
{
"epoch": 0.2848649866090443,
"grad_norm": 1.6171875,
"learning_rate": 0.0002464528342827482,
"loss": 2.7191,
"step": 998
},
{
"epoch": 0.28515042246737,
"grad_norm": 1.5546875,
"learning_rate": 0.00024634872942242423,
"loss": 2.7117,
"step": 999
},
{
"epoch": 0.2854358583256957,
"grad_norm": 1.734375,
"learning_rate": 0.0002462445454965077,
"loss": 2.6923,
"step": 1000
},
{
"epoch": 0.2854358583256957,
"eval_loss": 2.571556806564331,
"eval_runtime": 5980.855,
"eval_samples_per_second": 10.749,
"eval_steps_per_second": 10.749,
"step": 1000
},
{
"epoch": 0.28572129418402137,
"grad_norm": 1.578125,
"learning_rate": 0.00024614028259049397,
"loss": 2.6922,
"step": 1001
},
{
"epoch": 0.2860067300423471,
"grad_norm": 1.5625,
"learning_rate": 0.0002460359407899431,
"loss": 2.7178,
"step": 1002
},
{
"epoch": 0.2862921659006728,
"grad_norm": 1.4609375,
"learning_rate": 0.00024593152018048,
"loss": 2.696,
"step": 1003
},
{
"epoch": 0.2865776017589985,
"grad_norm": 1.625,
"learning_rate": 0.00024582702084779414,
"loss": 2.6841,
"step": 1004
},
{
"epoch": 0.2868630376173242,
"grad_norm": 1.4140625,
"learning_rate": 0.00024572244287763976,
"loss": 2.6869,
"step": 1005
},
{
"epoch": 0.28714847347564987,
"grad_norm": 1.5546875,
"learning_rate": 0.0002456177863558354,
"loss": 2.7185,
"step": 1006
},
{
"epoch": 0.28743390933397556,
"grad_norm": 1.4140625,
"learning_rate": 0.00024551305136826424,
"loss": 2.69,
"step": 1007
},
{
"epoch": 0.28771934519230125,
"grad_norm": 1.6171875,
"learning_rate": 0.00024540823800087386,
"loss": 2.6593,
"step": 1008
},
{
"epoch": 0.28800478105062693,
"grad_norm": 1.3984375,
"learning_rate": 0.00024530334633967595,
"loss": 2.6818,
"step": 1009
},
{
"epoch": 0.2882902169089526,
"grad_norm": 1.5390625,
"learning_rate": 0.00024519837647074674,
"loss": 2.7043,
"step": 1010
},
{
"epoch": 0.28857565276727837,
"grad_norm": 1.40625,
"learning_rate": 0.00024509332848022636,
"loss": 2.7057,
"step": 1011
},
{
"epoch": 0.28886108862560406,
"grad_norm": 1.5,
"learning_rate": 0.0002449882024543193,
"loss": 2.6855,
"step": 1012
},
{
"epoch": 0.28914652448392975,
"grad_norm": 1.3515625,
"learning_rate": 0.00024488299847929385,
"loss": 2.7012,
"step": 1013
},
{
"epoch": 0.28943196034225543,
"grad_norm": 1.5390625,
"learning_rate": 0.0002447777166414825,
"loss": 2.7178,
"step": 1014
},
{
"epoch": 0.2897173962005811,
"grad_norm": 1.5625,
"learning_rate": 0.0002446723570272814,
"loss": 2.6926,
"step": 1015
},
{
"epoch": 0.2900028320589068,
"grad_norm": 1.21875,
"learning_rate": 0.00024456691972315076,
"loss": 2.6914,
"step": 1016
},
{
"epoch": 0.2902882679172325,
"grad_norm": 1.0390625,
"learning_rate": 0.0002444614048156144,
"loss": 2.6794,
"step": 1017
},
{
"epoch": 0.2905737037755582,
"grad_norm": 1.4375,
"learning_rate": 0.00024435581239125987,
"loss": 2.7046,
"step": 1018
},
{
"epoch": 0.29085913963388393,
"grad_norm": 1.09375,
"learning_rate": 0.0002442501425367382,
"loss": 2.6849,
"step": 1019
},
{
"epoch": 0.2911445754922096,
"grad_norm": 1.796875,
"learning_rate": 0.0002441443953387642,
"loss": 2.6808,
"step": 1020
},
{
"epoch": 0.2914300113505353,
"grad_norm": 1.65625,
"learning_rate": 0.000244038570884116,
"loss": 2.6968,
"step": 1021
},
{
"epoch": 0.291715447208861,
"grad_norm": 1.5,
"learning_rate": 0.00024393266925963505,
"loss": 2.6755,
"step": 1022
},
{
"epoch": 0.2920008830671867,
"grad_norm": 1.4765625,
"learning_rate": 0.00024382669055222634,
"loss": 2.7195,
"step": 1023
},
{
"epoch": 0.2922863189255124,
"grad_norm": 1.1484375,
"learning_rate": 0.000243720634848858,
"loss": 2.6943,
"step": 1024
},
{
"epoch": 0.29257175478383807,
"grad_norm": 1.1640625,
"learning_rate": 0.0002436145022365613,
"loss": 2.7172,
"step": 1025
},
{
"epoch": 0.29285719064216376,
"grad_norm": 1.390625,
"learning_rate": 0.00024350829280243074,
"loss": 2.7061,
"step": 1026
},
{
"epoch": 0.2931426265004895,
"grad_norm": 1.3359375,
"learning_rate": 0.00024340200663362368,
"loss": 2.6897,
"step": 1027
},
{
"epoch": 0.2934280623588152,
"grad_norm": 0.96484375,
"learning_rate": 0.00024329564381736068,
"loss": 2.691,
"step": 1028
},
{
"epoch": 0.2937134982171409,
"grad_norm": 0.8828125,
"learning_rate": 0.000243189204440925,
"loss": 2.7367,
"step": 1029
},
{
"epoch": 0.29399893407546657,
"grad_norm": 1.171875,
"learning_rate": 0.0002430826885916629,
"loss": 2.6964,
"step": 1030
},
{
"epoch": 0.29428436993379226,
"grad_norm": 1.1796875,
"learning_rate": 0.0002429760963569832,
"loss": 2.7204,
"step": 1031
},
{
"epoch": 0.29456980579211794,
"grad_norm": 1.90625,
"learning_rate": 0.00024286942782435753,
"loss": 2.7186,
"step": 1032
},
{
"epoch": 0.29485524165044363,
"grad_norm": 1.1328125,
"learning_rate": 0.0002427626830813202,
"loss": 2.6901,
"step": 1033
},
{
"epoch": 0.2951406775087693,
"grad_norm": 1.2890625,
"learning_rate": 0.0002426558622154679,
"loss": 2.7291,
"step": 1034
},
{
"epoch": 0.29542611336709507,
"grad_norm": 1.875,
"learning_rate": 0.0002425489653144598,
"loss": 2.717,
"step": 1035
},
{
"epoch": 0.29571154922542076,
"grad_norm": 0.71484375,
"learning_rate": 0.0002424419924660176,
"loss": 2.7074,
"step": 1036
},
{
"epoch": 0.29599698508374644,
"grad_norm": 2.03125,
"learning_rate": 0.00024233494375792524,
"loss": 2.7174,
"step": 1037
},
{
"epoch": 0.29628242094207213,
"grad_norm": 1.1640625,
"learning_rate": 0.00024222781927802888,
"loss": 2.6859,
"step": 1038
},
{
"epoch": 0.2965678568003978,
"grad_norm": 2.421875,
"learning_rate": 0.0002421206191142369,
"loss": 2.6916,
"step": 1039
},
{
"epoch": 0.2968532926587235,
"grad_norm": 1.8984375,
"learning_rate": 0.00024201334335451988,
"loss": 2.7098,
"step": 1040
},
{
"epoch": 0.2971387285170492,
"grad_norm": 2.09375,
"learning_rate": 0.0002419059920869102,
"loss": 2.7105,
"step": 1041
},
{
"epoch": 0.2974241643753749,
"grad_norm": 1.65625,
"learning_rate": 0.0002417985653995024,
"loss": 2.7329,
"step": 1042
},
{
"epoch": 0.29770960023370063,
"grad_norm": 2.328125,
"learning_rate": 0.0002416910633804529,
"loss": 2.6864,
"step": 1043
},
{
"epoch": 0.2979950360920263,
"grad_norm": 1.6640625,
"learning_rate": 0.00024158348611797985,
"loss": 2.6915,
"step": 1044
},
{
"epoch": 0.298280471950352,
"grad_norm": 2.578125,
"learning_rate": 0.0002414758337003632,
"loss": 2.71,
"step": 1045
},
{
"epoch": 0.2985659078086777,
"grad_norm": 2.421875,
"learning_rate": 0.00024136810621594454,
"loss": 2.7174,
"step": 1046
},
{
"epoch": 0.2988513436670034,
"grad_norm": 1.2578125,
"learning_rate": 0.0002412603037531271,
"loss": 2.7106,
"step": 1047
},
{
"epoch": 0.2991367795253291,
"grad_norm": 1.5390625,
"learning_rate": 0.00024115242640037569,
"loss": 2.7032,
"step": 1048
},
{
"epoch": 0.29942221538365477,
"grad_norm": 1.2421875,
"learning_rate": 0.0002410444742462164,
"loss": 2.6975,
"step": 1049
},
{
"epoch": 0.29970765124198046,
"grad_norm": 1.484375,
"learning_rate": 0.00024093644737923682,
"loss": 2.6909,
"step": 1050
},
{
"epoch": 0.2999930871003062,
"grad_norm": 1.1484375,
"learning_rate": 0.00024082834588808592,
"loss": 2.7097,
"step": 1051
},
{
"epoch": 0.3002785229586319,
"grad_norm": 1.640625,
"learning_rate": 0.0002407201698614738,
"loss": 2.7031,
"step": 1052
},
{
"epoch": 0.3005639588169576,
"grad_norm": 1.2734375,
"learning_rate": 0.0002406119193881718,
"loss": 2.6834,
"step": 1053
},
{
"epoch": 0.30084939467528327,
"grad_norm": 1.953125,
"learning_rate": 0.00024050359455701217,
"loss": 2.7092,
"step": 1054
},
{
"epoch": 0.30113483053360895,
"grad_norm": 1.7734375,
"learning_rate": 0.00024039519545688846,
"loss": 2.6838,
"step": 1055
},
{
"epoch": 0.30142026639193464,
"grad_norm": 1.7265625,
"learning_rate": 0.00024028672217675493,
"loss": 2.7051,
"step": 1056
},
{
"epoch": 0.30170570225026033,
"grad_norm": 1.5625,
"learning_rate": 0.00024017817480562686,
"loss": 2.698,
"step": 1057
},
{
"epoch": 0.301991138108586,
"grad_norm": 1.59375,
"learning_rate": 0.00024006955343258032,
"loss": 2.6918,
"step": 1058
},
{
"epoch": 0.30227657396691177,
"grad_norm": 1.46875,
"learning_rate": 0.00023996085814675198,
"loss": 2.7027,
"step": 1059
},
{
"epoch": 0.30256200982523745,
"grad_norm": 1.34375,
"learning_rate": 0.0002398520890373393,
"loss": 2.6585,
"step": 1060
},
{
"epoch": 0.30284744568356314,
"grad_norm": 1.3671875,
"learning_rate": 0.00023974324619360028,
"loss": 2.7134,
"step": 1061
},
{
"epoch": 0.30313288154188883,
"grad_norm": 1.1328125,
"learning_rate": 0.00023963432970485333,
"loss": 2.7017,
"step": 1062
},
{
"epoch": 0.3034183174002145,
"grad_norm": 1.328125,
"learning_rate": 0.0002395253396604775,
"loss": 2.7121,
"step": 1063
},
{
"epoch": 0.3037037532585402,
"grad_norm": 1.1015625,
"learning_rate": 0.00023941627614991205,
"loss": 2.6666,
"step": 1064
},
{
"epoch": 0.3039891891168659,
"grad_norm": 1.3203125,
"learning_rate": 0.00023930713926265652,
"loss": 2.6927,
"step": 1065
},
{
"epoch": 0.3042746249751916,
"grad_norm": 1.0546875,
"learning_rate": 0.00023919792908827072,
"loss": 2.6844,
"step": 1066
},
{
"epoch": 0.30456006083351733,
"grad_norm": 1.3125,
"learning_rate": 0.00023908864571637464,
"loss": 2.6666,
"step": 1067
},
{
"epoch": 0.304845496691843,
"grad_norm": 1.03125,
"learning_rate": 0.00023897928923664825,
"loss": 2.6676,
"step": 1068
},
{
"epoch": 0.3051309325501687,
"grad_norm": 1.3671875,
"learning_rate": 0.00023886985973883157,
"loss": 2.7065,
"step": 1069
},
{
"epoch": 0.3054163684084944,
"grad_norm": 1.09375,
"learning_rate": 0.00023876035731272444,
"loss": 2.6579,
"step": 1070
},
{
"epoch": 0.3057018042668201,
"grad_norm": 1.65625,
"learning_rate": 0.00023865078204818676,
"loss": 2.6919,
"step": 1071
},
{
"epoch": 0.3059872401251458,
"grad_norm": 1.3515625,
"learning_rate": 0.0002385411340351379,
"loss": 2.6779,
"step": 1072
},
{
"epoch": 0.30627267598347147,
"grad_norm": 1.59375,
"learning_rate": 0.00023843141336355725,
"loss": 2.6798,
"step": 1073
},
{
"epoch": 0.30655811184179715,
"grad_norm": 1.4609375,
"learning_rate": 0.0002383216201234836,
"loss": 2.6775,
"step": 1074
},
{
"epoch": 0.3068435477001229,
"grad_norm": 1.4765625,
"learning_rate": 0.00023821175440501535,
"loss": 2.693,
"step": 1075
},
{
"epoch": 0.3071289835584486,
"grad_norm": 1.328125,
"learning_rate": 0.00023810181629831042,
"loss": 2.6807,
"step": 1076
},
{
"epoch": 0.3074144194167743,
"grad_norm": 1.3125,
"learning_rate": 0.0002379918058935861,
"loss": 2.6583,
"step": 1077
},
{
"epoch": 0.30769985527509996,
"grad_norm": 1.1171875,
"learning_rate": 0.00023788172328111903,
"loss": 2.6784,
"step": 1078
},
{
"epoch": 0.30798529113342565,
"grad_norm": 1.40625,
"learning_rate": 0.00023777156855124505,
"loss": 2.6992,
"step": 1079
},
{
"epoch": 0.30827072699175134,
"grad_norm": 1.0390625,
"learning_rate": 0.00023766134179435921,
"loss": 2.7007,
"step": 1080
},
{
"epoch": 0.30855616285007703,
"grad_norm": 1.5390625,
"learning_rate": 0.0002375510431009157,
"loss": 2.698,
"step": 1081
},
{
"epoch": 0.3088415987084027,
"grad_norm": 1.2109375,
"learning_rate": 0.00023744067256142775,
"loss": 2.6982,
"step": 1082
},
{
"epoch": 0.3091270345667284,
"grad_norm": 1.7421875,
"learning_rate": 0.00023733023026646744,
"loss": 2.732,
"step": 1083
},
{
"epoch": 0.30941247042505415,
"grad_norm": 1.53125,
"learning_rate": 0.00023721971630666589,
"loss": 2.7234,
"step": 1084
},
{
"epoch": 0.30969790628337984,
"grad_norm": 1.3828125,
"learning_rate": 0.00023710913077271286,
"loss": 2.6996,
"step": 1085
},
{
"epoch": 0.30998334214170553,
"grad_norm": 1.3359375,
"learning_rate": 0.00023699847375535698,
"loss": 2.7038,
"step": 1086
},
{
"epoch": 0.3102687780000312,
"grad_norm": 1.296875,
"learning_rate": 0.00023688774534540554,
"loss": 2.6705,
"step": 1087
},
{
"epoch": 0.3105542138583569,
"grad_norm": 1.1484375,
"learning_rate": 0.0002367769456337243,
"loss": 2.6632,
"step": 1088
},
{
"epoch": 0.3108396497166826,
"grad_norm": 1.296875,
"learning_rate": 0.00023666607471123767,
"loss": 2.6572,
"step": 1089
},
{
"epoch": 0.3111250855750083,
"grad_norm": 1.09375,
"learning_rate": 0.0002365551326689283,
"loss": 2.68,
"step": 1090
},
{
"epoch": 0.311410521433334,
"grad_norm": 1.625,
"learning_rate": 0.0002364441195978375,
"loss": 2.6704,
"step": 1091
},
{
"epoch": 0.3116959572916597,
"grad_norm": 1.359375,
"learning_rate": 0.0002363330355890646,
"loss": 2.6514,
"step": 1092
},
{
"epoch": 0.3119813931499854,
"grad_norm": 1.4765625,
"learning_rate": 0.00023622188073376728,
"loss": 2.6773,
"step": 1093
},
{
"epoch": 0.3122668290083111,
"grad_norm": 1.34375,
"learning_rate": 0.00023611065512316127,
"loss": 2.6896,
"step": 1094
},
{
"epoch": 0.3125522648666368,
"grad_norm": 1.3515625,
"learning_rate": 0.00023599935884852045,
"loss": 2.7068,
"step": 1095
},
{
"epoch": 0.3128377007249625,
"grad_norm": 1.21875,
"learning_rate": 0.00023588799200117662,
"loss": 2.6837,
"step": 1096
},
{
"epoch": 0.31312313658328816,
"grad_norm": 1.3828125,
"learning_rate": 0.00023577655467251963,
"loss": 2.6873,
"step": 1097
},
{
"epoch": 0.31340857244161385,
"grad_norm": 1.234375,
"learning_rate": 0.0002356650469539969,
"loss": 2.6891,
"step": 1098
},
{
"epoch": 0.31369400829993954,
"grad_norm": 1.296875,
"learning_rate": 0.0002355534689371139,
"loss": 2.6888,
"step": 1099
},
{
"epoch": 0.3139794441582653,
"grad_norm": 1.1796875,
"learning_rate": 0.00023544182071343363,
"loss": 2.6745,
"step": 1100
},
{
"epoch": 0.314264880016591,
"grad_norm": 1.3046875,
"learning_rate": 0.00023533010237457674,
"loss": 2.6668,
"step": 1101
},
{
"epoch": 0.31455031587491666,
"grad_norm": 1.1171875,
"learning_rate": 0.00023521831401222132,
"loss": 2.6679,
"step": 1102
},
{
"epoch": 0.31483575173324235,
"grad_norm": 1.578125,
"learning_rate": 0.00023510645571810316,
"loss": 2.693,
"step": 1103
},
{
"epoch": 0.31512118759156804,
"grad_norm": 1.34375,
"learning_rate": 0.00023499452758401525,
"loss": 2.6966,
"step": 1104
},
{
"epoch": 0.31540662344989373,
"grad_norm": 1.59375,
"learning_rate": 0.00023488252970180792,
"loss": 2.6786,
"step": 1105
},
{
"epoch": 0.3156920593082194,
"grad_norm": 1.3984375,
"learning_rate": 0.00023477046216338875,
"loss": 2.6579,
"step": 1106
},
{
"epoch": 0.3159774951665451,
"grad_norm": 1.515625,
"learning_rate": 0.0002346583250607225,
"loss": 2.6717,
"step": 1107
},
{
"epoch": 0.31626293102487085,
"grad_norm": 1.421875,
"learning_rate": 0.00023454611848583104,
"loss": 2.6939,
"step": 1108
},
{
"epoch": 0.31654836688319654,
"grad_norm": 1.390625,
"learning_rate": 0.00023443384253079308,
"loss": 2.658,
"step": 1109
},
{
"epoch": 0.31683380274152223,
"grad_norm": 1.21875,
"learning_rate": 0.00023432149728774455,
"loss": 2.6733,
"step": 1110
},
{
"epoch": 0.3171192385998479,
"grad_norm": 1.5546875,
"learning_rate": 0.000234209082848878,
"loss": 2.6814,
"step": 1111
},
{
"epoch": 0.3174046744581736,
"grad_norm": 1.2421875,
"learning_rate": 0.00023409659930644287,
"loss": 2.67,
"step": 1112
},
{
"epoch": 0.3176901103164993,
"grad_norm": 1.8359375,
"learning_rate": 0.00023398404675274522,
"loss": 2.6662,
"step": 1113
},
{
"epoch": 0.317975546174825,
"grad_norm": 1.7578125,
"learning_rate": 0.00023387142528014798,
"loss": 2.6935,
"step": 1114
},
{
"epoch": 0.3182609820331507,
"grad_norm": 1.296875,
"learning_rate": 0.00023375873498107026,
"loss": 2.6746,
"step": 1115
},
{
"epoch": 0.3185464178914764,
"grad_norm": 1.3125,
"learning_rate": 0.00023364597594798802,
"loss": 2.6977,
"step": 1116
},
{
"epoch": 0.3188318537498021,
"grad_norm": 1.453125,
"learning_rate": 0.0002335331482734333,
"loss": 2.6889,
"step": 1117
},
{
"epoch": 0.3191172896081278,
"grad_norm": 1.09375,
"learning_rate": 0.00023342025204999472,
"loss": 2.6725,
"step": 1118
},
{
"epoch": 0.3194027254664535,
"grad_norm": 1.8203125,
"learning_rate": 0.0002333072873703171,
"loss": 2.669,
"step": 1119
},
{
"epoch": 0.3196881613247792,
"grad_norm": 1.640625,
"learning_rate": 0.00023319425432710136,
"loss": 2.691,
"step": 1120
},
{
"epoch": 0.31997359718310486,
"grad_norm": 1.5859375,
"learning_rate": 0.0002330811530131045,
"loss": 2.6734,
"step": 1121
},
{
"epoch": 0.32025903304143055,
"grad_norm": 1.53125,
"learning_rate": 0.0002329679835211397,
"loss": 2.6915,
"step": 1122
},
{
"epoch": 0.32054446889975624,
"grad_norm": 1.421875,
"learning_rate": 0.00023285474594407585,
"loss": 2.6766,
"step": 1123
},
{
"epoch": 0.320829904758082,
"grad_norm": 1.2890625,
"learning_rate": 0.000232741440374838,
"loss": 2.6737,
"step": 1124
},
{
"epoch": 0.3211153406164077,
"grad_norm": 1.484375,
"learning_rate": 0.00023262806690640673,
"loss": 2.6618,
"step": 1125
},
{
"epoch": 0.32140077647473336,
"grad_norm": 1.2578125,
"learning_rate": 0.00023251462563181853,
"loss": 2.7,
"step": 1126
},
{
"epoch": 0.32168621233305905,
"grad_norm": 1.6484375,
"learning_rate": 0.00023240111664416544,
"loss": 2.6777,
"step": 1127
},
{
"epoch": 0.32197164819138474,
"grad_norm": 1.4765625,
"learning_rate": 0.0002322875400365951,
"loss": 2.6749,
"step": 1128
},
{
"epoch": 0.32225708404971043,
"grad_norm": 1.5703125,
"learning_rate": 0.00023217389590231058,
"loss": 2.6936,
"step": 1129
},
{
"epoch": 0.3225425199080361,
"grad_norm": 1.3359375,
"learning_rate": 0.00023206018433457045,
"loss": 2.6419,
"step": 1130
},
{
"epoch": 0.3228279557663618,
"grad_norm": 1.453125,
"learning_rate": 0.00023194640542668855,
"loss": 2.6704,
"step": 1131
},
{
"epoch": 0.32311339162468755,
"grad_norm": 1.3125,
"learning_rate": 0.00023183255927203405,
"loss": 2.7011,
"step": 1132
},
{
"epoch": 0.32339882748301324,
"grad_norm": 1.40625,
"learning_rate": 0.00023171864596403116,
"loss": 2.683,
"step": 1133
},
{
"epoch": 0.32368426334133893,
"grad_norm": 1.1875,
"learning_rate": 0.00023160466559615946,
"loss": 2.7078,
"step": 1134
},
{
"epoch": 0.3239696991996646,
"grad_norm": 1.2734375,
"learning_rate": 0.00023149061826195327,
"loss": 2.6919,
"step": 1135
},
{
"epoch": 0.3242551350579903,
"grad_norm": 1.09375,
"learning_rate": 0.00023137650405500202,
"loss": 2.6554,
"step": 1136
},
{
"epoch": 0.324540570916316,
"grad_norm": 1.3046875,
"learning_rate": 0.00023126232306895,
"loss": 2.6734,
"step": 1137
},
{
"epoch": 0.3248260067746417,
"grad_norm": 1.1484375,
"learning_rate": 0.0002311480753974963,
"loss": 2.6794,
"step": 1138
},
{
"epoch": 0.3251114426329674,
"grad_norm": 1.296875,
"learning_rate": 0.00023103376113439472,
"loss": 2.6802,
"step": 1139
},
{
"epoch": 0.3253968784912931,
"grad_norm": 1.046875,
"learning_rate": 0.0002309193803734537,
"loss": 2.6811,
"step": 1140
},
{
"epoch": 0.3256823143496188,
"grad_norm": 1.5,
"learning_rate": 0.00023080493320853628,
"loss": 2.671,
"step": 1141
},
{
"epoch": 0.3259677502079445,
"grad_norm": 1.0859375,
"learning_rate": 0.00023069041973355992,
"loss": 2.6759,
"step": 1142
},
{
"epoch": 0.3262531860662702,
"grad_norm": 1.5859375,
"learning_rate": 0.00023057584004249662,
"loss": 2.682,
"step": 1143
},
{
"epoch": 0.3265386219245959,
"grad_norm": 1.3515625,
"learning_rate": 0.00023046119422937258,
"loss": 2.6591,
"step": 1144
},
{
"epoch": 0.32682405778292156,
"grad_norm": 1.6171875,
"learning_rate": 0.00023034648238826836,
"loss": 2.6607,
"step": 1145
},
{
"epoch": 0.32710949364124725,
"grad_norm": 1.4140625,
"learning_rate": 0.00023023170461331863,
"loss": 2.6512,
"step": 1146
},
{
"epoch": 0.32739492949957294,
"grad_norm": 1.6171875,
"learning_rate": 0.0002301168609987123,
"loss": 2.6913,
"step": 1147
},
{
"epoch": 0.3276803653578987,
"grad_norm": 1.46875,
"learning_rate": 0.00023000195163869216,
"loss": 2.6783,
"step": 1148
},
{
"epoch": 0.3279658012162244,
"grad_norm": 1.5546875,
"learning_rate": 0.0002298869766275549,
"loss": 2.6467,
"step": 1149
},
{
"epoch": 0.32825123707455006,
"grad_norm": 1.40625,
"learning_rate": 0.00022977193605965143,
"loss": 2.7,
"step": 1150
},
{
"epoch": 0.32853667293287575,
"grad_norm": 1.4453125,
"learning_rate": 0.000229656830029386,
"loss": 2.6604,
"step": 1151
},
{
"epoch": 0.32882210879120144,
"grad_norm": 1.328125,
"learning_rate": 0.0002295416586312169,
"loss": 2.6538,
"step": 1152
},
{
"epoch": 0.32910754464952713,
"grad_norm": 1.28125,
"learning_rate": 0.00022942642195965596,
"loss": 2.69,
"step": 1153
},
{
"epoch": 0.3293929805078528,
"grad_norm": 1.2109375,
"learning_rate": 0.0002293111201092686,
"loss": 2.6806,
"step": 1154
},
{
"epoch": 0.3296784163661785,
"grad_norm": 1.203125,
"learning_rate": 0.00022919575317467358,
"loss": 2.6815,
"step": 1155
},
{
"epoch": 0.3299638522245042,
"grad_norm": 1.0390625,
"learning_rate": 0.0002290803212505433,
"loss": 2.6887,
"step": 1156
},
{
"epoch": 0.33024928808282994,
"grad_norm": 1.5078125,
"learning_rate": 0.00022896482443160335,
"loss": 2.6799,
"step": 1157
},
{
"epoch": 0.33053472394115563,
"grad_norm": 1.34375,
"learning_rate": 0.00022884926281263265,
"loss": 2.6802,
"step": 1158
},
{
"epoch": 0.3308201597994813,
"grad_norm": 1.3984375,
"learning_rate": 0.00022873363648846318,
"loss": 2.6585,
"step": 1159
},
{
"epoch": 0.331105595657807,
"grad_norm": 1.3203125,
"learning_rate": 0.00022861794555398016,
"loss": 2.6746,
"step": 1160
},
{
"epoch": 0.3313910315161327,
"grad_norm": 1.40625,
"learning_rate": 0.0002285021901041217,
"loss": 2.6856,
"step": 1161
},
{
"epoch": 0.3316764673744584,
"grad_norm": 1.234375,
"learning_rate": 0.000228386370233879,
"loss": 2.6456,
"step": 1162
},
{
"epoch": 0.3319619032327841,
"grad_norm": 1.484375,
"learning_rate": 0.00022827048603829596,
"loss": 2.6973,
"step": 1163
},
{
"epoch": 0.33224733909110976,
"grad_norm": 1.3359375,
"learning_rate": 0.0002281545376124694,
"loss": 2.665,
"step": 1164
},
{
"epoch": 0.3325327749494355,
"grad_norm": 1.515625,
"learning_rate": 0.00022803852505154867,
"loss": 2.666,
"step": 1165
},
{
"epoch": 0.3328182108077612,
"grad_norm": 1.390625,
"learning_rate": 0.00022792244845073608,
"loss": 2.6748,
"step": 1166
},
{
"epoch": 0.3331036466660869,
"grad_norm": 1.40625,
"learning_rate": 0.00022780630790528617,
"loss": 2.6593,
"step": 1167
},
{
"epoch": 0.33338908252441257,
"grad_norm": 1.296875,
"learning_rate": 0.00022769010351050606,
"loss": 2.6485,
"step": 1168
},
{
"epoch": 0.33367451838273826,
"grad_norm": 1.375,
"learning_rate": 0.00022757383536175529,
"loss": 2.6684,
"step": 1169
},
{
"epoch": 0.33395995424106395,
"grad_norm": 1.203125,
"learning_rate": 0.00022745750355444573,
"loss": 2.6508,
"step": 1170
},
{
"epoch": 0.33424539009938964,
"grad_norm": 1.4453125,
"learning_rate": 0.00022734110818404144,
"loss": 2.6546,
"step": 1171
},
{
"epoch": 0.3345308259577153,
"grad_norm": 1.3828125,
"learning_rate": 0.00022722464934605869,
"loss": 2.6864,
"step": 1172
},
{
"epoch": 0.33481626181604107,
"grad_norm": 1.5,
"learning_rate": 0.00022710812713606582,
"loss": 2.6611,
"step": 1173
},
{
"epoch": 0.33510169767436676,
"grad_norm": 1.328125,
"learning_rate": 0.00022699154164968307,
"loss": 2.6822,
"step": 1174
},
{
"epoch": 0.33538713353269245,
"grad_norm": 1.3671875,
"learning_rate": 0.0002268748929825828,
"loss": 2.6522,
"step": 1175
},
{
"epoch": 0.33567256939101814,
"grad_norm": 1.25,
"learning_rate": 0.0002267581812304891,
"loss": 2.6546,
"step": 1176
},
{
"epoch": 0.3359580052493438,
"grad_norm": 1.390625,
"learning_rate": 0.00022664140648917782,
"loss": 2.6711,
"step": 1177
},
{
"epoch": 0.3362434411076695,
"grad_norm": 1.15625,
"learning_rate": 0.00022652456885447652,
"loss": 2.6533,
"step": 1178
},
{
"epoch": 0.3365288769659952,
"grad_norm": 1.53125,
"learning_rate": 0.0002264076684222644,
"loss": 2.6659,
"step": 1179
},
{
"epoch": 0.3368143128243209,
"grad_norm": 1.359375,
"learning_rate": 0.00022629070528847216,
"loss": 2.6843,
"step": 1180
},
{
"epoch": 0.33709974868264664,
"grad_norm": 1.4375,
"learning_rate": 0.00022617367954908194,
"loss": 2.6654,
"step": 1181
},
{
"epoch": 0.3373851845409723,
"grad_norm": 1.2109375,
"learning_rate": 0.00022605659130012733,
"loss": 2.6624,
"step": 1182
},
{
"epoch": 0.337670620399298,
"grad_norm": 1.3828125,
"learning_rate": 0.00022593944063769314,
"loss": 2.6839,
"step": 1183
},
{
"epoch": 0.3379560562576237,
"grad_norm": 1.21875,
"learning_rate": 0.0002258222276579154,
"loss": 2.6787,
"step": 1184
},
{
"epoch": 0.3382414921159494,
"grad_norm": 1.375,
"learning_rate": 0.00022570495245698128,
"loss": 2.6928,
"step": 1185
},
{
"epoch": 0.3385269279742751,
"grad_norm": 1.2109375,
"learning_rate": 0.00022558761513112913,
"loss": 2.6999,
"step": 1186
},
{
"epoch": 0.33881236383260077,
"grad_norm": 1.3984375,
"learning_rate": 0.00022547021577664814,
"loss": 2.6904,
"step": 1187
},
{
"epoch": 0.33909779969092646,
"grad_norm": 1.171875,
"learning_rate": 0.00022535275448987832,
"loss": 2.6623,
"step": 1188
},
{
"epoch": 0.3393832355492522,
"grad_norm": 1.2734375,
"learning_rate": 0.00022523523136721085,
"loss": 2.6658,
"step": 1189
},
{
"epoch": 0.3396686714075779,
"grad_norm": 1.1171875,
"learning_rate": 0.00022511764650508728,
"loss": 2.6547,
"step": 1190
},
{
"epoch": 0.3399541072659036,
"grad_norm": 1.3359375,
"learning_rate": 0.000225,
"loss": 2.6677,
"step": 1191
},
{
"epoch": 0.34023954312422927,
"grad_norm": 1.171875,
"learning_rate": 0.00022488229194849192,
"loss": 2.6869,
"step": 1192
},
{
"epoch": 0.34052497898255496,
"grad_norm": 1.40625,
"learning_rate": 0.00022476452244715663,
"loss": 2.6773,
"step": 1193
},
{
"epoch": 0.34081041484088065,
"grad_norm": 1.1875,
"learning_rate": 0.00022464669159263793,
"loss": 2.6669,
"step": 1194
},
{
"epoch": 0.34109585069920634,
"grad_norm": 1.3515625,
"learning_rate": 0.00022452879948162998,
"loss": 2.64,
"step": 1195
},
{
"epoch": 0.341381286557532,
"grad_norm": 1.203125,
"learning_rate": 0.0002244108462108774,
"loss": 2.6452,
"step": 1196
},
{
"epoch": 0.34166672241585777,
"grad_norm": 1.3359375,
"learning_rate": 0.00022429283187717485,
"loss": 2.6339,
"step": 1197
},
{
"epoch": 0.34195215827418346,
"grad_norm": 1.1640625,
"learning_rate": 0.00022417475657736705,
"loss": 2.6572,
"step": 1198
},
{
"epoch": 0.34223759413250915,
"grad_norm": 1.3125,
"learning_rate": 0.00022405662040834895,
"loss": 2.646,
"step": 1199
},
{
"epoch": 0.34252302999083484,
"grad_norm": 1.1796875,
"learning_rate": 0.00022393842346706523,
"loss": 2.6676,
"step": 1200
},
{
"epoch": 0.3428084658491605,
"grad_norm": 1.171875,
"learning_rate": 0.00022382016585051058,
"loss": 2.6574,
"step": 1201
},
{
"epoch": 0.3430939017074862,
"grad_norm": 1.1171875,
"learning_rate": 0.00022370184765572944,
"loss": 2.6481,
"step": 1202
},
{
"epoch": 0.3433793375658119,
"grad_norm": 1.15625,
"learning_rate": 0.00022358346897981596,
"loss": 2.675,
"step": 1203
},
{
"epoch": 0.3436647734241376,
"grad_norm": 1.09375,
"learning_rate": 0.0002234650299199139,
"loss": 2.6475,
"step": 1204
},
{
"epoch": 0.34395020928246334,
"grad_norm": 1.0234375,
"learning_rate": 0.00022334653057321663,
"loss": 2.6372,
"step": 1205
},
{
"epoch": 0.344235645140789,
"grad_norm": 0.90625,
"learning_rate": 0.00022322797103696692,
"loss": 2.657,
"step": 1206
},
{
"epoch": 0.3445210809991147,
"grad_norm": 0.98828125,
"learning_rate": 0.00022310935140845706,
"loss": 2.6606,
"step": 1207
},
{
"epoch": 0.3448065168574404,
"grad_norm": 0.8515625,
"learning_rate": 0.0002229906717850284,
"loss": 2.6751,
"step": 1208
},
{
"epoch": 0.3450919527157661,
"grad_norm": 0.9140625,
"learning_rate": 0.00022287193226407185,
"loss": 2.6703,
"step": 1209
},
{
"epoch": 0.3453773885740918,
"grad_norm": 0.81640625,
"learning_rate": 0.00022275313294302726,
"loss": 2.6554,
"step": 1210
},
{
"epoch": 0.34566282443241747,
"grad_norm": 0.9375,
"learning_rate": 0.00022263427391938358,
"loss": 2.6401,
"step": 1211
},
{
"epoch": 0.34594826029074316,
"grad_norm": 0.78515625,
"learning_rate": 0.00022251535529067877,
"loss": 2.6659,
"step": 1212
},
{
"epoch": 0.3462336961490689,
"grad_norm": 0.984375,
"learning_rate": 0.00022239637715449977,
"loss": 2.6972,
"step": 1213
},
{
"epoch": 0.3465191320073946,
"grad_norm": 0.82421875,
"learning_rate": 0.0002222773396084822,
"loss": 2.6545,
"step": 1214
},
{
"epoch": 0.3468045678657203,
"grad_norm": 0.80859375,
"learning_rate": 0.0002221582427503106,
"loss": 2.6515,
"step": 1215
},
{
"epoch": 0.34709000372404597,
"grad_norm": 0.6953125,
"learning_rate": 0.00022203908667771808,
"loss": 2.6517,
"step": 1216
},
{
"epoch": 0.34737543958237166,
"grad_norm": 0.73828125,
"learning_rate": 0.00022191987148848636,
"loss": 2.6596,
"step": 1217
},
{
"epoch": 0.34766087544069735,
"grad_norm": 0.6640625,
"learning_rate": 0.0002218005972804457,
"loss": 2.6795,
"step": 1218
},
{
"epoch": 0.34794631129902304,
"grad_norm": 0.73828125,
"learning_rate": 0.00022168126415147478,
"loss": 2.6416,
"step": 1219
},
{
"epoch": 0.3482317471573487,
"grad_norm": 0.71875,
"learning_rate": 0.00022156187219950059,
"loss": 2.6384,
"step": 1220
},
{
"epoch": 0.34851718301567447,
"grad_norm": 0.69140625,
"learning_rate": 0.0002214424215224985,
"loss": 2.6574,
"step": 1221
},
{
"epoch": 0.34880261887400016,
"grad_norm": 0.77734375,
"learning_rate": 0.0002213229122184919,
"loss": 2.6864,
"step": 1222
},
{
"epoch": 0.34908805473232585,
"grad_norm": 0.796875,
"learning_rate": 0.0002212033443855525,
"loss": 2.6457,
"step": 1223
},
{
"epoch": 0.34937349059065154,
"grad_norm": 0.7265625,
"learning_rate": 0.0002210837181217998,
"loss": 2.6441,
"step": 1224
},
{
"epoch": 0.3496589264489772,
"grad_norm": 0.8203125,
"learning_rate": 0.0002209640335254015,
"loss": 2.6643,
"step": 1225
},
{
"epoch": 0.3499443623073029,
"grad_norm": 0.703125,
"learning_rate": 0.00022084429069457297,
"loss": 2.6436,
"step": 1226
},
{
"epoch": 0.3502297981656286,
"grad_norm": 0.80859375,
"learning_rate": 0.0002207244897275775,
"loss": 2.6485,
"step": 1227
},
{
"epoch": 0.3505152340239543,
"grad_norm": 0.80859375,
"learning_rate": 0.00022060463072272595,
"loss": 2.6534,
"step": 1228
},
{
"epoch": 0.35080066988228,
"grad_norm": 0.8203125,
"learning_rate": 0.00022048471377837697,
"loss": 2.6605,
"step": 1229
},
{
"epoch": 0.3510861057406057,
"grad_norm": 0.9296875,
"learning_rate": 0.0002203647389929367,
"loss": 2.6603,
"step": 1230
},
{
"epoch": 0.3513715415989314,
"grad_norm": 1.2578125,
"learning_rate": 0.00022024470646485862,
"loss": 2.6937,
"step": 1231
},
{
"epoch": 0.3516569774572571,
"grad_norm": 0.96484375,
"learning_rate": 0.0002201246162926437,
"loss": 2.6643,
"step": 1232
},
{
"epoch": 0.3519424133155828,
"grad_norm": 0.875,
"learning_rate": 0.00022000446857484035,
"loss": 2.6523,
"step": 1233
},
{
"epoch": 0.3522278491739085,
"grad_norm": 0.7578125,
"learning_rate": 0.0002198842634100439,
"loss": 2.6739,
"step": 1234
},
{
"epoch": 0.35251328503223417,
"grad_norm": 0.59765625,
"learning_rate": 0.00021976400089689712,
"loss": 2.6605,
"step": 1235
},
{
"epoch": 0.35279872089055986,
"grad_norm": 0.7109375,
"learning_rate": 0.00021964368113408959,
"loss": 2.6868,
"step": 1236
},
{
"epoch": 0.35308415674888555,
"grad_norm": 0.828125,
"learning_rate": 0.00021952330422035803,
"loss": 2.6759,
"step": 1237
},
{
"epoch": 0.3533695926072113,
"grad_norm": 0.96875,
"learning_rate": 0.0002194028702544861,
"loss": 2.6735,
"step": 1238
},
{
"epoch": 0.353655028465537,
"grad_norm": 0.97265625,
"learning_rate": 0.00021928237933530403,
"loss": 2.661,
"step": 1239
},
{
"epoch": 0.35394046432386267,
"grad_norm": 1.0546875,
"learning_rate": 0.00021916183156168908,
"loss": 2.6457,
"step": 1240
},
{
"epoch": 0.35422590018218836,
"grad_norm": 0.9453125,
"learning_rate": 0.00021904122703256498,
"loss": 2.6761,
"step": 1241
},
{
"epoch": 0.35451133604051405,
"grad_norm": 0.80859375,
"learning_rate": 0.00021892056584690213,
"loss": 2.6441,
"step": 1242
},
{
"epoch": 0.35479677189883974,
"grad_norm": 0.94921875,
"learning_rate": 0.00021879984810371734,
"loss": 2.6453,
"step": 1243
},
{
"epoch": 0.3550822077571654,
"grad_norm": 1.015625,
"learning_rate": 0.00021867907390207394,
"loss": 2.6208,
"step": 1244
},
{
"epoch": 0.3553676436154911,
"grad_norm": 0.98046875,
"learning_rate": 0.00021855824334108143,
"loss": 2.6572,
"step": 1245
},
{
"epoch": 0.35565307947381686,
"grad_norm": 0.85546875,
"learning_rate": 0.00021843735651989575,
"loss": 2.6826,
"step": 1246
},
{
"epoch": 0.35593851533214255,
"grad_norm": 0.8125,
"learning_rate": 0.00021831641353771885,
"loss": 2.6611,
"step": 1247
},
{
"epoch": 0.35622395119046824,
"grad_norm": 0.83203125,
"learning_rate": 0.00021819541449379892,
"loss": 2.6597,
"step": 1248
},
{
"epoch": 0.3565093870487939,
"grad_norm": 0.99609375,
"learning_rate": 0.00021807435948742994,
"loss": 2.635,
"step": 1249
},
{
"epoch": 0.3567948229071196,
"grad_norm": 0.9296875,
"learning_rate": 0.00021795324861795208,
"loss": 2.6526,
"step": 1250
},
{
"epoch": 0.3567948229071196,
"eval_loss": 2.5330393314361572,
"eval_runtime": 5928.9133,
"eval_samples_per_second": 10.843,
"eval_steps_per_second": 10.843,
"step": 1250
},
{
"epoch": 0.3570802587654453,
"grad_norm": 0.84375,
"learning_rate": 0.00021783208198475107,
"loss": 2.6512,
"step": 1251
},
{
"epoch": 0.357365694623771,
"grad_norm": 0.7890625,
"learning_rate": 0.00021771085968725864,
"loss": 2.6381,
"step": 1252
},
{
"epoch": 0.3576511304820967,
"grad_norm": 0.7265625,
"learning_rate": 0.00021758958182495214,
"loss": 2.6498,
"step": 1253
},
{
"epoch": 0.3579365663404224,
"grad_norm": 1.171875,
"learning_rate": 0.00021746824849735435,
"loss": 2.6614,
"step": 1254
},
{
"epoch": 0.3582220021987481,
"grad_norm": 0.72265625,
"learning_rate": 0.00021734685980403376,
"loss": 2.6483,
"step": 1255
},
{
"epoch": 0.3585074380570738,
"grad_norm": 0.89453125,
"learning_rate": 0.0002172254158446043,
"loss": 2.6365,
"step": 1256
},
{
"epoch": 0.3587928739153995,
"grad_norm": 0.86328125,
"learning_rate": 0.00021710391671872514,
"loss": 2.6484,
"step": 1257
},
{
"epoch": 0.3590783097737252,
"grad_norm": 0.8984375,
"learning_rate": 0.00021698236252610072,
"loss": 2.6372,
"step": 1258
},
{
"epoch": 0.35936374563205087,
"grad_norm": 0.80859375,
"learning_rate": 0.00021686075336648075,
"loss": 2.6554,
"step": 1259
},
{
"epoch": 0.35964918149037656,
"grad_norm": 0.8359375,
"learning_rate": 0.00021673908933965996,
"loss": 2.6511,
"step": 1260
},
{
"epoch": 0.35993461734870225,
"grad_norm": 0.81640625,
"learning_rate": 0.00021661737054547826,
"loss": 2.6473,
"step": 1261
},
{
"epoch": 0.360220053207028,
"grad_norm": 0.7890625,
"learning_rate": 0.00021649559708382027,
"loss": 2.6396,
"step": 1262
},
{
"epoch": 0.3605054890653537,
"grad_norm": 0.87890625,
"learning_rate": 0.0002163737690546157,
"loss": 2.6517,
"step": 1263
},
{
"epoch": 0.36079092492367937,
"grad_norm": 0.9765625,
"learning_rate": 0.00021625188655783893,
"loss": 2.6126,
"step": 1264
},
{
"epoch": 0.36107636078200506,
"grad_norm": 0.89453125,
"learning_rate": 0.000216129949693509,
"loss": 2.6551,
"step": 1265
},
{
"epoch": 0.36136179664033075,
"grad_norm": 0.8671875,
"learning_rate": 0.0002160079585616896,
"loss": 2.6316,
"step": 1266
},
{
"epoch": 0.36164723249865643,
"grad_norm": 0.89453125,
"learning_rate": 0.000215885913262489,
"loss": 2.6376,
"step": 1267
},
{
"epoch": 0.3619326683569821,
"grad_norm": 0.78125,
"learning_rate": 0.00021576381389605992,
"loss": 2.6378,
"step": 1268
},
{
"epoch": 0.3622181042153078,
"grad_norm": 0.78515625,
"learning_rate": 0.00021564166056259936,
"loss": 2.6742,
"step": 1269
},
{
"epoch": 0.36250354007363356,
"grad_norm": 0.9453125,
"learning_rate": 0.00021551945336234867,
"loss": 2.6676,
"step": 1270
},
{
"epoch": 0.36278897593195925,
"grad_norm": 0.7578125,
"learning_rate": 0.00021539719239559336,
"loss": 2.6604,
"step": 1271
},
{
"epoch": 0.36307441179028493,
"grad_norm": 0.734375,
"learning_rate": 0.00021527487776266317,
"loss": 2.6459,
"step": 1272
},
{
"epoch": 0.3633598476486106,
"grad_norm": 0.74609375,
"learning_rate": 0.0002151525095639318,
"loss": 2.6323,
"step": 1273
},
{
"epoch": 0.3636452835069363,
"grad_norm": 0.80859375,
"learning_rate": 0.0002150300878998168,
"loss": 2.6476,
"step": 1274
},
{
"epoch": 0.363930719365262,
"grad_norm": 0.73046875,
"learning_rate": 0.0002149076128707798,
"loss": 2.6378,
"step": 1275
},
{
"epoch": 0.3642161552235877,
"grad_norm": 0.73828125,
"learning_rate": 0.00021478508457732615,
"loss": 2.654,
"step": 1276
},
{
"epoch": 0.3645015910819134,
"grad_norm": 0.62109375,
"learning_rate": 0.00021466250312000482,
"loss": 2.6398,
"step": 1277
},
{
"epoch": 0.3647870269402391,
"grad_norm": 0.74609375,
"learning_rate": 0.00021453986859940852,
"loss": 2.6306,
"step": 1278
},
{
"epoch": 0.3650724627985648,
"grad_norm": 0.84765625,
"learning_rate": 0.00021441718111617344,
"loss": 2.6299,
"step": 1279
},
{
"epoch": 0.3653578986568905,
"grad_norm": 0.8125,
"learning_rate": 0.00021429444077097928,
"loss": 2.6466,
"step": 1280
},
{
"epoch": 0.3656433345152162,
"grad_norm": 0.75390625,
"learning_rate": 0.00021417164766454903,
"loss": 2.6788,
"step": 1281
},
{
"epoch": 0.3659287703735419,
"grad_norm": 0.61328125,
"learning_rate": 0.00021404880189764913,
"loss": 2.6416,
"step": 1282
},
{
"epoch": 0.36621420623186757,
"grad_norm": 0.63671875,
"learning_rate": 0.00021392590357108905,
"loss": 2.6469,
"step": 1283
},
{
"epoch": 0.36649964209019326,
"grad_norm": 0.65625,
"learning_rate": 0.00021380295278572155,
"loss": 2.6422,
"step": 1284
},
{
"epoch": 0.36678507794851894,
"grad_norm": 0.63671875,
"learning_rate": 0.00021367994964244236,
"loss": 2.6202,
"step": 1285
},
{
"epoch": 0.3670705138068447,
"grad_norm": 0.640625,
"learning_rate": 0.00021355689424219023,
"loss": 2.6281,
"step": 1286
},
{
"epoch": 0.3673559496651704,
"grad_norm": 0.60546875,
"learning_rate": 0.00021343378668594662,
"loss": 2.6181,
"step": 1287
},
{
"epoch": 0.36764138552349607,
"grad_norm": 0.62890625,
"learning_rate": 0.00021331062707473605,
"loss": 2.6632,
"step": 1288
},
{
"epoch": 0.36792682138182176,
"grad_norm": 0.59765625,
"learning_rate": 0.00021318741550962556,
"loss": 2.6296,
"step": 1289
},
{
"epoch": 0.36821225724014744,
"grad_norm": 0.58984375,
"learning_rate": 0.00021306415209172502,
"loss": 2.654,
"step": 1290
},
{
"epoch": 0.36849769309847313,
"grad_norm": 0.54296875,
"learning_rate": 0.00021294083692218653,
"loss": 2.6375,
"step": 1291
},
{
"epoch": 0.3687831289567988,
"grad_norm": 0.61328125,
"learning_rate": 0.00021281747010220496,
"loss": 2.6488,
"step": 1292
},
{
"epoch": 0.3690685648151245,
"grad_norm": 0.62109375,
"learning_rate": 0.0002126940517330175,
"loss": 2.6565,
"step": 1293
},
{
"epoch": 0.36935400067345026,
"grad_norm": 0.58984375,
"learning_rate": 0.00021257058191590354,
"loss": 2.6622,
"step": 1294
},
{
"epoch": 0.36963943653177594,
"grad_norm": 0.59765625,
"learning_rate": 0.00021244706075218472,
"loss": 2.6498,
"step": 1295
},
{
"epoch": 0.36992487239010163,
"grad_norm": 0.72265625,
"learning_rate": 0.00021232348834322495,
"loss": 2.6525,
"step": 1296
},
{
"epoch": 0.3702103082484273,
"grad_norm": 0.75,
"learning_rate": 0.00021219986479043001,
"loss": 2.6365,
"step": 1297
},
{
"epoch": 0.370495744106753,
"grad_norm": 0.6484375,
"learning_rate": 0.00021207619019524777,
"loss": 2.6502,
"step": 1298
},
{
"epoch": 0.3707811799650787,
"grad_norm": 0.5859375,
"learning_rate": 0.00021195246465916792,
"loss": 2.6183,
"step": 1299
},
{
"epoch": 0.3710666158234044,
"grad_norm": 0.52734375,
"learning_rate": 0.00021182868828372196,
"loss": 2.6646,
"step": 1300
},
{
"epoch": 0.3713520516817301,
"grad_norm": 0.6484375,
"learning_rate": 0.00021170486117048315,
"loss": 2.6203,
"step": 1301
},
{
"epoch": 0.37163748754005577,
"grad_norm": 0.62109375,
"learning_rate": 0.0002115809834210664,
"loss": 2.625,
"step": 1302
},
{
"epoch": 0.3719229233983815,
"grad_norm": 0.6171875,
"learning_rate": 0.0002114570551371281,
"loss": 2.671,
"step": 1303
},
{
"epoch": 0.3722083592567072,
"grad_norm": 0.54296875,
"learning_rate": 0.00021133307642036615,
"loss": 2.6239,
"step": 1304
},
{
"epoch": 0.3724937951150329,
"grad_norm": 0.64453125,
"learning_rate": 0.0002112090473725198,
"loss": 2.643,
"step": 1305
},
{
"epoch": 0.3727792309733586,
"grad_norm": 0.5390625,
"learning_rate": 0.00021108496809536974,
"loss": 2.627,
"step": 1306
},
{
"epoch": 0.37306466683168427,
"grad_norm": 0.56640625,
"learning_rate": 0.00021096083869073765,
"loss": 2.6038,
"step": 1307
},
{
"epoch": 0.37335010269000996,
"grad_norm": 0.640625,
"learning_rate": 0.0002108366592604866,
"loss": 2.6223,
"step": 1308
},
{
"epoch": 0.37363553854833564,
"grad_norm": 0.7109375,
"learning_rate": 0.00021071242990652043,
"loss": 2.6492,
"step": 1309
},
{
"epoch": 0.37392097440666133,
"grad_norm": 0.625,
"learning_rate": 0.00021058815073078422,
"loss": 2.6534,
"step": 1310
},
{
"epoch": 0.3742064102649871,
"grad_norm": 0.58984375,
"learning_rate": 0.00021046382183526378,
"loss": 2.6197,
"step": 1311
},
{
"epoch": 0.37449184612331277,
"grad_norm": 0.7265625,
"learning_rate": 0.0002103394433219858,
"loss": 2.632,
"step": 1312
},
{
"epoch": 0.37477728198163845,
"grad_norm": 0.59375,
"learning_rate": 0.00021021501529301756,
"loss": 2.639,
"step": 1313
},
{
"epoch": 0.37506271783996414,
"grad_norm": 0.63671875,
"learning_rate": 0.00021009053785046706,
"loss": 2.6138,
"step": 1314
},
{
"epoch": 0.37534815369828983,
"grad_norm": 0.61328125,
"learning_rate": 0.0002099660110964829,
"loss": 2.647,
"step": 1315
},
{
"epoch": 0.3756335895566155,
"grad_norm": 0.60546875,
"learning_rate": 0.00020984143513325416,
"loss": 2.6299,
"step": 1316
},
{
"epoch": 0.3759190254149412,
"grad_norm": 0.55078125,
"learning_rate": 0.0002097168100630101,
"loss": 2.6422,
"step": 1317
},
{
"epoch": 0.3762044612732669,
"grad_norm": 0.62109375,
"learning_rate": 0.0002095921359880204,
"loss": 2.6092,
"step": 1318
},
{
"epoch": 0.37648989713159264,
"grad_norm": 0.609375,
"learning_rate": 0.00020946741301059514,
"loss": 2.6118,
"step": 1319
},
{
"epoch": 0.37677533298991833,
"grad_norm": 0.64453125,
"learning_rate": 0.0002093426412330842,
"loss": 2.6348,
"step": 1320
},
{
"epoch": 0.377060768848244,
"grad_norm": 0.671875,
"learning_rate": 0.00020921782075787777,
"loss": 2.6552,
"step": 1321
},
{
"epoch": 0.3773462047065697,
"grad_norm": 0.58203125,
"learning_rate": 0.00020909295168740577,
"loss": 2.6427,
"step": 1322
},
{
"epoch": 0.3776316405648954,
"grad_norm": 0.5625,
"learning_rate": 0.00020896803412413824,
"loss": 2.626,
"step": 1323
},
{
"epoch": 0.3779170764232211,
"grad_norm": 0.59375,
"learning_rate": 0.00020884306817058482,
"loss": 2.6509,
"step": 1324
},
{
"epoch": 0.3782025122815468,
"grad_norm": 0.58984375,
"learning_rate": 0.00020871805392929502,
"loss": 2.6215,
"step": 1325
},
{
"epoch": 0.37848794813987247,
"grad_norm": 0.55859375,
"learning_rate": 0.00020859299150285786,
"loss": 2.6605,
"step": 1326
},
{
"epoch": 0.3787733839981982,
"grad_norm": 0.625,
"learning_rate": 0.00020846788099390188,
"loss": 2.6488,
"step": 1327
},
{
"epoch": 0.3790588198565239,
"grad_norm": 0.5703125,
"learning_rate": 0.00020834272250509523,
"loss": 2.6607,
"step": 1328
},
{
"epoch": 0.3793442557148496,
"grad_norm": 0.6015625,
"learning_rate": 0.00020821751613914525,
"loss": 2.6426,
"step": 1329
},
{
"epoch": 0.3796296915731753,
"grad_norm": 0.57421875,
"learning_rate": 0.0002080922619987987,
"loss": 2.6458,
"step": 1330
},
{
"epoch": 0.37991512743150097,
"grad_norm": 0.57421875,
"learning_rate": 0.00020796696018684152,
"loss": 2.6278,
"step": 1331
},
{
"epoch": 0.38020056328982665,
"grad_norm": 0.54296875,
"learning_rate": 0.00020784161080609868,
"loss": 2.6603,
"step": 1332
},
{
"epoch": 0.38048599914815234,
"grad_norm": 0.6015625,
"learning_rate": 0.00020771621395943436,
"loss": 2.6395,
"step": 1333
},
{
"epoch": 0.38077143500647803,
"grad_norm": 0.55859375,
"learning_rate": 0.00020759076974975144,
"loss": 2.6346,
"step": 1334
},
{
"epoch": 0.3810568708648038,
"grad_norm": 0.58203125,
"learning_rate": 0.00020746527827999195,
"loss": 2.6412,
"step": 1335
},
{
"epoch": 0.38134230672312946,
"grad_norm": 0.6015625,
"learning_rate": 0.00020733973965313655,
"loss": 2.6311,
"step": 1336
},
{
"epoch": 0.38162774258145515,
"grad_norm": 0.57421875,
"learning_rate": 0.0002072141539722046,
"loss": 2.6174,
"step": 1337
},
{
"epoch": 0.38191317843978084,
"grad_norm": 0.5625,
"learning_rate": 0.00020708852134025397,
"loss": 2.6192,
"step": 1338
},
{
"epoch": 0.38219861429810653,
"grad_norm": 0.5703125,
"learning_rate": 0.0002069628418603814,
"loss": 2.6467,
"step": 1339
},
{
"epoch": 0.3824840501564322,
"grad_norm": 0.57421875,
"learning_rate": 0.00020683711563572167,
"loss": 2.6369,
"step": 1340
},
{
"epoch": 0.3827694860147579,
"grad_norm": 0.55078125,
"learning_rate": 0.00020671134276944815,
"loss": 2.6372,
"step": 1341
},
{
"epoch": 0.3830549218730836,
"grad_norm": 0.53515625,
"learning_rate": 0.0002065855233647725,
"loss": 2.6436,
"step": 1342
},
{
"epoch": 0.38334035773140934,
"grad_norm": 0.58203125,
"learning_rate": 0.00020645965752494444,
"loss": 2.6342,
"step": 1343
},
{
"epoch": 0.38362579358973503,
"grad_norm": 0.60546875,
"learning_rate": 0.0002063337453532519,
"loss": 2.637,
"step": 1344
},
{
"epoch": 0.3839112294480607,
"grad_norm": 0.58203125,
"learning_rate": 0.0002062077869530207,
"loss": 2.6444,
"step": 1345
},
{
"epoch": 0.3841966653063864,
"grad_norm": 0.54296875,
"learning_rate": 0.00020608178242761483,
"loss": 2.6339,
"step": 1346
},
{
"epoch": 0.3844821011647121,
"grad_norm": 0.5703125,
"learning_rate": 0.00020595573188043594,
"loss": 2.6422,
"step": 1347
},
{
"epoch": 0.3847675370230378,
"grad_norm": 0.6484375,
"learning_rate": 0.00020582963541492343,
"loss": 2.6472,
"step": 1348
},
{
"epoch": 0.3850529728813635,
"grad_norm": 0.65625,
"learning_rate": 0.00020570349313455452,
"loss": 2.6081,
"step": 1349
},
{
"epoch": 0.38533840873968916,
"grad_norm": 0.55078125,
"learning_rate": 0.00020557730514284396,
"loss": 2.6214,
"step": 1350
},
{
"epoch": 0.3856238445980149,
"grad_norm": 0.53515625,
"learning_rate": 0.00020545107154334397,
"loss": 2.6263,
"step": 1351
},
{
"epoch": 0.3859092804563406,
"grad_norm": 0.6328125,
"learning_rate": 0.0002053247924396442,
"loss": 2.6092,
"step": 1352
},
{
"epoch": 0.3861947163146663,
"grad_norm": 0.5625,
"learning_rate": 0.0002051984679353718,
"loss": 2.6329,
"step": 1353
},
{
"epoch": 0.386480152172992,
"grad_norm": 0.5546875,
"learning_rate": 0.0002050720981341909,
"loss": 2.6087,
"step": 1354
},
{
"epoch": 0.38676558803131766,
"grad_norm": 0.70703125,
"learning_rate": 0.00020494568313980305,
"loss": 2.6249,
"step": 1355
},
{
"epoch": 0.38705102388964335,
"grad_norm": 0.7578125,
"learning_rate": 0.00020481922305594678,
"loss": 2.6385,
"step": 1356
},
{
"epoch": 0.38733645974796904,
"grad_norm": 0.70703125,
"learning_rate": 0.0002046927179863976,
"loss": 2.632,
"step": 1357
},
{
"epoch": 0.38762189560629473,
"grad_norm": 0.54296875,
"learning_rate": 0.00020456616803496796,
"loss": 2.642,
"step": 1358
},
{
"epoch": 0.3879073314646205,
"grad_norm": 0.7109375,
"learning_rate": 0.00020443957330550718,
"loss": 2.6268,
"step": 1359
},
{
"epoch": 0.38819276732294616,
"grad_norm": 0.6640625,
"learning_rate": 0.0002043129339019013,
"loss": 2.6379,
"step": 1360
},
{
"epoch": 0.38847820318127185,
"grad_norm": 0.51953125,
"learning_rate": 0.00020418624992807295,
"loss": 2.6577,
"step": 1361
},
{
"epoch": 0.38876363903959754,
"grad_norm": 0.67578125,
"learning_rate": 0.00020405952148798144,
"loss": 2.6331,
"step": 1362
},
{
"epoch": 0.38904907489792323,
"grad_norm": 0.55078125,
"learning_rate": 0.00020393274868562254,
"loss": 2.6376,
"step": 1363
},
{
"epoch": 0.3893345107562489,
"grad_norm": 0.609375,
"learning_rate": 0.00020380593162502844,
"loss": 2.6041,
"step": 1364
},
{
"epoch": 0.3896199466145746,
"grad_norm": 0.6796875,
"learning_rate": 0.00020367907041026755,
"loss": 2.6439,
"step": 1365
},
{
"epoch": 0.3899053824729003,
"grad_norm": 0.5625,
"learning_rate": 0.00020355216514544462,
"loss": 2.6405,
"step": 1366
},
{
"epoch": 0.39019081833122604,
"grad_norm": 0.56640625,
"learning_rate": 0.0002034252159347005,
"loss": 2.6451,
"step": 1367
},
{
"epoch": 0.39047625418955173,
"grad_norm": 0.60546875,
"learning_rate": 0.00020329822288221218,
"loss": 2.637,
"step": 1368
},
{
"epoch": 0.3907616900478774,
"grad_norm": 0.578125,
"learning_rate": 0.00020317118609219253,
"loss": 2.5896,
"step": 1369
},
{
"epoch": 0.3910471259062031,
"grad_norm": 0.59375,
"learning_rate": 0.00020304410566889027,
"loss": 2.641,
"step": 1370
},
{
"epoch": 0.3913325617645288,
"grad_norm": 0.62890625,
"learning_rate": 0.0002029169817165901,
"loss": 2.6245,
"step": 1371
},
{
"epoch": 0.3916179976228545,
"grad_norm": 0.56640625,
"learning_rate": 0.0002027898143396123,
"loss": 2.6347,
"step": 1372
},
{
"epoch": 0.3919034334811802,
"grad_norm": 0.56640625,
"learning_rate": 0.00020266260364231286,
"loss": 2.6158,
"step": 1373
},
{
"epoch": 0.39218886933950586,
"grad_norm": 0.62890625,
"learning_rate": 0.00020253534972908326,
"loss": 2.6349,
"step": 1374
},
{
"epoch": 0.39247430519783155,
"grad_norm": 0.7421875,
"learning_rate": 0.00020240805270435044,
"loss": 2.6329,
"step": 1375
},
{
"epoch": 0.3927597410561573,
"grad_norm": 0.6875,
"learning_rate": 0.00020228071267257687,
"loss": 2.6633,
"step": 1376
},
{
"epoch": 0.393045176914483,
"grad_norm": 0.703125,
"learning_rate": 0.00020215332973826003,
"loss": 2.6117,
"step": 1377
},
{
"epoch": 0.3933306127728087,
"grad_norm": 0.91796875,
"learning_rate": 0.00020202590400593285,
"loss": 2.6286,
"step": 1378
},
{
"epoch": 0.39361604863113436,
"grad_norm": 0.87109375,
"learning_rate": 0.00020189843558016338,
"loss": 2.6105,
"step": 1379
},
{
"epoch": 0.39390148448946005,
"grad_norm": 0.98828125,
"learning_rate": 0.0002017709245655545,
"loss": 2.6128,
"step": 1380
},
{
"epoch": 0.39418692034778574,
"grad_norm": 0.82421875,
"learning_rate": 0.00020164337106674417,
"loss": 2.6243,
"step": 1381
},
{
"epoch": 0.39447235620611143,
"grad_norm": 0.69140625,
"learning_rate": 0.0002015157751884053,
"loss": 2.6557,
"step": 1382
},
{
"epoch": 0.3947577920644371,
"grad_norm": 0.8984375,
"learning_rate": 0.0002013881370352454,
"loss": 2.624,
"step": 1383
},
{
"epoch": 0.39504322792276286,
"grad_norm": 0.76171875,
"learning_rate": 0.00020126045671200682,
"loss": 2.6279,
"step": 1384
},
{
"epoch": 0.39532866378108855,
"grad_norm": 0.5859375,
"learning_rate": 0.00020113273432346632,
"loss": 2.6363,
"step": 1385
},
{
"epoch": 0.39561409963941424,
"grad_norm": 0.79296875,
"learning_rate": 0.00020100496997443553,
"loss": 2.6274,
"step": 1386
},
{
"epoch": 0.39589953549773993,
"grad_norm": 0.6484375,
"learning_rate": 0.00020087716376976014,
"loss": 2.6191,
"step": 1387
},
{
"epoch": 0.3961849713560656,
"grad_norm": 0.640625,
"learning_rate": 0.00020074931581432035,
"loss": 2.6355,
"step": 1388
},
{
"epoch": 0.3964704072143913,
"grad_norm": 0.6875,
"learning_rate": 0.0002006214262130307,
"loss": 2.6386,
"step": 1389
},
{
"epoch": 0.396755843072717,
"grad_norm": 0.6171875,
"learning_rate": 0.0002004934950708397,
"loss": 2.6345,
"step": 1390
},
{
"epoch": 0.3970412789310427,
"grad_norm": 0.55078125,
"learning_rate": 0.00020036552249273014,
"loss": 2.6081,
"step": 1391
},
{
"epoch": 0.39732671478936843,
"grad_norm": 0.65625,
"learning_rate": 0.00020023750858371876,
"loss": 2.6243,
"step": 1392
},
{
"epoch": 0.3976121506476941,
"grad_norm": 0.55078125,
"learning_rate": 0.00020010945344885615,
"loss": 2.6405,
"step": 1393
},
{
"epoch": 0.3978975865060198,
"grad_norm": 0.61328125,
"learning_rate": 0.0001999813571932268,
"loss": 2.5995,
"step": 1394
},
{
"epoch": 0.3981830223643455,
"grad_norm": 0.5859375,
"learning_rate": 0.00019985321992194892,
"loss": 2.6225,
"step": 1395
},
{
"epoch": 0.3984684582226712,
"grad_norm": 0.57421875,
"learning_rate": 0.00019972504174017446,
"loss": 2.6077,
"step": 1396
},
{
"epoch": 0.3987538940809969,
"grad_norm": 0.59765625,
"learning_rate": 0.00019959682275308869,
"loss": 2.6165,
"step": 1397
},
{
"epoch": 0.39903932993932256,
"grad_norm": 0.66796875,
"learning_rate": 0.0001994685630659107,
"loss": 2.601,
"step": 1398
},
{
"epoch": 0.39932476579764825,
"grad_norm": 0.6171875,
"learning_rate": 0.00019934026278389274,
"loss": 2.6332,
"step": 1399
},
{
"epoch": 0.399610201655974,
"grad_norm": 0.6015625,
"learning_rate": 0.00019921192201232047,
"loss": 2.6224,
"step": 1400
},
{
"epoch": 0.3998956375142997,
"grad_norm": 0.58203125,
"learning_rate": 0.0001990835408565127,
"loss": 2.5961,
"step": 1401
},
{
"epoch": 0.4001810733726254,
"grad_norm": 0.56640625,
"learning_rate": 0.0001989551194218216,
"loss": 2.6291,
"step": 1402
},
{
"epoch": 0.40046650923095106,
"grad_norm": 0.56640625,
"learning_rate": 0.00019882665781363208,
"loss": 2.6164,
"step": 1403
},
{
"epoch": 0.40075194508927675,
"grad_norm": 0.6015625,
"learning_rate": 0.00019869815613736224,
"loss": 2.6452,
"step": 1404
},
{
"epoch": 0.40103738094760244,
"grad_norm": 0.6015625,
"learning_rate": 0.00019856961449846294,
"loss": 2.6502,
"step": 1405
},
{
"epoch": 0.40132281680592813,
"grad_norm": 0.62109375,
"learning_rate": 0.0001984410330024179,
"loss": 2.6174,
"step": 1406
},
{
"epoch": 0.4016082526642538,
"grad_norm": 0.56640625,
"learning_rate": 0.0001983124117547436,
"loss": 2.5982,
"step": 1407
},
{
"epoch": 0.40189368852257956,
"grad_norm": 0.5625,
"learning_rate": 0.00019818375086098897,
"loss": 2.5949,
"step": 1408
},
{
"epoch": 0.40217912438090525,
"grad_norm": 1.015625,
"learning_rate": 0.00019805505042673564,
"loss": 2.6337,
"step": 1409
},
{
"epoch": 0.40246456023923094,
"grad_norm": 0.64453125,
"learning_rate": 0.00019792631055759764,
"loss": 2.6204,
"step": 1410
},
{
"epoch": 0.40274999609755663,
"grad_norm": 0.61328125,
"learning_rate": 0.00019779753135922126,
"loss": 2.6416,
"step": 1411
},
{
"epoch": 0.4030354319558823,
"grad_norm": 0.60546875,
"learning_rate": 0.00019766871293728524,
"loss": 2.6037,
"step": 1412
},
{
"epoch": 0.403320867814208,
"grad_norm": 0.63671875,
"learning_rate": 0.00019753985539750036,
"loss": 2.6191,
"step": 1413
},
{
"epoch": 0.4036063036725337,
"grad_norm": 0.578125,
"learning_rate": 0.00019741095884560957,
"loss": 2.6103,
"step": 1414
},
{
"epoch": 0.4038917395308594,
"grad_norm": 0.57421875,
"learning_rate": 0.00019728202338738785,
"loss": 2.6346,
"step": 1415
},
{
"epoch": 0.40417717538918513,
"grad_norm": 0.6796875,
"learning_rate": 0.0001971530491286421,
"loss": 2.6142,
"step": 1416
},
{
"epoch": 0.4044626112475108,
"grad_norm": 0.51953125,
"learning_rate": 0.00019702403617521093,
"loss": 2.612,
"step": 1417
},
{
"epoch": 0.4047480471058365,
"grad_norm": 0.625,
"learning_rate": 0.00019689498463296487,
"loss": 2.6237,
"step": 1418
},
{
"epoch": 0.4050334829641622,
"grad_norm": 0.5625,
"learning_rate": 0.00019676589460780616,
"loss": 2.6104,
"step": 1419
},
{
"epoch": 0.4053189188224879,
"grad_norm": 0.5703125,
"learning_rate": 0.00019663676620566836,
"loss": 2.6246,
"step": 1420
},
{
"epoch": 0.4056043546808136,
"grad_norm": 0.6328125,
"learning_rate": 0.00019650759953251677,
"loss": 2.6212,
"step": 1421
},
{
"epoch": 0.40588979053913926,
"grad_norm": 0.578125,
"learning_rate": 0.00019637839469434804,
"loss": 2.6268,
"step": 1422
},
{
"epoch": 0.40617522639746495,
"grad_norm": 0.578125,
"learning_rate": 0.00019624915179719004,
"loss": 2.6045,
"step": 1423
},
{
"epoch": 0.4064606622557907,
"grad_norm": 1.125,
"learning_rate": 0.00019611987094710192,
"loss": 2.5961,
"step": 1424
},
{
"epoch": 0.4067460981141164,
"grad_norm": 0.5625,
"learning_rate": 0.00019599055225017408,
"loss": 2.5987,
"step": 1425
},
{
"epoch": 0.40703153397244207,
"grad_norm": 0.6328125,
"learning_rate": 0.00019586119581252781,
"loss": 2.6394,
"step": 1426
},
{
"epoch": 0.40731696983076776,
"grad_norm": 0.5859375,
"learning_rate": 0.00019573180174031556,
"loss": 2.5998,
"step": 1427
},
{
"epoch": 0.40760240568909345,
"grad_norm": 0.6015625,
"learning_rate": 0.00019560237013972046,
"loss": 2.6149,
"step": 1428
},
{
"epoch": 0.40788784154741914,
"grad_norm": 0.5859375,
"learning_rate": 0.0001954729011169565,
"loss": 2.6389,
"step": 1429
},
{
"epoch": 0.4081732774057448,
"grad_norm": 0.59375,
"learning_rate": 0.00019534339477826854,
"loss": 2.6498,
"step": 1430
},
{
"epoch": 0.4084587132640705,
"grad_norm": 0.60546875,
"learning_rate": 0.00019521385122993185,
"loss": 2.6256,
"step": 1431
},
{
"epoch": 0.40874414912239626,
"grad_norm": 0.66015625,
"learning_rate": 0.00019508427057825237,
"loss": 2.614,
"step": 1432
},
{
"epoch": 0.40902958498072195,
"grad_norm": 0.58203125,
"learning_rate": 0.0001949546529295664,
"loss": 2.5803,
"step": 1433
},
{
"epoch": 0.40931502083904764,
"grad_norm": 0.625,
"learning_rate": 0.00019482499839024062,
"loss": 2.6267,
"step": 1434
},
{
"epoch": 0.4096004566973733,
"grad_norm": 0.58203125,
"learning_rate": 0.00019469530706667205,
"loss": 2.627,
"step": 1435
},
{
"epoch": 0.409885892555699,
"grad_norm": 0.5859375,
"learning_rate": 0.0001945655790652878,
"loss": 2.6262,
"step": 1436
},
{
"epoch": 0.4101713284140247,
"grad_norm": 0.62890625,
"learning_rate": 0.00019443581449254515,
"loss": 2.6189,
"step": 1437
},
{
"epoch": 0.4104567642723504,
"grad_norm": 0.55078125,
"learning_rate": 0.00019430601345493136,
"loss": 2.6023,
"step": 1438
},
{
"epoch": 0.4107422001306761,
"grad_norm": 0.58203125,
"learning_rate": 0.0001941761760589637,
"loss": 2.6085,
"step": 1439
},
{
"epoch": 0.4110276359890018,
"grad_norm": 0.5625,
"learning_rate": 0.00019404630241118902,
"loss": 2.6117,
"step": 1440
},
{
"epoch": 0.4113130718473275,
"grad_norm": 0.58203125,
"learning_rate": 0.00019391639261818428,
"loss": 2.6289,
"step": 1441
},
{
"epoch": 0.4115985077056532,
"grad_norm": 0.55859375,
"learning_rate": 0.00019378644678655582,
"loss": 2.6221,
"step": 1442
},
{
"epoch": 0.4118839435639789,
"grad_norm": 0.5546875,
"learning_rate": 0.00019365646502293962,
"loss": 2.6028,
"step": 1443
},
{
"epoch": 0.4121693794223046,
"grad_norm": 0.5546875,
"learning_rate": 0.00019352644743400124,
"loss": 2.599,
"step": 1444
},
{
"epoch": 0.41245481528063027,
"grad_norm": 0.68359375,
"learning_rate": 0.0001933963941264356,
"loss": 2.6002,
"step": 1445
},
{
"epoch": 0.41274025113895596,
"grad_norm": 0.53125,
"learning_rate": 0.0001932663052069668,
"loss": 2.6078,
"step": 1446
},
{
"epoch": 0.41302568699728165,
"grad_norm": 0.62109375,
"learning_rate": 0.00019313618078234843,
"loss": 2.6375,
"step": 1447
},
{
"epoch": 0.41331112285560734,
"grad_norm": 0.625,
"learning_rate": 0.00019300602095936287,
"loss": 2.6145,
"step": 1448
},
{
"epoch": 0.4135965587139331,
"grad_norm": 1.578125,
"learning_rate": 0.00019287582584482193,
"loss": 2.6075,
"step": 1449
},
{
"epoch": 0.41388199457225877,
"grad_norm": 1.3671875,
"learning_rate": 0.00019274559554556604,
"loss": 2.5988,
"step": 1450
},
{
"epoch": 0.41416743043058446,
"grad_norm": 1.0,
"learning_rate": 0.00019261533016846468,
"loss": 2.6142,
"step": 1451
},
{
"epoch": 0.41445286628891015,
"grad_norm": 0.65234375,
"learning_rate": 0.00019248502982041613,
"loss": 2.5849,
"step": 1452
},
{
"epoch": 0.41473830214723584,
"grad_norm": 0.703125,
"learning_rate": 0.00019235469460834732,
"loss": 2.6181,
"step": 1453
},
{
"epoch": 0.4150237380055615,
"grad_norm": 0.61328125,
"learning_rate": 0.00019222432463921374,
"loss": 2.5999,
"step": 1454
},
{
"epoch": 0.4153091738638872,
"grad_norm": 0.75,
"learning_rate": 0.0001920939200199995,
"loss": 2.6166,
"step": 1455
},
{
"epoch": 0.4155946097222129,
"grad_norm": 0.78515625,
"learning_rate": 0.00019196348085771713,
"loss": 2.6053,
"step": 1456
},
{
"epoch": 0.41588004558053865,
"grad_norm": 0.69140625,
"learning_rate": 0.0001918330072594074,
"loss": 2.6113,
"step": 1457
},
{
"epoch": 0.41616548143886434,
"grad_norm": 0.6484375,
"learning_rate": 0.00019170249933213947,
"loss": 2.6028,
"step": 1458
},
{
"epoch": 0.41645091729719,
"grad_norm": 0.765625,
"learning_rate": 0.00019157195718301067,
"loss": 2.6048,
"step": 1459
},
{
"epoch": 0.4167363531555157,
"grad_norm": 0.7421875,
"learning_rate": 0.00019144138091914617,
"loss": 2.6143,
"step": 1460
},
{
"epoch": 0.4170217890138414,
"grad_norm": 0.5859375,
"learning_rate": 0.00019131077064769953,
"loss": 2.6159,
"step": 1461
},
{
"epoch": 0.4173072248721671,
"grad_norm": 0.76171875,
"learning_rate": 0.00019118012647585192,
"loss": 2.5989,
"step": 1462
},
{
"epoch": 0.4175926607304928,
"grad_norm": 0.7421875,
"learning_rate": 0.00019104944851081244,
"loss": 2.6203,
"step": 1463
},
{
"epoch": 0.41787809658881847,
"grad_norm": 0.6015625,
"learning_rate": 0.00019091873685981786,
"loss": 2.596,
"step": 1464
},
{
"epoch": 0.4181635324471442,
"grad_norm": 0.875,
"learning_rate": 0.00019078799163013273,
"loss": 2.5961,
"step": 1465
},
{
"epoch": 0.4184489683054699,
"grad_norm": 0.8125,
"learning_rate": 0.000190657212929049,
"loss": 2.6254,
"step": 1466
},
{
"epoch": 0.4187344041637956,
"grad_norm": 0.57421875,
"learning_rate": 0.0001905264008638861,
"loss": 2.616,
"step": 1467
},
{
"epoch": 0.4190198400221213,
"grad_norm": 0.671875,
"learning_rate": 0.00019039555554199099,
"loss": 2.635,
"step": 1468
},
{
"epoch": 0.41930527588044697,
"grad_norm": 0.609375,
"learning_rate": 0.0001902646770707378,
"loss": 2.5834,
"step": 1469
},
{
"epoch": 0.41959071173877266,
"grad_norm": 0.58203125,
"learning_rate": 0.00019013376555752782,
"loss": 2.61,
"step": 1470
},
{
"epoch": 0.41987614759709835,
"grad_norm": 0.6015625,
"learning_rate": 0.00019000282110978958,
"loss": 2.6072,
"step": 1471
},
{
"epoch": 0.42016158345542404,
"grad_norm": 0.578125,
"learning_rate": 0.00018987184383497855,
"loss": 2.5803,
"step": 1472
},
{
"epoch": 0.4204470193137498,
"grad_norm": 0.5546875,
"learning_rate": 0.00018974083384057713,
"loss": 2.639,
"step": 1473
},
{
"epoch": 0.42073245517207547,
"grad_norm": 0.64453125,
"learning_rate": 0.00018960979123409466,
"loss": 2.5955,
"step": 1474
},
{
"epoch": 0.42101789103040116,
"grad_norm": 0.5234375,
"learning_rate": 0.0001894787161230672,
"loss": 2.6356,
"step": 1475
},
{
"epoch": 0.42130332688872685,
"grad_norm": 0.578125,
"learning_rate": 0.0001893476086150574,
"loss": 2.6224,
"step": 1476
},
{
"epoch": 0.42158876274705254,
"grad_norm": 0.62109375,
"learning_rate": 0.00018921646881765456,
"loss": 2.6103,
"step": 1477
},
{
"epoch": 0.4218741986053782,
"grad_norm": 0.578125,
"learning_rate": 0.0001890852968384746,
"loss": 2.6162,
"step": 1478
},
{
"epoch": 0.4221596344637039,
"grad_norm": 0.52734375,
"learning_rate": 0.0001889540927851596,
"loss": 2.628,
"step": 1479
},
{
"epoch": 0.4224450703220296,
"grad_norm": 0.65625,
"learning_rate": 0.0001888228567653781,
"loss": 2.6217,
"step": 1480
},
{
"epoch": 0.42273050618035535,
"grad_norm": 0.52734375,
"learning_rate": 0.00018869158888682494,
"loss": 2.613,
"step": 1481
},
{
"epoch": 0.42301594203868104,
"grad_norm": 0.5703125,
"learning_rate": 0.00018856028925722104,
"loss": 2.608,
"step": 1482
},
{
"epoch": 0.4233013778970067,
"grad_norm": 0.57421875,
"learning_rate": 0.00018842895798431327,
"loss": 2.6083,
"step": 1483
},
{
"epoch": 0.4235868137553324,
"grad_norm": 0.51171875,
"learning_rate": 0.00018829759517587457,
"loss": 2.6065,
"step": 1484
},
{
"epoch": 0.4238722496136581,
"grad_norm": 0.62890625,
"learning_rate": 0.00018816620093970387,
"loss": 2.6158,
"step": 1485
},
{
"epoch": 0.4241576854719838,
"grad_norm": 0.625,
"learning_rate": 0.00018803477538362562,
"loss": 2.628,
"step": 1486
},
{
"epoch": 0.4244431213303095,
"grad_norm": 0.52734375,
"learning_rate": 0.00018790331861549023,
"loss": 2.6095,
"step": 1487
},
{
"epoch": 0.42472855718863517,
"grad_norm": 0.58984375,
"learning_rate": 0.00018777183074317349,
"loss": 2.5987,
"step": 1488
},
{
"epoch": 0.4250139930469609,
"grad_norm": 0.5625,
"learning_rate": 0.000187640311874577,
"loss": 2.5805,
"step": 1489
},
{
"epoch": 0.4252994289052866,
"grad_norm": 0.515625,
"learning_rate": 0.00018750876211762752,
"loss": 2.6163,
"step": 1490
},
{
"epoch": 0.4255848647636123,
"grad_norm": 0.53515625,
"learning_rate": 0.00018737718158027734,
"loss": 2.596,
"step": 1491
},
{
"epoch": 0.425870300621938,
"grad_norm": 0.54296875,
"learning_rate": 0.00018724557037050384,
"loss": 2.6397,
"step": 1492
},
{
"epoch": 0.42615573648026367,
"grad_norm": 0.53125,
"learning_rate": 0.0001871139285963098,
"loss": 2.6378,
"step": 1493
},
{
"epoch": 0.42644117233858936,
"grad_norm": 0.546875,
"learning_rate": 0.00018698225636572285,
"loss": 2.6063,
"step": 1494
},
{
"epoch": 0.42672660819691505,
"grad_norm": 0.5234375,
"learning_rate": 0.0001868505537867958,
"loss": 2.6003,
"step": 1495
},
{
"epoch": 0.42701204405524074,
"grad_norm": 0.58984375,
"learning_rate": 0.00018671882096760623,
"loss": 2.595,
"step": 1496
},
{
"epoch": 0.4272974799135665,
"grad_norm": 0.5546875,
"learning_rate": 0.00018658705801625656,
"loss": 2.5969,
"step": 1497
},
{
"epoch": 0.42758291577189217,
"grad_norm": 0.515625,
"learning_rate": 0.00018645526504087402,
"loss": 2.6158,
"step": 1498
},
{
"epoch": 0.42786835163021786,
"grad_norm": 0.5546875,
"learning_rate": 0.00018632344214961045,
"loss": 2.6027,
"step": 1499
},
{
"epoch": 0.42815378748854355,
"grad_norm": 0.53515625,
"learning_rate": 0.0001861915894506421,
"loss": 2.6258,
"step": 1500
},
{
"epoch": 0.42815378748854355,
"eval_loss": 2.498450517654419,
"eval_runtime": 5960.8882,
"eval_samples_per_second": 10.785,
"eval_steps_per_second": 10.785,
"step": 1500
},
{
"epoch": 0.42843922334686924,
"grad_norm": 0.578125,
"learning_rate": 0.00018605970705216988,
"loss": 2.5927,
"step": 1501
},
{
"epoch": 0.4287246592051949,
"grad_norm": 0.51171875,
"learning_rate": 0.00018592779506241902,
"loss": 2.5965,
"step": 1502
},
{
"epoch": 0.4290100950635206,
"grad_norm": 0.5625,
"learning_rate": 0.00018579585358963885,
"loss": 2.6102,
"step": 1503
},
{
"epoch": 0.4292955309218463,
"grad_norm": 0.5390625,
"learning_rate": 0.00018566388274210316,
"loss": 2.5903,
"step": 1504
},
{
"epoch": 0.42958096678017205,
"grad_norm": 0.515625,
"learning_rate": 0.00018553188262810974,
"loss": 2.6056,
"step": 1505
},
{
"epoch": 0.42986640263849774,
"grad_norm": 0.56640625,
"learning_rate": 0.00018539985335598033,
"loss": 2.6157,
"step": 1506
},
{
"epoch": 0.4301518384968234,
"grad_norm": 0.53125,
"learning_rate": 0.00018526779503406059,
"loss": 2.5769,
"step": 1507
},
{
"epoch": 0.4304372743551491,
"grad_norm": 0.55078125,
"learning_rate": 0.00018513570777072024,
"loss": 2.6171,
"step": 1508
},
{
"epoch": 0.4307227102134748,
"grad_norm": 0.52734375,
"learning_rate": 0.0001850035916743525,
"loss": 2.5859,
"step": 1509
},
{
"epoch": 0.4310081460718005,
"grad_norm": 0.52734375,
"learning_rate": 0.00018487144685337432,
"loss": 2.5976,
"step": 1510
},
{
"epoch": 0.4312935819301262,
"grad_norm": 0.5390625,
"learning_rate": 0.00018473927341622627,
"loss": 2.6144,
"step": 1511
},
{
"epoch": 0.43157901778845187,
"grad_norm": 0.53125,
"learning_rate": 0.0001846070714713724,
"loss": 2.6233,
"step": 1512
},
{
"epoch": 0.4318644536467776,
"grad_norm": 0.5390625,
"learning_rate": 0.0001844748411273001,
"loss": 2.6009,
"step": 1513
},
{
"epoch": 0.4321498895051033,
"grad_norm": 0.578125,
"learning_rate": 0.00018434258249252008,
"loss": 2.6117,
"step": 1514
},
{
"epoch": 0.432435325363429,
"grad_norm": 0.50390625,
"learning_rate": 0.00018421029567556633,
"loss": 2.6089,
"step": 1515
},
{
"epoch": 0.4327207612217547,
"grad_norm": 0.5390625,
"learning_rate": 0.00018407798078499588,
"loss": 2.5967,
"step": 1516
},
{
"epoch": 0.43300619708008037,
"grad_norm": 0.515625,
"learning_rate": 0.0001839456379293889,
"loss": 2.6026,
"step": 1517
},
{
"epoch": 0.43329163293840606,
"grad_norm": 0.515625,
"learning_rate": 0.00018381326721734833,
"loss": 2.6104,
"step": 1518
},
{
"epoch": 0.43357706879673175,
"grad_norm": 0.51953125,
"learning_rate": 0.00018368086875750013,
"loss": 2.6096,
"step": 1519
},
{
"epoch": 0.43386250465505743,
"grad_norm": 0.486328125,
"learning_rate": 0.00018354844265849307,
"loss": 2.6035,
"step": 1520
},
{
"epoch": 0.4341479405133831,
"grad_norm": 0.5234375,
"learning_rate": 0.0001834159890289984,
"loss": 2.6119,
"step": 1521
},
{
"epoch": 0.43443337637170887,
"grad_norm": 0.4921875,
"learning_rate": 0.00018328350797771018,
"loss": 2.6295,
"step": 1522
},
{
"epoch": 0.43471881223003456,
"grad_norm": 0.515625,
"learning_rate": 0.0001831509996133447,
"loss": 2.5938,
"step": 1523
},
{
"epoch": 0.43500424808836025,
"grad_norm": 0.50390625,
"learning_rate": 0.000183018464044641,
"loss": 2.6174,
"step": 1524
},
{
"epoch": 0.43528968394668593,
"grad_norm": 0.486328125,
"learning_rate": 0.00018288590138036028,
"loss": 2.6166,
"step": 1525
},
{
"epoch": 0.4355751198050116,
"grad_norm": 0.50390625,
"learning_rate": 0.00018275331172928587,
"loss": 2.6148,
"step": 1526
},
{
"epoch": 0.4358605556633373,
"grad_norm": 0.498046875,
"learning_rate": 0.00018262069520022338,
"loss": 2.5973,
"step": 1527
},
{
"epoch": 0.436145991521663,
"grad_norm": 0.51953125,
"learning_rate": 0.00018248805190200048,
"loss": 2.5931,
"step": 1528
},
{
"epoch": 0.4364314273799887,
"grad_norm": 0.51171875,
"learning_rate": 0.0001823553819434668,
"loss": 2.5844,
"step": 1529
},
{
"epoch": 0.43671686323831443,
"grad_norm": 0.515625,
"learning_rate": 0.00018222268543349374,
"loss": 2.6187,
"step": 1530
},
{
"epoch": 0.4370022990966401,
"grad_norm": 0.5234375,
"learning_rate": 0.00018208996248097458,
"loss": 2.5919,
"step": 1531
},
{
"epoch": 0.4372877349549658,
"grad_norm": 0.53125,
"learning_rate": 0.00018195721319482438,
"loss": 2.6071,
"step": 1532
},
{
"epoch": 0.4375731708132915,
"grad_norm": 0.515625,
"learning_rate": 0.00018182443768397963,
"loss": 2.6021,
"step": 1533
},
{
"epoch": 0.4378586066716172,
"grad_norm": 0.5546875,
"learning_rate": 0.00018169163605739845,
"loss": 2.5948,
"step": 1534
},
{
"epoch": 0.4381440425299429,
"grad_norm": 0.53515625,
"learning_rate": 0.0001815588084240604,
"loss": 2.6145,
"step": 1535
},
{
"epoch": 0.43842947838826857,
"grad_norm": 0.55859375,
"learning_rate": 0.0001814259548929663,
"loss": 2.5996,
"step": 1536
},
{
"epoch": 0.43871491424659426,
"grad_norm": 0.55078125,
"learning_rate": 0.0001812930755731383,
"loss": 2.6011,
"step": 1537
},
{
"epoch": 0.43900035010492,
"grad_norm": 0.56640625,
"learning_rate": 0.00018116017057361972,
"loss": 2.6185,
"step": 1538
},
{
"epoch": 0.4392857859632457,
"grad_norm": 0.609375,
"learning_rate": 0.00018102724000347488,
"loss": 2.5761,
"step": 1539
},
{
"epoch": 0.4395712218215714,
"grad_norm": 0.51953125,
"learning_rate": 0.00018089428397178908,
"loss": 2.6193,
"step": 1540
},
{
"epoch": 0.43985665767989707,
"grad_norm": 0.494140625,
"learning_rate": 0.0001807613025876687,
"loss": 2.6,
"step": 1541
},
{
"epoch": 0.44014209353822276,
"grad_norm": 0.53515625,
"learning_rate": 0.00018062829596024067,
"loss": 2.5964,
"step": 1542
},
{
"epoch": 0.44042752939654845,
"grad_norm": 0.5234375,
"learning_rate": 0.0001804952641986527,
"loss": 2.5884,
"step": 1543
},
{
"epoch": 0.44071296525487413,
"grad_norm": 0.5,
"learning_rate": 0.00018036220741207332,
"loss": 2.5893,
"step": 1544
},
{
"epoch": 0.4409984011131998,
"grad_norm": 0.484375,
"learning_rate": 0.0001802291257096914,
"loss": 2.5842,
"step": 1545
},
{
"epoch": 0.44128383697152557,
"grad_norm": 0.498046875,
"learning_rate": 0.00018009601920071624,
"loss": 2.6291,
"step": 1546
},
{
"epoch": 0.44156927282985126,
"grad_norm": 0.470703125,
"learning_rate": 0.00017996288799437758,
"loss": 2.6153,
"step": 1547
},
{
"epoch": 0.44185470868817694,
"grad_norm": 0.50390625,
"learning_rate": 0.00017982973219992548,
"loss": 2.5752,
"step": 1548
},
{
"epoch": 0.44214014454650263,
"grad_norm": 0.482421875,
"learning_rate": 0.00017969655192663007,
"loss": 2.5856,
"step": 1549
},
{
"epoch": 0.4424255804048283,
"grad_norm": 0.48828125,
"learning_rate": 0.00017956334728378158,
"loss": 2.5989,
"step": 1550
},
{
"epoch": 0.442711016263154,
"grad_norm": 0.5078125,
"learning_rate": 0.00017943011838069021,
"loss": 2.621,
"step": 1551
},
{
"epoch": 0.4429964521214797,
"grad_norm": 0.51171875,
"learning_rate": 0.0001792968653266863,
"loss": 2.6003,
"step": 1552
},
{
"epoch": 0.4432818879798054,
"grad_norm": 0.52734375,
"learning_rate": 0.00017916358823111972,
"loss": 2.6094,
"step": 1553
},
{
"epoch": 0.44356732383813113,
"grad_norm": 0.490234375,
"learning_rate": 0.0001790302872033601,
"loss": 2.6167,
"step": 1554
},
{
"epoch": 0.4438527596964568,
"grad_norm": 0.494140625,
"learning_rate": 0.00017889696235279693,
"loss": 2.576,
"step": 1555
},
{
"epoch": 0.4441381955547825,
"grad_norm": 0.478515625,
"learning_rate": 0.00017876361378883903,
"loss": 2.5914,
"step": 1556
},
{
"epoch": 0.4444236314131082,
"grad_norm": 0.515625,
"learning_rate": 0.00017863024162091478,
"loss": 2.591,
"step": 1557
},
{
"epoch": 0.4447090672714339,
"grad_norm": 0.482421875,
"learning_rate": 0.0001784968459584719,
"loss": 2.6002,
"step": 1558
},
{
"epoch": 0.4449945031297596,
"grad_norm": 0.482421875,
"learning_rate": 0.00017836342691097742,
"loss": 2.5826,
"step": 1559
},
{
"epoch": 0.44527993898808527,
"grad_norm": 0.4921875,
"learning_rate": 0.0001782299845879175,
"loss": 2.5972,
"step": 1560
},
{
"epoch": 0.44556537484641096,
"grad_norm": 0.48046875,
"learning_rate": 0.00017809651909879749,
"loss": 2.5984,
"step": 1561
},
{
"epoch": 0.4458508107047367,
"grad_norm": 0.5,
"learning_rate": 0.00017796303055314164,
"loss": 2.5803,
"step": 1562
},
{
"epoch": 0.4461362465630624,
"grad_norm": 0.515625,
"learning_rate": 0.00017782951906049316,
"loss": 2.6079,
"step": 1563
},
{
"epoch": 0.4464216824213881,
"grad_norm": 0.486328125,
"learning_rate": 0.00017769598473041422,
"loss": 2.5998,
"step": 1564
},
{
"epoch": 0.44670711827971377,
"grad_norm": 0.5234375,
"learning_rate": 0.00017756242767248557,
"loss": 2.5921,
"step": 1565
},
{
"epoch": 0.44699255413803946,
"grad_norm": 0.52734375,
"learning_rate": 0.0001774288479963066,
"loss": 2.5799,
"step": 1566
},
{
"epoch": 0.44727798999636514,
"grad_norm": 0.55078125,
"learning_rate": 0.00017729524581149537,
"loss": 2.639,
"step": 1567
},
{
"epoch": 0.44756342585469083,
"grad_norm": 0.52734375,
"learning_rate": 0.00017716162122768836,
"loss": 2.613,
"step": 1568
},
{
"epoch": 0.4478488617130165,
"grad_norm": 0.5,
"learning_rate": 0.0001770279743545405,
"loss": 2.6075,
"step": 1569
},
{
"epoch": 0.44813429757134227,
"grad_norm": 0.515625,
"learning_rate": 0.00017689430530172482,
"loss": 2.5834,
"step": 1570
},
{
"epoch": 0.44841973342966795,
"grad_norm": 0.51171875,
"learning_rate": 0.00017676061417893274,
"loss": 2.607,
"step": 1571
},
{
"epoch": 0.44870516928799364,
"grad_norm": 0.51171875,
"learning_rate": 0.00017662690109587382,
"loss": 2.5996,
"step": 1572
},
{
"epoch": 0.44899060514631933,
"grad_norm": 0.51171875,
"learning_rate": 0.00017649316616227538,
"loss": 2.5941,
"step": 1573
},
{
"epoch": 0.449276041004645,
"grad_norm": 0.5234375,
"learning_rate": 0.0001763594094878829,
"loss": 2.5961,
"step": 1574
},
{
"epoch": 0.4495614768629707,
"grad_norm": 0.50390625,
"learning_rate": 0.00017622563118245972,
"loss": 2.5923,
"step": 1575
},
{
"epoch": 0.4498469127212964,
"grad_norm": 0.53515625,
"learning_rate": 0.00017609183135578675,
"loss": 2.5981,
"step": 1576
},
{
"epoch": 0.4501323485796221,
"grad_norm": 0.5234375,
"learning_rate": 0.00017595801011766274,
"loss": 2.6039,
"step": 1577
},
{
"epoch": 0.45041778443794783,
"grad_norm": 0.51171875,
"learning_rate": 0.00017582416757790388,
"loss": 2.587,
"step": 1578
},
{
"epoch": 0.4507032202962735,
"grad_norm": 0.52734375,
"learning_rate": 0.0001756903038463439,
"loss": 2.5729,
"step": 1579
},
{
"epoch": 0.4509886561545992,
"grad_norm": 0.47265625,
"learning_rate": 0.0001755564190328339,
"loss": 2.6028,
"step": 1580
},
{
"epoch": 0.4512740920129249,
"grad_norm": 0.53125,
"learning_rate": 0.00017542251324724237,
"loss": 2.5784,
"step": 1581
},
{
"epoch": 0.4515595278712506,
"grad_norm": 0.50390625,
"learning_rate": 0.00017528858659945486,
"loss": 2.6228,
"step": 1582
},
{
"epoch": 0.4518449637295763,
"grad_norm": 0.51171875,
"learning_rate": 0.00017515463919937413,
"loss": 2.6181,
"step": 1583
},
{
"epoch": 0.45213039958790197,
"grad_norm": 0.498046875,
"learning_rate": 0.00017502067115691996,
"loss": 2.5915,
"step": 1584
},
{
"epoch": 0.45241583544622765,
"grad_norm": 0.462890625,
"learning_rate": 0.0001748866825820291,
"loss": 2.6104,
"step": 1585
},
{
"epoch": 0.45270127130455334,
"grad_norm": 0.49609375,
"learning_rate": 0.00017475267358465504,
"loss": 2.5913,
"step": 1586
},
{
"epoch": 0.4529867071628791,
"grad_norm": 0.484375,
"learning_rate": 0.00017461864427476814,
"loss": 2.6017,
"step": 1587
},
{
"epoch": 0.4532721430212048,
"grad_norm": 0.55078125,
"learning_rate": 0.0001744845947623554,
"loss": 2.6186,
"step": 1588
},
{
"epoch": 0.45355757887953047,
"grad_norm": 0.5078125,
"learning_rate": 0.00017435052515742038,
"loss": 2.5961,
"step": 1589
},
{
"epoch": 0.45384301473785615,
"grad_norm": 0.54296875,
"learning_rate": 0.00017421643556998312,
"loss": 2.5929,
"step": 1590
},
{
"epoch": 0.45412845059618184,
"grad_norm": 0.50390625,
"learning_rate": 0.0001740823261100801,
"loss": 2.5902,
"step": 1591
},
{
"epoch": 0.45441388645450753,
"grad_norm": 0.5390625,
"learning_rate": 0.0001739481968877641,
"loss": 2.5817,
"step": 1592
},
{
"epoch": 0.4546993223128332,
"grad_norm": 0.50390625,
"learning_rate": 0.00017381404801310404,
"loss": 2.5856,
"step": 1593
},
{
"epoch": 0.4549847581711589,
"grad_norm": 0.515625,
"learning_rate": 0.00017367987959618505,
"loss": 2.5742,
"step": 1594
},
{
"epoch": 0.45527019402948465,
"grad_norm": 0.51953125,
"learning_rate": 0.00017354569174710834,
"loss": 2.5916,
"step": 1595
},
{
"epoch": 0.45555562988781034,
"grad_norm": 0.515625,
"learning_rate": 0.00017341148457599096,
"loss": 2.5964,
"step": 1596
},
{
"epoch": 0.45584106574613603,
"grad_norm": 0.5625,
"learning_rate": 0.00017327725819296576,
"loss": 2.597,
"step": 1597
},
{
"epoch": 0.4561265016044617,
"grad_norm": 0.48046875,
"learning_rate": 0.0001731430127081816,
"loss": 2.5921,
"step": 1598
},
{
"epoch": 0.4564119374627874,
"grad_norm": 0.5390625,
"learning_rate": 0.00017300874823180282,
"loss": 2.61,
"step": 1599
},
{
"epoch": 0.4566973733211131,
"grad_norm": 0.5703125,
"learning_rate": 0.00017287446487400935,
"loss": 2.5985,
"step": 1600
},
{
"epoch": 0.4569828091794388,
"grad_norm": 0.494140625,
"learning_rate": 0.00017274016274499665,
"loss": 2.6079,
"step": 1601
},
{
"epoch": 0.4572682450377645,
"grad_norm": 0.55078125,
"learning_rate": 0.00017260584195497567,
"loss": 2.5797,
"step": 1602
},
{
"epoch": 0.4575536808960902,
"grad_norm": 0.494140625,
"learning_rate": 0.00017247150261417255,
"loss": 2.6106,
"step": 1603
},
{
"epoch": 0.4578391167544159,
"grad_norm": 0.53125,
"learning_rate": 0.0001723371448328287,
"loss": 2.5846,
"step": 1604
},
{
"epoch": 0.4581245526127416,
"grad_norm": 0.55078125,
"learning_rate": 0.00017220276872120072,
"loss": 2.5763,
"step": 1605
},
{
"epoch": 0.4584099884710673,
"grad_norm": 0.52734375,
"learning_rate": 0.00017206837438956004,
"loss": 2.5878,
"step": 1606
},
{
"epoch": 0.458695424329393,
"grad_norm": 0.5390625,
"learning_rate": 0.00017193396194819328,
"loss": 2.5931,
"step": 1607
},
{
"epoch": 0.45898086018771866,
"grad_norm": 0.5078125,
"learning_rate": 0.00017179953150740193,
"loss": 2.5835,
"step": 1608
},
{
"epoch": 0.45926629604604435,
"grad_norm": 0.5625,
"learning_rate": 0.000171665083177502,
"loss": 2.6094,
"step": 1609
},
{
"epoch": 0.45955173190437004,
"grad_norm": 0.515625,
"learning_rate": 0.00017153061706882443,
"loss": 2.6024,
"step": 1610
},
{
"epoch": 0.4598371677626958,
"grad_norm": 0.53515625,
"learning_rate": 0.0001713961332917146,
"loss": 2.618,
"step": 1611
},
{
"epoch": 0.4601226036210215,
"grad_norm": 0.5625,
"learning_rate": 0.00017126163195653254,
"loss": 2.6115,
"step": 1612
},
{
"epoch": 0.46040803947934716,
"grad_norm": 0.515625,
"learning_rate": 0.00017112711317365247,
"loss": 2.5529,
"step": 1613
},
{
"epoch": 0.46069347533767285,
"grad_norm": 0.52734375,
"learning_rate": 0.00017099257705346314,
"loss": 2.6051,
"step": 1614
},
{
"epoch": 0.46097891119599854,
"grad_norm": 0.56640625,
"learning_rate": 0.00017085802370636743,
"loss": 2.6073,
"step": 1615
},
{
"epoch": 0.46126434705432423,
"grad_norm": 0.494140625,
"learning_rate": 0.00017072345324278232,
"loss": 2.5969,
"step": 1616
},
{
"epoch": 0.4615497829126499,
"grad_norm": 0.54296875,
"learning_rate": 0.00017058886577313892,
"loss": 2.6139,
"step": 1617
},
{
"epoch": 0.4618352187709756,
"grad_norm": 0.50390625,
"learning_rate": 0.00017045426140788224,
"loss": 2.5696,
"step": 1618
},
{
"epoch": 0.46212065462930135,
"grad_norm": 0.53125,
"learning_rate": 0.00017031964025747117,
"loss": 2.5835,
"step": 1619
},
{
"epoch": 0.46240609048762704,
"grad_norm": 0.515625,
"learning_rate": 0.00017018500243237838,
"loss": 2.5731,
"step": 1620
},
{
"epoch": 0.46269152634595273,
"grad_norm": 0.482421875,
"learning_rate": 0.00017005034804309027,
"loss": 2.6096,
"step": 1621
},
{
"epoch": 0.4629769622042784,
"grad_norm": 0.494140625,
"learning_rate": 0.00016991567720010668,
"loss": 2.6063,
"step": 1622
},
{
"epoch": 0.4632623980626041,
"grad_norm": 0.453125,
"learning_rate": 0.00016978099001394112,
"loss": 2.6002,
"step": 1623
},
{
"epoch": 0.4635478339209298,
"grad_norm": 0.5234375,
"learning_rate": 0.00016964628659512046,
"loss": 2.5955,
"step": 1624
},
{
"epoch": 0.4638332697792555,
"grad_norm": 0.482421875,
"learning_rate": 0.00016951156705418484,
"loss": 2.5975,
"step": 1625
},
{
"epoch": 0.4641187056375812,
"grad_norm": 0.609375,
"learning_rate": 0.00016937683150168765,
"loss": 2.5944,
"step": 1626
},
{
"epoch": 0.4644041414959069,
"grad_norm": 0.9921875,
"learning_rate": 0.0001692420800481955,
"loss": 2.5734,
"step": 1627
},
{
"epoch": 0.4646895773542326,
"grad_norm": 0.73828125,
"learning_rate": 0.000169107312804288,
"loss": 2.6232,
"step": 1628
},
{
"epoch": 0.4649750132125583,
"grad_norm": 0.8046875,
"learning_rate": 0.0001689725298805576,
"loss": 2.5985,
"step": 1629
},
{
"epoch": 0.465260449070884,
"grad_norm": 1.5234375,
"learning_rate": 0.00016883773138760976,
"loss": 2.578,
"step": 1630
},
{
"epoch": 0.4655458849292097,
"grad_norm": 0.83203125,
"learning_rate": 0.00016870291743606273,
"loss": 2.5762,
"step": 1631
},
{
"epoch": 0.46583132078753536,
"grad_norm": 0.8359375,
"learning_rate": 0.0001685680881365474,
"loss": 2.5714,
"step": 1632
},
{
"epoch": 0.46611675664586105,
"grad_norm": 0.8671875,
"learning_rate": 0.00016843324359970712,
"loss": 2.5721,
"step": 1633
},
{
"epoch": 0.46640219250418674,
"grad_norm": 0.7109375,
"learning_rate": 0.00016829838393619796,
"loss": 2.6092,
"step": 1634
},
{
"epoch": 0.4666876283625125,
"grad_norm": 0.6875,
"learning_rate": 0.00016816350925668837,
"loss": 2.5973,
"step": 1635
},
{
"epoch": 0.4669730642208382,
"grad_norm": 0.85546875,
"learning_rate": 0.000168028619671859,
"loss": 2.6003,
"step": 1636
},
{
"epoch": 0.46725850007916386,
"grad_norm": 0.6171875,
"learning_rate": 0.00016789371529240271,
"loss": 2.612,
"step": 1637
},
{
"epoch": 0.46754393593748955,
"grad_norm": 0.79296875,
"learning_rate": 0.0001677587962290248,
"loss": 2.5903,
"step": 1638
},
{
"epoch": 0.46782937179581524,
"grad_norm": 0.58984375,
"learning_rate": 0.00016762386259244224,
"loss": 2.5791,
"step": 1639
},
{
"epoch": 0.46811480765414093,
"grad_norm": 0.65234375,
"learning_rate": 0.0001674889144933842,
"loss": 2.6103,
"step": 1640
},
{
"epoch": 0.4684002435124666,
"grad_norm": 0.63671875,
"learning_rate": 0.00016735395204259162,
"loss": 2.5757,
"step": 1641
},
{
"epoch": 0.4686856793707923,
"grad_norm": 0.60546875,
"learning_rate": 0.00016721897535081724,
"loss": 2.5925,
"step": 1642
},
{
"epoch": 0.46897111522911805,
"grad_norm": 0.66796875,
"learning_rate": 0.00016708398452882552,
"loss": 2.6213,
"step": 1643
},
{
"epoch": 0.46925655108744374,
"grad_norm": 0.5546875,
"learning_rate": 0.00016694897968739245,
"loss": 2.5948,
"step": 1644
},
{
"epoch": 0.46954198694576943,
"grad_norm": 0.6015625,
"learning_rate": 0.0001668139609373056,
"loss": 2.5849,
"step": 1645
},
{
"epoch": 0.4698274228040951,
"grad_norm": 0.62109375,
"learning_rate": 0.00016667892838936389,
"loss": 2.6265,
"step": 1646
},
{
"epoch": 0.4701128586624208,
"grad_norm": 0.57421875,
"learning_rate": 0.00016654388215437755,
"loss": 2.6059,
"step": 1647
},
{
"epoch": 0.4703982945207465,
"grad_norm": 0.5078125,
"learning_rate": 0.0001664088223431682,
"loss": 2.6298,
"step": 1648
},
{
"epoch": 0.4706837303790722,
"grad_norm": 0.5390625,
"learning_rate": 0.0001662737490665683,
"loss": 2.6045,
"step": 1649
},
{
"epoch": 0.4709691662373979,
"grad_norm": 0.52734375,
"learning_rate": 0.0001661386624354217,
"loss": 2.6153,
"step": 1650
},
{
"epoch": 0.4712546020957236,
"grad_norm": 0.498046875,
"learning_rate": 0.00016600356256058296,
"loss": 2.5974,
"step": 1651
},
{
"epoch": 0.4715400379540493,
"grad_norm": 0.515625,
"learning_rate": 0.00016586844955291768,
"loss": 2.5846,
"step": 1652
},
{
"epoch": 0.471825473812375,
"grad_norm": 0.515625,
"learning_rate": 0.00016573332352330203,
"loss": 2.5888,
"step": 1653
},
{
"epoch": 0.4721109096707007,
"grad_norm": 0.515625,
"learning_rate": 0.00016559818458262304,
"loss": 2.5823,
"step": 1654
},
{
"epoch": 0.4723963455290264,
"grad_norm": 0.50390625,
"learning_rate": 0.00016546303284177837,
"loss": 2.5973,
"step": 1655
},
{
"epoch": 0.47268178138735206,
"grad_norm": 0.51953125,
"learning_rate": 0.000165327868411676,
"loss": 2.5688,
"step": 1656
},
{
"epoch": 0.47296721724567775,
"grad_norm": 0.51171875,
"learning_rate": 0.00016519269140323443,
"loss": 2.584,
"step": 1657
},
{
"epoch": 0.47325265310400344,
"grad_norm": 0.51953125,
"learning_rate": 0.00016505750192738253,
"loss": 2.5829,
"step": 1658
},
{
"epoch": 0.47353808896232913,
"grad_norm": 0.50390625,
"learning_rate": 0.00016492230009505928,
"loss": 2.5653,
"step": 1659
},
{
"epoch": 0.4738235248206549,
"grad_norm": 0.5078125,
"learning_rate": 0.0001647870860172139,
"loss": 2.6081,
"step": 1660
},
{
"epoch": 0.47410896067898056,
"grad_norm": 0.49609375,
"learning_rate": 0.00016465185980480562,
"loss": 2.5732,
"step": 1661
},
{
"epoch": 0.47439439653730625,
"grad_norm": 0.53515625,
"learning_rate": 0.0001645166215688036,
"loss": 2.5776,
"step": 1662
},
{
"epoch": 0.47467983239563194,
"grad_norm": 0.5078125,
"learning_rate": 0.000164381371420187,
"loss": 2.5898,
"step": 1663
},
{
"epoch": 0.47496526825395763,
"grad_norm": 0.53515625,
"learning_rate": 0.00016424610946994453,
"loss": 2.6061,
"step": 1664
},
{
"epoch": 0.4752507041122833,
"grad_norm": 0.50390625,
"learning_rate": 0.00016411083582907476,
"loss": 2.5932,
"step": 1665
},
{
"epoch": 0.475536139970609,
"grad_norm": 0.478515625,
"learning_rate": 0.0001639755506085858,
"loss": 2.5887,
"step": 1666
},
{
"epoch": 0.4758215758289347,
"grad_norm": 0.484375,
"learning_rate": 0.0001638402539194953,
"loss": 2.597,
"step": 1667
},
{
"epoch": 0.47610701168726044,
"grad_norm": 0.50390625,
"learning_rate": 0.00016370494587283026,
"loss": 2.5624,
"step": 1668
},
{
"epoch": 0.47639244754558613,
"grad_norm": 0.44921875,
"learning_rate": 0.00016356962657962693,
"loss": 2.571,
"step": 1669
},
{
"epoch": 0.4766778834039118,
"grad_norm": 0.51171875,
"learning_rate": 0.00016343429615093104,
"loss": 2.5971,
"step": 1670
},
{
"epoch": 0.4769633192622375,
"grad_norm": 0.462890625,
"learning_rate": 0.00016329895469779725,
"loss": 2.5999,
"step": 1671
},
{
"epoch": 0.4772487551205632,
"grad_norm": 0.48046875,
"learning_rate": 0.00016316360233128933,
"loss": 2.5949,
"step": 1672
},
{
"epoch": 0.4775341909788889,
"grad_norm": 0.46484375,
"learning_rate": 0.0001630282391624799,
"loss": 2.599,
"step": 1673
},
{
"epoch": 0.4778196268372146,
"grad_norm": 0.52734375,
"learning_rate": 0.00016289286530245064,
"loss": 2.5983,
"step": 1674
},
{
"epoch": 0.47810506269554026,
"grad_norm": 0.4921875,
"learning_rate": 0.00016275748086229193,
"loss": 2.5857,
"step": 1675
},
{
"epoch": 0.478390498553866,
"grad_norm": 0.44140625,
"learning_rate": 0.0001626220859531027,
"loss": 2.5945,
"step": 1676
},
{
"epoch": 0.4786759344121917,
"grad_norm": 0.494140625,
"learning_rate": 0.00016248668068599066,
"loss": 2.6017,
"step": 1677
},
{
"epoch": 0.4789613702705174,
"grad_norm": 0.46484375,
"learning_rate": 0.0001623512651720719,
"loss": 2.6014,
"step": 1678
},
{
"epoch": 0.4792468061288431,
"grad_norm": 0.486328125,
"learning_rate": 0.00016221583952247097,
"loss": 2.5712,
"step": 1679
},
{
"epoch": 0.47953224198716876,
"grad_norm": 0.458984375,
"learning_rate": 0.00016208040384832072,
"loss": 2.5989,
"step": 1680
},
{
"epoch": 0.47981767784549445,
"grad_norm": 0.48828125,
"learning_rate": 0.00016194495826076224,
"loss": 2.5548,
"step": 1681
},
{
"epoch": 0.48010311370382014,
"grad_norm": 0.47265625,
"learning_rate": 0.0001618095028709447,
"loss": 2.5883,
"step": 1682
},
{
"epoch": 0.48038854956214583,
"grad_norm": 0.9296875,
"learning_rate": 0.0001616740377900254,
"loss": 2.6151,
"step": 1683
},
{
"epoch": 0.4806739854204716,
"grad_norm": 0.50390625,
"learning_rate": 0.00016153856312916957,
"loss": 2.5432,
"step": 1684
},
{
"epoch": 0.48095942127879726,
"grad_norm": 0.671875,
"learning_rate": 0.00016140307899955024,
"loss": 2.5735,
"step": 1685
},
{
"epoch": 0.48124485713712295,
"grad_norm": 0.671875,
"learning_rate": 0.00016126758551234825,
"loss": 2.5766,
"step": 1686
},
{
"epoch": 0.48153029299544864,
"grad_norm": 0.578125,
"learning_rate": 0.0001611320827787522,
"loss": 2.5697,
"step": 1687
},
{
"epoch": 0.4818157288537743,
"grad_norm": 0.5859375,
"learning_rate": 0.00016099657090995812,
"loss": 2.5824,
"step": 1688
},
{
"epoch": 0.4821011647121,
"grad_norm": 0.50390625,
"learning_rate": 0.0001608610500171696,
"loss": 2.5885,
"step": 1689
},
{
"epoch": 0.4823866005704257,
"grad_norm": 0.5078125,
"learning_rate": 0.00016072552021159775,
"loss": 2.5984,
"step": 1690
},
{
"epoch": 0.4826720364287514,
"grad_norm": 0.55078125,
"learning_rate": 0.0001605899816044608,
"loss": 2.6025,
"step": 1691
},
{
"epoch": 0.48295747228707714,
"grad_norm": 0.5,
"learning_rate": 0.00016045443430698437,
"loss": 2.6107,
"step": 1692
},
{
"epoch": 0.4832429081454028,
"grad_norm": 0.52734375,
"learning_rate": 0.00016031887843040104,
"loss": 2.5978,
"step": 1693
},
{
"epoch": 0.4835283440037285,
"grad_norm": 0.53515625,
"learning_rate": 0.00016018331408595063,
"loss": 2.5974,
"step": 1694
},
{
"epoch": 0.4838137798620542,
"grad_norm": 0.53515625,
"learning_rate": 0.00016004774138487983,
"loss": 2.6113,
"step": 1695
},
{
"epoch": 0.4840992157203799,
"grad_norm": 0.51171875,
"learning_rate": 0.00015991216043844208,
"loss": 2.5766,
"step": 1696
},
{
"epoch": 0.4843846515787056,
"grad_norm": 0.5,
"learning_rate": 0.00015977657135789764,
"loss": 2.5671,
"step": 1697
},
{
"epoch": 0.48467008743703127,
"grad_norm": 0.54296875,
"learning_rate": 0.0001596409742545136,
"loss": 2.6138,
"step": 1698
},
{
"epoch": 0.48495552329535696,
"grad_norm": 0.45703125,
"learning_rate": 0.00015950536923956346,
"loss": 2.5962,
"step": 1699
},
{
"epoch": 0.4852409591536827,
"grad_norm": 0.50390625,
"learning_rate": 0.00015936975642432725,
"loss": 2.5992,
"step": 1700
},
{
"epoch": 0.4855263950120084,
"grad_norm": 0.50390625,
"learning_rate": 0.00015923413592009144,
"loss": 2.5925,
"step": 1701
},
{
"epoch": 0.4858118308703341,
"grad_norm": 0.462890625,
"learning_rate": 0.00015909850783814874,
"loss": 2.5949,
"step": 1702
},
{
"epoch": 0.48609726672865977,
"grad_norm": 0.515625,
"learning_rate": 0.00015896287228979816,
"loss": 2.5671,
"step": 1703
},
{
"epoch": 0.48638270258698546,
"grad_norm": 0.5,
"learning_rate": 0.00015882722938634477,
"loss": 2.5684,
"step": 1704
},
{
"epoch": 0.48666813844531115,
"grad_norm": 0.482421875,
"learning_rate": 0.00015869157923909978,
"loss": 2.59,
"step": 1705
},
{
"epoch": 0.48695357430363684,
"grad_norm": 0.515625,
"learning_rate": 0.00015855592195938018,
"loss": 2.587,
"step": 1706
},
{
"epoch": 0.4872390101619625,
"grad_norm": 0.46875,
"learning_rate": 0.00015842025765850894,
"loss": 2.5942,
"step": 1707
},
{
"epoch": 0.48752444602028827,
"grad_norm": 0.48046875,
"learning_rate": 0.00015828458644781478,
"loss": 2.604,
"step": 1708
},
{
"epoch": 0.48780988187861396,
"grad_norm": 0.44140625,
"learning_rate": 0.00015814890843863204,
"loss": 2.5862,
"step": 1709
},
{
"epoch": 0.48809531773693965,
"grad_norm": 0.486328125,
"learning_rate": 0.00015801322374230068,
"loss": 2.5813,
"step": 1710
},
{
"epoch": 0.48838075359526534,
"grad_norm": 0.4453125,
"learning_rate": 0.00015787753247016608,
"loss": 2.5988,
"step": 1711
},
{
"epoch": 0.488666189453591,
"grad_norm": 0.470703125,
"learning_rate": 0.00015774183473357914,
"loss": 2.5786,
"step": 1712
},
{
"epoch": 0.4889516253119167,
"grad_norm": 0.48828125,
"learning_rate": 0.00015760613064389595,
"loss": 2.5616,
"step": 1713
},
{
"epoch": 0.4892370611702424,
"grad_norm": 0.484375,
"learning_rate": 0.00015747042031247785,
"loss": 2.5828,
"step": 1714
},
{
"epoch": 0.4895224970285681,
"grad_norm": 0.47265625,
"learning_rate": 0.0001573347038506914,
"loss": 2.565,
"step": 1715
},
{
"epoch": 0.48980793288689384,
"grad_norm": 0.46875,
"learning_rate": 0.00015719898136990794,
"loss": 2.5747,
"step": 1716
},
{
"epoch": 0.4900933687452195,
"grad_norm": 0.466796875,
"learning_rate": 0.00015706325298150403,
"loss": 2.5779,
"step": 1717
},
{
"epoch": 0.4903788046035452,
"grad_norm": 0.4921875,
"learning_rate": 0.00015692751879686095,
"loss": 2.5682,
"step": 1718
},
{
"epoch": 0.4906642404618709,
"grad_norm": 0.48828125,
"learning_rate": 0.00015679177892736468,
"loss": 2.5675,
"step": 1719
},
{
"epoch": 0.4909496763201966,
"grad_norm": 0.4765625,
"learning_rate": 0.00015665603348440595,
"loss": 2.5824,
"step": 1720
},
{
"epoch": 0.4912351121785223,
"grad_norm": 0.52734375,
"learning_rate": 0.0001565202825793801,
"loss": 2.5604,
"step": 1721
},
{
"epoch": 0.49152054803684797,
"grad_norm": 0.5,
"learning_rate": 0.0001563845263236868,
"loss": 2.5612,
"step": 1722
},
{
"epoch": 0.49180598389517366,
"grad_norm": 0.5234375,
"learning_rate": 0.0001562487648287303,
"loss": 2.6068,
"step": 1723
},
{
"epoch": 0.4920914197534994,
"grad_norm": 0.47265625,
"learning_rate": 0.000156112998205919,
"loss": 2.5695,
"step": 1724
},
{
"epoch": 0.4923768556118251,
"grad_norm": 0.51953125,
"learning_rate": 0.00015597722656666554,
"loss": 2.5929,
"step": 1725
},
{
"epoch": 0.4926622914701508,
"grad_norm": 0.515625,
"learning_rate": 0.00015584145002238677,
"loss": 2.5656,
"step": 1726
},
{
"epoch": 0.49294772732847647,
"grad_norm": 0.482421875,
"learning_rate": 0.00015570566868450343,
"loss": 2.5609,
"step": 1727
},
{
"epoch": 0.49323316318680216,
"grad_norm": 0.5234375,
"learning_rate": 0.00015556988266444028,
"loss": 2.5954,
"step": 1728
},
{
"epoch": 0.49351859904512785,
"grad_norm": 0.48828125,
"learning_rate": 0.0001554340920736259,
"loss": 2.5662,
"step": 1729
},
{
"epoch": 0.49380403490345354,
"grad_norm": 0.4921875,
"learning_rate": 0.00015529829702349266,
"loss": 2.6074,
"step": 1730
},
{
"epoch": 0.4940894707617792,
"grad_norm": 0.53515625,
"learning_rate": 0.0001551624976254765,
"loss": 2.593,
"step": 1731
},
{
"epoch": 0.4943749066201049,
"grad_norm": 0.5,
"learning_rate": 0.00015502669399101695,
"loss": 2.6089,
"step": 1732
},
{
"epoch": 0.49466034247843066,
"grad_norm": 0.5,
"learning_rate": 0.00015489088623155716,
"loss": 2.5917,
"step": 1733
},
{
"epoch": 0.49494577833675635,
"grad_norm": 0.53515625,
"learning_rate": 0.00015475507445854343,
"loss": 2.566,
"step": 1734
},
{
"epoch": 0.49523121419508204,
"grad_norm": 0.5,
"learning_rate": 0.00015461925878342556,
"loss": 2.5928,
"step": 1735
},
{
"epoch": 0.4955166500534077,
"grad_norm": 0.55859375,
"learning_rate": 0.00015448343931765635,
"loss": 2.5719,
"step": 1736
},
{
"epoch": 0.4958020859117334,
"grad_norm": 0.50390625,
"learning_rate": 0.000154347616172692,
"loss": 2.5568,
"step": 1737
},
{
"epoch": 0.4960875217700591,
"grad_norm": 0.49609375,
"learning_rate": 0.00015421178945999143,
"loss": 2.5836,
"step": 1738
},
{
"epoch": 0.4963729576283848,
"grad_norm": 0.498046875,
"learning_rate": 0.00015407595929101665,
"loss": 2.5957,
"step": 1739
},
{
"epoch": 0.4966583934867105,
"grad_norm": 0.4609375,
"learning_rate": 0.0001539401257772324,
"loss": 2.6004,
"step": 1740
},
{
"epoch": 0.4969438293450362,
"grad_norm": 0.51171875,
"learning_rate": 0.0001538042890301064,
"loss": 2.5866,
"step": 1741
},
{
"epoch": 0.4972292652033619,
"grad_norm": 0.478515625,
"learning_rate": 0.00015366844916110868,
"loss": 2.5744,
"step": 1742
},
{
"epoch": 0.4975147010616876,
"grad_norm": 0.474609375,
"learning_rate": 0.00015353260628171212,
"loss": 2.6165,
"step": 1743
},
{
"epoch": 0.4978001369200133,
"grad_norm": 0.5,
"learning_rate": 0.0001533967605033919,
"loss": 2.5778,
"step": 1744
},
{
"epoch": 0.498085572778339,
"grad_norm": 0.423828125,
"learning_rate": 0.00015326091193762568,
"loss": 2.5816,
"step": 1745
},
{
"epoch": 0.49837100863666467,
"grad_norm": 0.5078125,
"learning_rate": 0.00015312506069589335,
"loss": 2.6123,
"step": 1746
},
{
"epoch": 0.49865644449499036,
"grad_norm": 0.458984375,
"learning_rate": 0.00015298920688967702,
"loss": 2.5834,
"step": 1747
},
{
"epoch": 0.49894188035331605,
"grad_norm": 0.51953125,
"learning_rate": 0.00015285335063046089,
"loss": 2.5644,
"step": 1748
},
{
"epoch": 0.4992273162116418,
"grad_norm": 0.50390625,
"learning_rate": 0.00015271749202973116,
"loss": 2.5766,
"step": 1749
},
{
"epoch": 0.4995127520699675,
"grad_norm": 0.52734375,
"learning_rate": 0.000152581631198976,
"loss": 2.5764,
"step": 1750
},
{
"epoch": 0.4995127520699675,
"eval_loss": 2.4794108867645264,
"eval_runtime": 6003.2988,
"eval_samples_per_second": 10.708,
"eval_steps_per_second": 10.708,
"step": 1750
},
{
"epoch": 0.49979818792829317,
"grad_norm": 0.462890625,
"learning_rate": 0.00015244576824968538,
"loss": 2.5287,
"step": 1751
},
{
"epoch": 0.5000836237866189,
"grad_norm": 0.486328125,
"learning_rate": 0.000152309903293351,
"loss": 2.5808,
"step": 1752
},
{
"epoch": 0.5003690596449446,
"grad_norm": 0.4609375,
"learning_rate": 0.00015217403644146626,
"loss": 2.6024,
"step": 1753
},
{
"epoch": 0.5006544955032702,
"grad_norm": 0.50390625,
"learning_rate": 0.000152038167805526,
"loss": 2.6072,
"step": 1754
},
{
"epoch": 0.500939931361596,
"grad_norm": 0.50390625,
"learning_rate": 0.00015190229749702664,
"loss": 2.5662,
"step": 1755
},
{
"epoch": 0.5012253672199216,
"grad_norm": 0.56640625,
"learning_rate": 0.00015176642562746587,
"loss": 2.5949,
"step": 1756
},
{
"epoch": 0.5015108030782474,
"grad_norm": 0.5546875,
"learning_rate": 0.0001516305523083428,
"loss": 2.5952,
"step": 1757
},
{
"epoch": 0.501796238936573,
"grad_norm": 0.58984375,
"learning_rate": 0.00015149467765115764,
"loss": 2.5761,
"step": 1758
},
{
"epoch": 0.5020816747948987,
"grad_norm": 0.51953125,
"learning_rate": 0.0001513588017674117,
"loss": 2.5776,
"step": 1759
},
{
"epoch": 0.5023671106532244,
"grad_norm": 0.5078125,
"learning_rate": 0.0001512229247686072,
"loss": 2.5913,
"step": 1760
},
{
"epoch": 0.5026525465115501,
"grad_norm": 0.498046875,
"learning_rate": 0.00015108704676624756,
"loss": 2.6031,
"step": 1761
},
{
"epoch": 0.5029379823698759,
"grad_norm": 0.55859375,
"learning_rate": 0.00015095116787183668,
"loss": 2.5457,
"step": 1762
},
{
"epoch": 0.5032234182282015,
"grad_norm": 0.51171875,
"learning_rate": 0.0001508152881968795,
"loss": 2.5609,
"step": 1763
},
{
"epoch": 0.5035088540865272,
"grad_norm": 0.498046875,
"learning_rate": 0.00015067940785288135,
"loss": 2.6055,
"step": 1764
},
{
"epoch": 0.5037942899448529,
"grad_norm": 0.51171875,
"learning_rate": 0.0001505435269513482,
"loss": 2.597,
"step": 1765
},
{
"epoch": 0.5040797258031786,
"grad_norm": 0.458984375,
"learning_rate": 0.00015040764560378658,
"loss": 2.5936,
"step": 1766
},
{
"epoch": 0.5043651616615042,
"grad_norm": 0.578125,
"learning_rate": 0.00015027176392170326,
"loss": 2.5551,
"step": 1767
},
{
"epoch": 0.50465059751983,
"grad_norm": 0.51953125,
"learning_rate": 0.00015013588201660529,
"loss": 2.5881,
"step": 1768
},
{
"epoch": 0.5049360333781557,
"grad_norm": 0.515625,
"learning_rate": 0.00015,
"loss": 2.5998,
"step": 1769
},
{
"epoch": 0.5052214692364814,
"grad_norm": 0.451171875,
"learning_rate": 0.0001498641179833947,
"loss": 2.58,
"step": 1770
},
{
"epoch": 0.5055069050948071,
"grad_norm": 0.53515625,
"learning_rate": 0.00014972823607829674,
"loss": 2.5808,
"step": 1771
},
{
"epoch": 0.5057923409531327,
"grad_norm": 0.455078125,
"learning_rate": 0.00014959235439621343,
"loss": 2.575,
"step": 1772
},
{
"epoch": 0.5060777768114585,
"grad_norm": 0.5234375,
"learning_rate": 0.00014945647304865175,
"loss": 2.5957,
"step": 1773
},
{
"epoch": 0.5063632126697841,
"grad_norm": 0.5,
"learning_rate": 0.00014932059214711868,
"loss": 2.5831,
"step": 1774
},
{
"epoch": 0.5066486485281099,
"grad_norm": 0.59375,
"learning_rate": 0.00014918471180312053,
"loss": 2.5812,
"step": 1775
},
{
"epoch": 0.5069340843864355,
"grad_norm": 0.52734375,
"learning_rate": 0.0001490488321281633,
"loss": 2.5925,
"step": 1776
},
{
"epoch": 0.5072195202447612,
"grad_norm": 0.494140625,
"learning_rate": 0.00014891295323375244,
"loss": 2.5934,
"step": 1777
},
{
"epoch": 0.507504956103087,
"grad_norm": 0.5078125,
"learning_rate": 0.0001487770752313928,
"loss": 2.5923,
"step": 1778
},
{
"epoch": 0.5077903919614126,
"grad_norm": 0.466796875,
"learning_rate": 0.00014864119823258836,
"loss": 2.5811,
"step": 1779
},
{
"epoch": 0.5080758278197384,
"grad_norm": 0.490234375,
"learning_rate": 0.00014850532234884236,
"loss": 2.5726,
"step": 1780
},
{
"epoch": 0.508361263678064,
"grad_norm": 0.53515625,
"learning_rate": 0.00014836944769165716,
"loss": 2.57,
"step": 1781
},
{
"epoch": 0.5086466995363897,
"grad_norm": 0.51171875,
"learning_rate": 0.0001482335743725341,
"loss": 2.584,
"step": 1782
},
{
"epoch": 0.5089321353947154,
"grad_norm": 0.48828125,
"learning_rate": 0.00014809770250297336,
"loss": 2.5903,
"step": 1783
},
{
"epoch": 0.5092175712530411,
"grad_norm": 0.51171875,
"learning_rate": 0.000147961832194474,
"loss": 2.6009,
"step": 1784
},
{
"epoch": 0.5095030071113669,
"grad_norm": 0.478515625,
"learning_rate": 0.00014782596355853374,
"loss": 2.6057,
"step": 1785
},
{
"epoch": 0.5097884429696925,
"grad_norm": 0.49609375,
"learning_rate": 0.00014769009670664897,
"loss": 2.5661,
"step": 1786
},
{
"epoch": 0.5100738788280182,
"grad_norm": 0.447265625,
"learning_rate": 0.0001475542317503146,
"loss": 2.5986,
"step": 1787
},
{
"epoch": 0.5103593146863439,
"grad_norm": 0.5234375,
"learning_rate": 0.000147418368801024,
"loss": 2.5837,
"step": 1788
},
{
"epoch": 0.5106447505446696,
"grad_norm": 0.474609375,
"learning_rate": 0.0001472825079702688,
"loss": 2.5738,
"step": 1789
},
{
"epoch": 0.5109301864029953,
"grad_norm": 0.48828125,
"learning_rate": 0.0001471466493695391,
"loss": 2.5681,
"step": 1790
},
{
"epoch": 0.511215622261321,
"grad_norm": 0.466796875,
"learning_rate": 0.00014701079311032298,
"loss": 2.5817,
"step": 1791
},
{
"epoch": 0.5115010581196466,
"grad_norm": 0.48828125,
"learning_rate": 0.00014687493930410663,
"loss": 2.5813,
"step": 1792
},
{
"epoch": 0.5117864939779724,
"grad_norm": 0.478515625,
"learning_rate": 0.00014673908806237432,
"loss": 2.5893,
"step": 1793
},
{
"epoch": 0.5120719298362981,
"grad_norm": 0.498046875,
"learning_rate": 0.0001466032394966081,
"loss": 2.6104,
"step": 1794
},
{
"epoch": 0.5123573656946238,
"grad_norm": 0.5078125,
"learning_rate": 0.0001464673937182879,
"loss": 2.6105,
"step": 1795
},
{
"epoch": 0.5126428015529495,
"grad_norm": 0.494140625,
"learning_rate": 0.00014633155083889132,
"loss": 2.6015,
"step": 1796
},
{
"epoch": 0.5129282374112751,
"grad_norm": 0.5,
"learning_rate": 0.00014619571096989359,
"loss": 2.578,
"step": 1797
},
{
"epoch": 0.5132136732696009,
"grad_norm": 0.47265625,
"learning_rate": 0.00014605987422276756,
"loss": 2.5755,
"step": 1798
},
{
"epoch": 0.5134991091279265,
"grad_norm": 0.50390625,
"learning_rate": 0.00014592404070898335,
"loss": 2.5822,
"step": 1799
},
{
"epoch": 0.5137845449862523,
"grad_norm": 0.474609375,
"learning_rate": 0.00014578821054000854,
"loss": 2.5701,
"step": 1800
},
{
"epoch": 0.514069980844578,
"grad_norm": 0.51953125,
"learning_rate": 0.000145652383827308,
"loss": 2.5652,
"step": 1801
},
{
"epoch": 0.5143554167029036,
"grad_norm": 0.5078125,
"learning_rate": 0.00014551656068234362,
"loss": 2.5589,
"step": 1802
},
{
"epoch": 0.5146408525612294,
"grad_norm": 0.4609375,
"learning_rate": 0.00014538074121657447,
"loss": 2.5928,
"step": 1803
},
{
"epoch": 0.514926288419555,
"grad_norm": 0.48046875,
"learning_rate": 0.00014524492554145657,
"loss": 2.5787,
"step": 1804
},
{
"epoch": 0.5152117242778808,
"grad_norm": 0.474609375,
"learning_rate": 0.0001451091137684428,
"loss": 2.6031,
"step": 1805
},
{
"epoch": 0.5154971601362064,
"grad_norm": 0.478515625,
"learning_rate": 0.00014497330600898297,
"loss": 2.6,
"step": 1806
},
{
"epoch": 0.5157825959945321,
"grad_norm": 0.4609375,
"learning_rate": 0.0001448375023745235,
"loss": 2.5984,
"step": 1807
},
{
"epoch": 0.5160680318528578,
"grad_norm": 0.45703125,
"learning_rate": 0.00014470170297650734,
"loss": 2.5901,
"step": 1808
},
{
"epoch": 0.5163534677111835,
"grad_norm": 0.5078125,
"learning_rate": 0.00014456590792637407,
"loss": 2.555,
"step": 1809
},
{
"epoch": 0.5166389035695093,
"grad_norm": 0.4453125,
"learning_rate": 0.0001444301173355597,
"loss": 2.5745,
"step": 1810
},
{
"epoch": 0.5169243394278349,
"grad_norm": 0.4765625,
"learning_rate": 0.0001442943313154966,
"loss": 2.5377,
"step": 1811
},
{
"epoch": 0.5172097752861606,
"grad_norm": 0.455078125,
"learning_rate": 0.00014415854997761328,
"loss": 2.5617,
"step": 1812
},
{
"epoch": 0.5174952111444863,
"grad_norm": 0.46875,
"learning_rate": 0.0001440227734333344,
"loss": 2.5987,
"step": 1813
},
{
"epoch": 0.517780647002812,
"grad_norm": 0.44921875,
"learning_rate": 0.000143887001794081,
"loss": 2.5686,
"step": 1814
},
{
"epoch": 0.5180660828611376,
"grad_norm": 0.427734375,
"learning_rate": 0.00014375123517126968,
"loss": 2.5911,
"step": 1815
},
{
"epoch": 0.5183515187194634,
"grad_norm": 0.43359375,
"learning_rate": 0.00014361547367631317,
"loss": 2.5687,
"step": 1816
},
{
"epoch": 0.518636954577789,
"grad_norm": 0.447265625,
"learning_rate": 0.00014347971742061989,
"loss": 2.6098,
"step": 1817
},
{
"epoch": 0.5189223904361148,
"grad_norm": 0.474609375,
"learning_rate": 0.00014334396651559405,
"loss": 2.5648,
"step": 1818
},
{
"epoch": 0.5192078262944405,
"grad_norm": 0.40625,
"learning_rate": 0.00014320822107263532,
"loss": 2.583,
"step": 1819
},
{
"epoch": 0.5194932621527661,
"grad_norm": 0.50390625,
"learning_rate": 0.00014307248120313908,
"loss": 2.5763,
"step": 1820
},
{
"epoch": 0.5197786980110919,
"grad_norm": 0.44140625,
"learning_rate": 0.00014293674701849595,
"loss": 2.5835,
"step": 1821
},
{
"epoch": 0.5200641338694175,
"grad_norm": 0.478515625,
"learning_rate": 0.00014280101863009203,
"loss": 2.5738,
"step": 1822
},
{
"epoch": 0.5203495697277433,
"grad_norm": 0.447265625,
"learning_rate": 0.0001426652961493086,
"loss": 2.5956,
"step": 1823
},
{
"epoch": 0.5206350055860689,
"grad_norm": 0.5390625,
"learning_rate": 0.00014252957968752212,
"loss": 2.5553,
"step": 1824
},
{
"epoch": 0.5209204414443946,
"grad_norm": 0.484375,
"learning_rate": 0.00014239386935610405,
"loss": 2.5876,
"step": 1825
},
{
"epoch": 0.5212058773027204,
"grad_norm": 0.53515625,
"learning_rate": 0.00014225816526642086,
"loss": 2.592,
"step": 1826
},
{
"epoch": 0.521491313161046,
"grad_norm": 0.4609375,
"learning_rate": 0.00014212246752983392,
"loss": 2.5715,
"step": 1827
},
{
"epoch": 0.5217767490193718,
"grad_norm": 0.4765625,
"learning_rate": 0.00014198677625769937,
"loss": 2.5873,
"step": 1828
},
{
"epoch": 0.5220621848776974,
"grad_norm": 0.46875,
"learning_rate": 0.0001418510915613679,
"loss": 2.5964,
"step": 1829
},
{
"epoch": 0.5223476207360231,
"grad_norm": 0.470703125,
"learning_rate": 0.0001417154135521852,
"loss": 2.5588,
"step": 1830
},
{
"epoch": 0.5226330565943488,
"grad_norm": 0.478515625,
"learning_rate": 0.00014157974234149103,
"loss": 2.5652,
"step": 1831
},
{
"epoch": 0.5229184924526745,
"grad_norm": 0.46484375,
"learning_rate": 0.00014144407804061982,
"loss": 2.6088,
"step": 1832
},
{
"epoch": 0.5232039283110002,
"grad_norm": 0.494140625,
"learning_rate": 0.00014130842076090023,
"loss": 2.5847,
"step": 1833
},
{
"epoch": 0.5234893641693259,
"grad_norm": 0.439453125,
"learning_rate": 0.0001411727706136552,
"loss": 2.5664,
"step": 1834
},
{
"epoch": 0.5237748000276516,
"grad_norm": 0.458984375,
"learning_rate": 0.00014103712771020187,
"loss": 2.5667,
"step": 1835
},
{
"epoch": 0.5240602358859773,
"grad_norm": 0.447265625,
"learning_rate": 0.00014090149216185123,
"loss": 2.5789,
"step": 1836
},
{
"epoch": 0.524345671744303,
"grad_norm": 0.55859375,
"learning_rate": 0.00014076586407990856,
"loss": 2.5775,
"step": 1837
},
{
"epoch": 0.5246311076026287,
"grad_norm": 0.48046875,
"learning_rate": 0.00014063024357567275,
"loss": 2.5817,
"step": 1838
},
{
"epoch": 0.5249165434609544,
"grad_norm": 0.453125,
"learning_rate": 0.00014049463076043652,
"loss": 2.6099,
"step": 1839
},
{
"epoch": 0.52520197931928,
"grad_norm": 0.453125,
"learning_rate": 0.00014035902574548637,
"loss": 2.5589,
"step": 1840
},
{
"epoch": 0.5254874151776058,
"grad_norm": 0.44921875,
"learning_rate": 0.00014022342864210234,
"loss": 2.5884,
"step": 1841
},
{
"epoch": 0.5257728510359315,
"grad_norm": 0.458984375,
"learning_rate": 0.00014008783956155797,
"loss": 2.606,
"step": 1842
},
{
"epoch": 0.5260582868942572,
"grad_norm": 0.474609375,
"learning_rate": 0.0001399522586151202,
"loss": 2.5597,
"step": 1843
},
{
"epoch": 0.5263437227525829,
"grad_norm": 0.478515625,
"learning_rate": 0.00013981668591404932,
"loss": 2.5987,
"step": 1844
},
{
"epoch": 0.5266291586109085,
"grad_norm": 0.48046875,
"learning_rate": 0.00013968112156959893,
"loss": 2.5708,
"step": 1845
},
{
"epoch": 0.5269145944692343,
"grad_norm": 0.43359375,
"learning_rate": 0.00013954556569301563,
"loss": 2.5932,
"step": 1846
},
{
"epoch": 0.5272000303275599,
"grad_norm": 0.478515625,
"learning_rate": 0.0001394100183955392,
"loss": 2.6022,
"step": 1847
},
{
"epoch": 0.5274854661858857,
"grad_norm": 0.43359375,
"learning_rate": 0.00013927447978840225,
"loss": 2.5497,
"step": 1848
},
{
"epoch": 0.5277709020442113,
"grad_norm": 0.515625,
"learning_rate": 0.00013913894998283038,
"loss": 2.5742,
"step": 1849
},
{
"epoch": 0.528056337902537,
"grad_norm": 0.486328125,
"learning_rate": 0.00013900342909004188,
"loss": 2.624,
"step": 1850
},
{
"epoch": 0.5283417737608628,
"grad_norm": 0.5,
"learning_rate": 0.00013886791722124783,
"loss": 2.5814,
"step": 1851
},
{
"epoch": 0.5286272096191884,
"grad_norm": 0.44921875,
"learning_rate": 0.00013873241448765167,
"loss": 2.5622,
"step": 1852
},
{
"epoch": 0.5289126454775142,
"grad_norm": 0.474609375,
"learning_rate": 0.00013859692100044973,
"loss": 2.5673,
"step": 1853
},
{
"epoch": 0.5291980813358398,
"grad_norm": 0.4765625,
"learning_rate": 0.00013846143687083043,
"loss": 2.5758,
"step": 1854
},
{
"epoch": 0.5294835171941655,
"grad_norm": 0.4765625,
"learning_rate": 0.00013832596220997458,
"loss": 2.5934,
"step": 1855
},
{
"epoch": 0.5297689530524912,
"grad_norm": 0.455078125,
"learning_rate": 0.0001381904971290553,
"loss": 2.5529,
"step": 1856
},
{
"epoch": 0.5300543889108169,
"grad_norm": 0.447265625,
"learning_rate": 0.00013805504173923776,
"loss": 2.5794,
"step": 1857
},
{
"epoch": 0.5303398247691427,
"grad_norm": 0.466796875,
"learning_rate": 0.0001379195961516793,
"loss": 2.5519,
"step": 1858
},
{
"epoch": 0.5306252606274683,
"grad_norm": 0.482421875,
"learning_rate": 0.00013778416047752903,
"loss": 2.5965,
"step": 1859
},
{
"epoch": 0.530910696485794,
"grad_norm": 0.455078125,
"learning_rate": 0.0001376487348279281,
"loss": 2.5725,
"step": 1860
},
{
"epoch": 0.5311961323441197,
"grad_norm": 0.484375,
"learning_rate": 0.0001375133193140093,
"loss": 2.5638,
"step": 1861
},
{
"epoch": 0.5314815682024454,
"grad_norm": 0.46875,
"learning_rate": 0.00013737791404689728,
"loss": 2.5935,
"step": 1862
},
{
"epoch": 0.531767004060771,
"grad_norm": 0.470703125,
"learning_rate": 0.00013724251913770807,
"loss": 2.6033,
"step": 1863
},
{
"epoch": 0.5320524399190968,
"grad_norm": 0.44921875,
"learning_rate": 0.00013710713469754934,
"loss": 2.5982,
"step": 1864
},
{
"epoch": 0.5323378757774224,
"grad_norm": 0.5078125,
"learning_rate": 0.00013697176083752008,
"loss": 2.5374,
"step": 1865
},
{
"epoch": 0.5326233116357482,
"grad_norm": 0.443359375,
"learning_rate": 0.0001368363976687107,
"loss": 2.5623,
"step": 1866
},
{
"epoch": 0.5329087474940739,
"grad_norm": 0.494140625,
"learning_rate": 0.00013670104530220275,
"loss": 2.574,
"step": 1867
},
{
"epoch": 0.5331941833523995,
"grad_norm": 0.45703125,
"learning_rate": 0.0001365657038490689,
"loss": 2.5917,
"step": 1868
},
{
"epoch": 0.5334796192107253,
"grad_norm": 0.490234375,
"learning_rate": 0.000136430373420373,
"loss": 2.5844,
"step": 1869
},
{
"epoch": 0.5337650550690509,
"grad_norm": 0.419921875,
"learning_rate": 0.00013629505412716974,
"loss": 2.6019,
"step": 1870
},
{
"epoch": 0.5340504909273767,
"grad_norm": 0.478515625,
"learning_rate": 0.0001361597460805047,
"loss": 2.5718,
"step": 1871
},
{
"epoch": 0.5343359267857023,
"grad_norm": 0.46484375,
"learning_rate": 0.0001360244493914142,
"loss": 2.5665,
"step": 1872
},
{
"epoch": 0.534621362644028,
"grad_norm": 0.45703125,
"learning_rate": 0.0001358891641709252,
"loss": 2.5814,
"step": 1873
},
{
"epoch": 0.5349067985023538,
"grad_norm": 0.478515625,
"learning_rate": 0.00013575389053005547,
"loss": 2.5467,
"step": 1874
},
{
"epoch": 0.5351922343606794,
"grad_norm": 0.66015625,
"learning_rate": 0.00013561862857981304,
"loss": 2.5697,
"step": 1875
},
{
"epoch": 0.5354776702190052,
"grad_norm": 0.55078125,
"learning_rate": 0.00013548337843119634,
"loss": 2.5856,
"step": 1876
},
{
"epoch": 0.5357631060773308,
"grad_norm": 0.5625,
"learning_rate": 0.00013534814019519438,
"loss": 2.5662,
"step": 1877
},
{
"epoch": 0.5360485419356565,
"grad_norm": 0.5625,
"learning_rate": 0.00013521291398278608,
"loss": 2.5983,
"step": 1878
},
{
"epoch": 0.5363339777939822,
"grad_norm": 0.57421875,
"learning_rate": 0.00013507769990494072,
"loss": 2.5893,
"step": 1879
},
{
"epoch": 0.5366194136523079,
"grad_norm": 0.671875,
"learning_rate": 0.00013494249807261748,
"loss": 2.5852,
"step": 1880
},
{
"epoch": 0.5369048495106336,
"grad_norm": 0.55078125,
"learning_rate": 0.00013480730859676557,
"loss": 2.5667,
"step": 1881
},
{
"epoch": 0.5371902853689593,
"grad_norm": 0.82421875,
"learning_rate": 0.00013467213158832402,
"loss": 2.5674,
"step": 1882
},
{
"epoch": 0.537475721227285,
"grad_norm": 0.5078125,
"learning_rate": 0.00013453696715822163,
"loss": 2.5955,
"step": 1883
},
{
"epoch": 0.5377611570856107,
"grad_norm": 0.67578125,
"learning_rate": 0.0001344018154173769,
"loss": 2.5681,
"step": 1884
},
{
"epoch": 0.5380465929439364,
"grad_norm": 0.55859375,
"learning_rate": 0.00013426667647669795,
"loss": 2.6069,
"step": 1885
},
{
"epoch": 0.538332028802262,
"grad_norm": 0.609375,
"learning_rate": 0.00013413155044708232,
"loss": 2.5682,
"step": 1886
},
{
"epoch": 0.5386174646605878,
"grad_norm": 0.53125,
"learning_rate": 0.00013399643743941701,
"loss": 2.5783,
"step": 1887
},
{
"epoch": 0.5389029005189134,
"grad_norm": 0.59375,
"learning_rate": 0.0001338613375645783,
"loss": 2.5545,
"step": 1888
},
{
"epoch": 0.5391883363772392,
"grad_norm": 0.57421875,
"learning_rate": 0.00013372625093343167,
"loss": 2.5683,
"step": 1889
},
{
"epoch": 0.5394737722355648,
"grad_norm": 0.52734375,
"learning_rate": 0.00013359117765683183,
"loss": 2.5635,
"step": 1890
},
{
"epoch": 0.5397592080938906,
"grad_norm": 0.546875,
"learning_rate": 0.00013345611784562245,
"loss": 2.5851,
"step": 1891
},
{
"epoch": 0.5400446439522163,
"grad_norm": 0.578125,
"learning_rate": 0.0001333210716106361,
"loss": 2.5822,
"step": 1892
},
{
"epoch": 0.5403300798105419,
"grad_norm": 0.46484375,
"learning_rate": 0.00013318603906269436,
"loss": 2.587,
"step": 1893
},
{
"epoch": 0.5406155156688677,
"grad_norm": 0.62890625,
"learning_rate": 0.00013305102031260755,
"loss": 2.5887,
"step": 1894
},
{
"epoch": 0.5409009515271933,
"grad_norm": 0.443359375,
"learning_rate": 0.00013291601547117448,
"loss": 2.5895,
"step": 1895
},
{
"epoch": 0.541186387385519,
"grad_norm": 0.56640625,
"learning_rate": 0.00013278102464918276,
"loss": 2.5535,
"step": 1896
},
{
"epoch": 0.5414718232438447,
"grad_norm": 0.447265625,
"learning_rate": 0.00013264604795740838,
"loss": 2.5836,
"step": 1897
},
{
"epoch": 0.5417572591021704,
"grad_norm": 0.5390625,
"learning_rate": 0.00013251108550661585,
"loss": 2.5933,
"step": 1898
},
{
"epoch": 0.5420426949604962,
"grad_norm": 0.45703125,
"learning_rate": 0.0001323761374075578,
"loss": 2.5745,
"step": 1899
},
{
"epoch": 0.5423281308188218,
"grad_norm": 0.490234375,
"learning_rate": 0.0001322412037709752,
"loss": 2.5632,
"step": 1900
},
{
"epoch": 0.5426135666771476,
"grad_norm": 0.5,
"learning_rate": 0.00013210628470759726,
"loss": 2.5525,
"step": 1901
},
{
"epoch": 0.5428990025354732,
"grad_norm": 0.5078125,
"learning_rate": 0.000131971380328141,
"loss": 2.6075,
"step": 1902
},
{
"epoch": 0.5431844383937989,
"grad_norm": 0.447265625,
"learning_rate": 0.0001318364907433116,
"loss": 2.5948,
"step": 1903
},
{
"epoch": 0.5434698742521246,
"grad_norm": 0.53125,
"learning_rate": 0.00013170161606380204,
"loss": 2.6039,
"step": 1904
},
{
"epoch": 0.5437553101104503,
"grad_norm": 0.453125,
"learning_rate": 0.00013156675640029289,
"loss": 2.5849,
"step": 1905
},
{
"epoch": 0.5440407459687759,
"grad_norm": 0.546875,
"learning_rate": 0.00013143191186345266,
"loss": 2.5805,
"step": 1906
},
{
"epoch": 0.5443261818271017,
"grad_norm": 0.431640625,
"learning_rate": 0.00013129708256393724,
"loss": 2.5466,
"step": 1907
},
{
"epoch": 0.5446116176854274,
"grad_norm": 0.515625,
"learning_rate": 0.00013116226861239019,
"loss": 2.5889,
"step": 1908
},
{
"epoch": 0.5448970535437531,
"grad_norm": 0.45703125,
"learning_rate": 0.00013102747011944238,
"loss": 2.5744,
"step": 1909
},
{
"epoch": 0.5451824894020788,
"grad_norm": 0.484375,
"learning_rate": 0.000130892687195712,
"loss": 2.5408,
"step": 1910
},
{
"epoch": 0.5454679252604044,
"grad_norm": 0.470703125,
"learning_rate": 0.00013075791995180447,
"loss": 2.5915,
"step": 1911
},
{
"epoch": 0.5457533611187302,
"grad_norm": 0.439453125,
"learning_rate": 0.00013062316849831232,
"loss": 2.5739,
"step": 1912
},
{
"epoch": 0.5460387969770558,
"grad_norm": 0.458984375,
"learning_rate": 0.00013048843294581516,
"loss": 2.5662,
"step": 1913
},
{
"epoch": 0.5463242328353816,
"grad_norm": 0.447265625,
"learning_rate": 0.00013035371340487954,
"loss": 2.5486,
"step": 1914
},
{
"epoch": 0.5466096686937073,
"grad_norm": 0.47265625,
"learning_rate": 0.00013021900998605885,
"loss": 2.5508,
"step": 1915
},
{
"epoch": 0.5468951045520329,
"grad_norm": 0.45703125,
"learning_rate": 0.0001300843227998933,
"loss": 2.5886,
"step": 1916
},
{
"epoch": 0.5471805404103587,
"grad_norm": 0.455078125,
"learning_rate": 0.00012994965195690976,
"loss": 2.5568,
"step": 1917
},
{
"epoch": 0.5474659762686843,
"grad_norm": 0.443359375,
"learning_rate": 0.0001298149975676216,
"loss": 2.5776,
"step": 1918
},
{
"epoch": 0.5477514121270101,
"grad_norm": 0.4296875,
"learning_rate": 0.0001296803597425288,
"loss": 2.5829,
"step": 1919
},
{
"epoch": 0.5480368479853357,
"grad_norm": 0.455078125,
"learning_rate": 0.00012954573859211773,
"loss": 2.5828,
"step": 1920
},
{
"epoch": 0.5483222838436614,
"grad_norm": 0.408203125,
"learning_rate": 0.00012941113422686108,
"loss": 2.5825,
"step": 1921
},
{
"epoch": 0.5486077197019871,
"grad_norm": 0.474609375,
"learning_rate": 0.0001292765467572177,
"loss": 2.5706,
"step": 1922
},
{
"epoch": 0.5488931555603128,
"grad_norm": 0.435546875,
"learning_rate": 0.00012914197629363257,
"loss": 2.546,
"step": 1923
},
{
"epoch": 0.5491785914186386,
"grad_norm": 0.4609375,
"learning_rate": 0.00012900742294653684,
"loss": 2.6005,
"step": 1924
},
{
"epoch": 0.5494640272769642,
"grad_norm": 0.494140625,
"learning_rate": 0.0001288728868263475,
"loss": 2.5664,
"step": 1925
},
{
"epoch": 0.5497494631352899,
"grad_norm": 0.439453125,
"learning_rate": 0.00012873836804346746,
"loss": 2.5662,
"step": 1926
},
{
"epoch": 0.5500348989936156,
"grad_norm": 0.486328125,
"learning_rate": 0.00012860386670828538,
"loss": 2.5691,
"step": 1927
},
{
"epoch": 0.5503203348519413,
"grad_norm": 0.458984375,
"learning_rate": 0.0001284693829311756,
"loss": 2.556,
"step": 1928
},
{
"epoch": 0.550605770710267,
"grad_norm": 0.494140625,
"learning_rate": 0.00012833491682249802,
"loss": 2.5723,
"step": 1929
},
{
"epoch": 0.5508912065685927,
"grad_norm": 0.439453125,
"learning_rate": 0.0001282004684925981,
"loss": 2.5932,
"step": 1930
},
{
"epoch": 0.5511766424269184,
"grad_norm": 0.52734375,
"learning_rate": 0.00012806603805180666,
"loss": 2.5586,
"step": 1931
},
{
"epoch": 0.5514620782852441,
"grad_norm": 0.4453125,
"learning_rate": 0.00012793162561043994,
"loss": 2.6137,
"step": 1932
},
{
"epoch": 0.5517475141435698,
"grad_norm": 0.474609375,
"learning_rate": 0.0001277972312787993,
"loss": 2.5864,
"step": 1933
},
{
"epoch": 0.5520329500018955,
"grad_norm": 0.44140625,
"learning_rate": 0.0001276628551671713,
"loss": 2.5684,
"step": 1934
},
{
"epoch": 0.5523183858602212,
"grad_norm": 0.470703125,
"learning_rate": 0.00012752849738582745,
"loss": 2.5812,
"step": 1935
},
{
"epoch": 0.5526038217185468,
"grad_norm": 0.44921875,
"learning_rate": 0.0001273941580450243,
"loss": 2.5645,
"step": 1936
},
{
"epoch": 0.5528892575768726,
"grad_norm": 0.49609375,
"learning_rate": 0.00012725983725500332,
"loss": 2.5597,
"step": 1937
},
{
"epoch": 0.5531746934351982,
"grad_norm": 0.43359375,
"learning_rate": 0.0001271255351259907,
"loss": 2.5787,
"step": 1938
},
{
"epoch": 0.553460129293524,
"grad_norm": 0.466796875,
"learning_rate": 0.00012699125176819716,
"loss": 2.5669,
"step": 1939
},
{
"epoch": 0.5537455651518497,
"grad_norm": 0.78125,
"learning_rate": 0.00012685698729181837,
"loss": 2.5653,
"step": 1940
},
{
"epoch": 0.5540310010101753,
"grad_norm": 0.48828125,
"learning_rate": 0.0001267227418070342,
"loss": 2.5713,
"step": 1941
},
{
"epoch": 0.5543164368685011,
"grad_norm": 0.46484375,
"learning_rate": 0.00012658851542400907,
"loss": 2.5643,
"step": 1942
},
{
"epoch": 0.5546018727268267,
"grad_norm": 0.431640625,
"learning_rate": 0.00012645430825289163,
"loss": 2.5536,
"step": 1943
},
{
"epoch": 0.5548873085851525,
"grad_norm": 0.53515625,
"learning_rate": 0.00012632012040381493,
"loss": 2.5869,
"step": 1944
},
{
"epoch": 0.5551727444434781,
"grad_norm": 0.44921875,
"learning_rate": 0.00012618595198689596,
"loss": 2.5626,
"step": 1945
},
{
"epoch": 0.5554581803018038,
"grad_norm": 0.484375,
"learning_rate": 0.0001260518031122359,
"loss": 2.5907,
"step": 1946
},
{
"epoch": 0.5557436161601295,
"grad_norm": 0.431640625,
"learning_rate": 0.00012591767388991985,
"loss": 2.5852,
"step": 1947
},
{
"epoch": 0.5560290520184552,
"grad_norm": 0.458984375,
"learning_rate": 0.00012578356443001683,
"loss": 2.557,
"step": 1948
},
{
"epoch": 0.556314487876781,
"grad_norm": 0.453125,
"learning_rate": 0.0001256494748425796,
"loss": 2.581,
"step": 1949
},
{
"epoch": 0.5565999237351066,
"grad_norm": 0.451171875,
"learning_rate": 0.00012551540523764458,
"loss": 2.5861,
"step": 1950
},
{
"epoch": 0.5568853595934323,
"grad_norm": 0.49609375,
"learning_rate": 0.00012538135572523183,
"loss": 2.5701,
"step": 1951
},
{
"epoch": 0.557170795451758,
"grad_norm": 0.482421875,
"learning_rate": 0.00012524732641534496,
"loss": 2.5348,
"step": 1952
},
{
"epoch": 0.5574562313100837,
"grad_norm": 0.458984375,
"learning_rate": 0.00012511331741797092,
"loss": 2.5597,
"step": 1953
},
{
"epoch": 0.5577416671684093,
"grad_norm": 0.4921875,
"learning_rate": 0.00012497932884308002,
"loss": 2.5808,
"step": 1954
},
{
"epoch": 0.5580271030267351,
"grad_norm": 0.439453125,
"learning_rate": 0.00012484536080062581,
"loss": 2.5469,
"step": 1955
},
{
"epoch": 0.5583125388850608,
"grad_norm": 0.54296875,
"learning_rate": 0.00012471141340054508,
"loss": 2.5758,
"step": 1956
},
{
"epoch": 0.5585979747433865,
"grad_norm": 0.43359375,
"learning_rate": 0.00012457748675275763,
"loss": 2.5819,
"step": 1957
},
{
"epoch": 0.5588834106017122,
"grad_norm": 0.494140625,
"learning_rate": 0.00012444358096716607,
"loss": 2.5616,
"step": 1958
},
{
"epoch": 0.5591688464600378,
"grad_norm": 0.43359375,
"learning_rate": 0.0001243096961536561,
"loss": 2.5502,
"step": 1959
},
{
"epoch": 0.5594542823183636,
"grad_norm": 0.421875,
"learning_rate": 0.00012417583242209612,
"loss": 2.5667,
"step": 1960
},
{
"epoch": 0.5597397181766892,
"grad_norm": 0.478515625,
"learning_rate": 0.00012404198988233729,
"loss": 2.5661,
"step": 1961
},
{
"epoch": 0.560025154035015,
"grad_norm": 0.447265625,
"learning_rate": 0.00012390816864421325,
"loss": 2.5755,
"step": 1962
},
{
"epoch": 0.5603105898933406,
"grad_norm": 0.466796875,
"learning_rate": 0.00012377436881754025,
"loss": 2.5679,
"step": 1963
},
{
"epoch": 0.5605960257516663,
"grad_norm": 0.4296875,
"learning_rate": 0.00012364059051211707,
"loss": 2.5471,
"step": 1964
},
{
"epoch": 0.5608814616099921,
"grad_norm": 0.455078125,
"learning_rate": 0.00012350683383772462,
"loss": 2.5443,
"step": 1965
},
{
"epoch": 0.5611668974683177,
"grad_norm": 0.46875,
"learning_rate": 0.00012337309890412618,
"loss": 2.5963,
"step": 1966
},
{
"epoch": 0.5614523333266435,
"grad_norm": 0.443359375,
"learning_rate": 0.00012323938582106724,
"loss": 2.5735,
"step": 1967
},
{
"epoch": 0.5617377691849691,
"grad_norm": 0.48046875,
"learning_rate": 0.00012310569469827518,
"loss": 2.5885,
"step": 1968
},
{
"epoch": 0.5620232050432948,
"grad_norm": 0.458984375,
"learning_rate": 0.00012297202564545953,
"loss": 2.5558,
"step": 1969
},
{
"epoch": 0.5623086409016205,
"grad_norm": 0.419921875,
"learning_rate": 0.0001228383787723116,
"loss": 2.5914,
"step": 1970
},
{
"epoch": 0.5625940767599462,
"grad_norm": 0.458984375,
"learning_rate": 0.0001227047541885046,
"loss": 2.5518,
"step": 1971
},
{
"epoch": 0.562879512618272,
"grad_norm": 0.431640625,
"learning_rate": 0.00012257115200369338,
"loss": 2.541,
"step": 1972
},
{
"epoch": 0.5631649484765976,
"grad_norm": 0.4453125,
"learning_rate": 0.0001224375723275144,
"loss": 2.5672,
"step": 1973
},
{
"epoch": 0.5634503843349233,
"grad_norm": 0.4140625,
"learning_rate": 0.00012230401526958578,
"loss": 2.579,
"step": 1974
},
{
"epoch": 0.563735820193249,
"grad_norm": 0.431640625,
"learning_rate": 0.0001221704809395068,
"loss": 2.5442,
"step": 1975
},
{
"epoch": 0.5640212560515747,
"grad_norm": 0.447265625,
"learning_rate": 0.00012203696944685838,
"loss": 2.582,
"step": 1976
},
{
"epoch": 0.5643066919099003,
"grad_norm": 0.41015625,
"learning_rate": 0.00012190348090120253,
"loss": 2.5607,
"step": 1977
},
{
"epoch": 0.5645921277682261,
"grad_norm": 0.41796875,
"learning_rate": 0.00012177001541208247,
"loss": 2.5668,
"step": 1978
},
{
"epoch": 0.5648775636265517,
"grad_norm": 0.423828125,
"learning_rate": 0.00012163657308902254,
"loss": 2.5663,
"step": 1979
},
{
"epoch": 0.5651629994848775,
"grad_norm": 0.40625,
"learning_rate": 0.00012150315404152809,
"loss": 2.575,
"step": 1980
},
{
"epoch": 0.5654484353432032,
"grad_norm": 0.458984375,
"learning_rate": 0.00012136975837908521,
"loss": 2.5806,
"step": 1981
},
{
"epoch": 0.5657338712015288,
"grad_norm": 0.43359375,
"learning_rate": 0.00012123638621116096,
"loss": 2.5632,
"step": 1982
},
{
"epoch": 0.5660193070598546,
"grad_norm": 0.451171875,
"learning_rate": 0.00012110303764720305,
"loss": 2.5993,
"step": 1983
},
{
"epoch": 0.5663047429181802,
"grad_norm": 0.42578125,
"learning_rate": 0.00012096971279663991,
"loss": 2.5778,
"step": 1984
},
{
"epoch": 0.566590178776506,
"grad_norm": 0.462890625,
"learning_rate": 0.00012083641176888034,
"loss": 2.5656,
"step": 1985
},
{
"epoch": 0.5668756146348316,
"grad_norm": 0.419921875,
"learning_rate": 0.00012070313467331368,
"loss": 2.5657,
"step": 1986
},
{
"epoch": 0.5671610504931573,
"grad_norm": 0.427734375,
"learning_rate": 0.00012056988161930973,
"loss": 2.5606,
"step": 1987
},
{
"epoch": 0.5674464863514831,
"grad_norm": 0.44140625,
"learning_rate": 0.00012043665271621843,
"loss": 2.5621,
"step": 1988
},
{
"epoch": 0.5677319222098087,
"grad_norm": 0.455078125,
"learning_rate": 0.00012030344807336993,
"loss": 2.5528,
"step": 1989
},
{
"epoch": 0.5680173580681345,
"grad_norm": 0.416015625,
"learning_rate": 0.00012017026780007452,
"loss": 2.5568,
"step": 1990
},
{
"epoch": 0.5683027939264601,
"grad_norm": 0.46484375,
"learning_rate": 0.00012003711200562242,
"loss": 2.5495,
"step": 1991
},
{
"epoch": 0.5685882297847858,
"grad_norm": 0.412109375,
"learning_rate": 0.00011990398079928378,
"loss": 2.5533,
"step": 1992
},
{
"epoch": 0.5688736656431115,
"grad_norm": 0.447265625,
"learning_rate": 0.00011977087429030862,
"loss": 2.55,
"step": 1993
},
{
"epoch": 0.5691591015014372,
"grad_norm": 0.50390625,
"learning_rate": 0.00011963779258792664,
"loss": 2.5533,
"step": 1994
},
{
"epoch": 0.5694445373597629,
"grad_norm": 0.453125,
"learning_rate": 0.00011950473580134723,
"loss": 2.567,
"step": 1995
},
{
"epoch": 0.5697299732180886,
"grad_norm": 0.50390625,
"learning_rate": 0.00011937170403975933,
"loss": 2.5419,
"step": 1996
},
{
"epoch": 0.5700154090764143,
"grad_norm": 0.42578125,
"learning_rate": 0.00011923869741233131,
"loss": 2.56,
"step": 1997
},
{
"epoch": 0.57030084493474,
"grad_norm": 0.486328125,
"learning_rate": 0.00011910571602821089,
"loss": 2.571,
"step": 1998
},
{
"epoch": 0.5705862807930657,
"grad_norm": 0.40625,
"learning_rate": 0.00011897275999652513,
"loss": 2.5794,
"step": 1999
},
{
"epoch": 0.5708717166513914,
"grad_norm": 0.455078125,
"learning_rate": 0.00011883982942638028,
"loss": 2.5708,
"step": 2000
},
{
"epoch": 0.5708717166513914,
"eval_loss": 2.470252513885498,
"eval_runtime": 5925.0122,
"eval_samples_per_second": 10.85,
"eval_steps_per_second": 10.85,
"step": 2000
},
{
"epoch": 0.5711571525097171,
"grad_norm": 0.435546875,
"learning_rate": 0.00011870692442686172,
"loss": 2.5898,
"step": 2001
},
{
"epoch": 0.5714425883680427,
"grad_norm": 0.423828125,
"learning_rate": 0.00011857404510703366,
"loss": 2.5845,
"step": 2002
},
{
"epoch": 0.5717280242263685,
"grad_norm": 0.5,
"learning_rate": 0.0001184411915759396,
"loss": 2.5365,
"step": 2003
},
{
"epoch": 0.5720134600846942,
"grad_norm": 0.4140625,
"learning_rate": 0.00011830836394260153,
"loss": 2.562,
"step": 2004
},
{
"epoch": 0.5722988959430199,
"grad_norm": 0.4453125,
"learning_rate": 0.00011817556231602037,
"loss": 2.5718,
"step": 2005
},
{
"epoch": 0.5725843318013456,
"grad_norm": 0.416015625,
"learning_rate": 0.00011804278680517561,
"loss": 2.5428,
"step": 2006
},
{
"epoch": 0.5728697676596712,
"grad_norm": 0.439453125,
"learning_rate": 0.00011791003751902542,
"loss": 2.5839,
"step": 2007
},
{
"epoch": 0.573155203517997,
"grad_norm": 0.4609375,
"learning_rate": 0.00011777731456650629,
"loss": 2.5791,
"step": 2008
},
{
"epoch": 0.5734406393763226,
"grad_norm": 0.43359375,
"learning_rate": 0.00011764461805653324,
"loss": 2.5559,
"step": 2009
},
{
"epoch": 0.5737260752346484,
"grad_norm": 0.484375,
"learning_rate": 0.00011751194809799949,
"loss": 2.5588,
"step": 2010
},
{
"epoch": 0.574011511092974,
"grad_norm": 0.47265625,
"learning_rate": 0.00011737930479977658,
"loss": 2.597,
"step": 2011
},
{
"epoch": 0.5742969469512997,
"grad_norm": 0.474609375,
"learning_rate": 0.00011724668827071413,
"loss": 2.5619,
"step": 2012
},
{
"epoch": 0.5745823828096255,
"grad_norm": 0.458984375,
"learning_rate": 0.00011711409861963971,
"loss": 2.5595,
"step": 2013
},
{
"epoch": 0.5748678186679511,
"grad_norm": 0.478515625,
"learning_rate": 0.00011698153595535897,
"loss": 2.5641,
"step": 2014
},
{
"epoch": 0.5751532545262769,
"grad_norm": 0.435546875,
"learning_rate": 0.0001168490003866553,
"loss": 2.5707,
"step": 2015
},
{
"epoch": 0.5754386903846025,
"grad_norm": 0.490234375,
"learning_rate": 0.00011671649202228988,
"loss": 2.5486,
"step": 2016
},
{
"epoch": 0.5757241262429282,
"grad_norm": 0.453125,
"learning_rate": 0.00011658401097100161,
"loss": 2.5753,
"step": 2017
},
{
"epoch": 0.5760095621012539,
"grad_norm": 0.50390625,
"learning_rate": 0.0001164515573415069,
"loss": 2.5995,
"step": 2018
},
{
"epoch": 0.5762949979595796,
"grad_norm": 0.4609375,
"learning_rate": 0.00011631913124249981,
"loss": 2.587,
"step": 2019
},
{
"epoch": 0.5765804338179052,
"grad_norm": 0.439453125,
"learning_rate": 0.00011618673278265168,
"loss": 2.5885,
"step": 2020
},
{
"epoch": 0.576865869676231,
"grad_norm": 0.435546875,
"learning_rate": 0.00011605436207061112,
"loss": 2.5741,
"step": 2021
},
{
"epoch": 0.5771513055345567,
"grad_norm": 0.431640625,
"learning_rate": 0.00011592201921500408,
"loss": 2.5782,
"step": 2022
},
{
"epoch": 0.5774367413928824,
"grad_norm": 0.42578125,
"learning_rate": 0.00011578970432443364,
"loss": 2.5819,
"step": 2023
},
{
"epoch": 0.5777221772512081,
"grad_norm": 0.427734375,
"learning_rate": 0.00011565741750747992,
"loss": 2.5745,
"step": 2024
},
{
"epoch": 0.5780076131095337,
"grad_norm": 0.455078125,
"learning_rate": 0.00011552515887269992,
"loss": 2.5694,
"step": 2025
},
{
"epoch": 0.5782930489678595,
"grad_norm": 0.416015625,
"learning_rate": 0.00011539292852862757,
"loss": 2.5542,
"step": 2026
},
{
"epoch": 0.5785784848261851,
"grad_norm": 0.396484375,
"learning_rate": 0.0001152607265837737,
"loss": 2.5776,
"step": 2027
},
{
"epoch": 0.5788639206845109,
"grad_norm": 0.431640625,
"learning_rate": 0.00011512855314662566,
"loss": 2.555,
"step": 2028
},
{
"epoch": 0.5791493565428366,
"grad_norm": 0.71484375,
"learning_rate": 0.00011499640832564749,
"loss": 2.5699,
"step": 2029
},
{
"epoch": 0.5794347924011622,
"grad_norm": 0.44140625,
"learning_rate": 0.00011486429222927976,
"loss": 2.5698,
"step": 2030
},
{
"epoch": 0.579720228259488,
"grad_norm": 0.427734375,
"learning_rate": 0.00011473220496593937,
"loss": 2.546,
"step": 2031
},
{
"epoch": 0.5800056641178136,
"grad_norm": 0.439453125,
"learning_rate": 0.0001146001466440197,
"loss": 2.563,
"step": 2032
},
{
"epoch": 0.5802910999761394,
"grad_norm": 0.4296875,
"learning_rate": 0.00011446811737189029,
"loss": 2.5682,
"step": 2033
},
{
"epoch": 0.580576535834465,
"grad_norm": 0.44921875,
"learning_rate": 0.0001143361172578968,
"loss": 2.5643,
"step": 2034
},
{
"epoch": 0.5808619716927907,
"grad_norm": 0.416015625,
"learning_rate": 0.00011420414641036111,
"loss": 2.5385,
"step": 2035
},
{
"epoch": 0.5811474075511164,
"grad_norm": 0.453125,
"learning_rate": 0.00011407220493758099,
"loss": 2.5788,
"step": 2036
},
{
"epoch": 0.5814328434094421,
"grad_norm": 0.4375,
"learning_rate": 0.00011394029294783011,
"loss": 2.5717,
"step": 2037
},
{
"epoch": 0.5817182792677679,
"grad_norm": 0.46484375,
"learning_rate": 0.00011380841054935789,
"loss": 2.595,
"step": 2038
},
{
"epoch": 0.5820037151260935,
"grad_norm": 0.484375,
"learning_rate": 0.00011367655785038957,
"loss": 2.5678,
"step": 2039
},
{
"epoch": 0.5822891509844192,
"grad_norm": 0.427734375,
"learning_rate": 0.00011354473495912596,
"loss": 2.5785,
"step": 2040
},
{
"epoch": 0.5825745868427449,
"grad_norm": 0.4453125,
"learning_rate": 0.00011341294198374341,
"loss": 2.5803,
"step": 2041
},
{
"epoch": 0.5828600227010706,
"grad_norm": 0.451171875,
"learning_rate": 0.00011328117903239376,
"loss": 2.5802,
"step": 2042
},
{
"epoch": 0.5831454585593963,
"grad_norm": 0.44140625,
"learning_rate": 0.00011314944621320421,
"loss": 2.5512,
"step": 2043
},
{
"epoch": 0.583430894417722,
"grad_norm": 0.447265625,
"learning_rate": 0.00011301774363427714,
"loss": 2.5891,
"step": 2044
},
{
"epoch": 0.5837163302760477,
"grad_norm": 0.4453125,
"learning_rate": 0.00011288607140369021,
"loss": 2.5855,
"step": 2045
},
{
"epoch": 0.5840017661343734,
"grad_norm": 0.451171875,
"learning_rate": 0.00011275442962949613,
"loss": 2.5551,
"step": 2046
},
{
"epoch": 0.5842872019926991,
"grad_norm": 0.4296875,
"learning_rate": 0.00011262281841972272,
"loss": 2.5605,
"step": 2047
},
{
"epoch": 0.5845726378510248,
"grad_norm": 0.48046875,
"learning_rate": 0.0001124912378823725,
"loss": 2.5974,
"step": 2048
},
{
"epoch": 0.5848580737093505,
"grad_norm": 0.482421875,
"learning_rate": 0.00011235968812542298,
"loss": 2.5483,
"step": 2049
},
{
"epoch": 0.5851435095676761,
"grad_norm": 0.474609375,
"learning_rate": 0.00011222816925682647,
"loss": 2.5846,
"step": 2050
},
{
"epoch": 0.5854289454260019,
"grad_norm": 0.490234375,
"learning_rate": 0.00011209668138450979,
"loss": 2.572,
"step": 2051
},
{
"epoch": 0.5857143812843275,
"grad_norm": 0.466796875,
"learning_rate": 0.00011196522461637439,
"loss": 2.5609,
"step": 2052
},
{
"epoch": 0.5859998171426533,
"grad_norm": 0.52734375,
"learning_rate": 0.00011183379906029615,
"loss": 2.5499,
"step": 2053
},
{
"epoch": 0.586285253000979,
"grad_norm": 0.490234375,
"learning_rate": 0.00011170240482412542,
"loss": 2.5417,
"step": 2054
},
{
"epoch": 0.5865706888593046,
"grad_norm": 0.5390625,
"learning_rate": 0.00011157104201568677,
"loss": 2.5613,
"step": 2055
},
{
"epoch": 0.5868561247176304,
"grad_norm": 0.4609375,
"learning_rate": 0.000111439710742779,
"loss": 2.5377,
"step": 2056
},
{
"epoch": 0.587141560575956,
"grad_norm": 0.5703125,
"learning_rate": 0.00011130841111317501,
"loss": 2.5511,
"step": 2057
},
{
"epoch": 0.5874269964342818,
"grad_norm": 0.4296875,
"learning_rate": 0.00011117714323462186,
"loss": 2.581,
"step": 2058
},
{
"epoch": 0.5877124322926074,
"grad_norm": 0.4921875,
"learning_rate": 0.0001110459072148404,
"loss": 2.556,
"step": 2059
},
{
"epoch": 0.5879978681509331,
"grad_norm": 0.44140625,
"learning_rate": 0.00011091470316152543,
"loss": 2.5631,
"step": 2060
},
{
"epoch": 0.5882833040092589,
"grad_norm": 0.4609375,
"learning_rate": 0.00011078353118234542,
"loss": 2.5587,
"step": 2061
},
{
"epoch": 0.5885687398675845,
"grad_norm": 0.486328125,
"learning_rate": 0.00011065239138494263,
"loss": 2.5622,
"step": 2062
},
{
"epoch": 0.5888541757259103,
"grad_norm": 0.421875,
"learning_rate": 0.0001105212838769328,
"loss": 2.5687,
"step": 2063
},
{
"epoch": 0.5891396115842359,
"grad_norm": 0.458984375,
"learning_rate": 0.00011039020876590535,
"loss": 2.5541,
"step": 2064
},
{
"epoch": 0.5894250474425616,
"grad_norm": 0.44140625,
"learning_rate": 0.00011025916615942281,
"loss": 2.5607,
"step": 2065
},
{
"epoch": 0.5897104833008873,
"grad_norm": 0.423828125,
"learning_rate": 0.00011012815616502145,
"loss": 2.5617,
"step": 2066
},
{
"epoch": 0.589995919159213,
"grad_norm": 0.46875,
"learning_rate": 0.00010999717889021042,
"loss": 2.5915,
"step": 2067
},
{
"epoch": 0.5902813550175386,
"grad_norm": 0.408203125,
"learning_rate": 0.00010986623444247216,
"loss": 2.5686,
"step": 2068
},
{
"epoch": 0.5905667908758644,
"grad_norm": 0.45703125,
"learning_rate": 0.0001097353229292622,
"loss": 2.5715,
"step": 2069
},
{
"epoch": 0.5908522267341901,
"grad_norm": 0.44140625,
"learning_rate": 0.00010960444445800901,
"loss": 2.5551,
"step": 2070
},
{
"epoch": 0.5911376625925158,
"grad_norm": 0.4140625,
"learning_rate": 0.0001094735991361139,
"loss": 2.5485,
"step": 2071
},
{
"epoch": 0.5914230984508415,
"grad_norm": 0.453125,
"learning_rate": 0.00010934278707095103,
"loss": 2.5534,
"step": 2072
},
{
"epoch": 0.5917085343091671,
"grad_norm": 0.427734375,
"learning_rate": 0.00010921200836986727,
"loss": 2.56,
"step": 2073
},
{
"epoch": 0.5919939701674929,
"grad_norm": 0.435546875,
"learning_rate": 0.00010908126314018212,
"loss": 2.5518,
"step": 2074
},
{
"epoch": 0.5922794060258185,
"grad_norm": 0.455078125,
"learning_rate": 0.00010895055148918756,
"loss": 2.587,
"step": 2075
},
{
"epoch": 0.5925648418841443,
"grad_norm": 0.419921875,
"learning_rate": 0.00010881987352414806,
"loss": 2.5573,
"step": 2076
},
{
"epoch": 0.59285027774247,
"grad_norm": 0.439453125,
"learning_rate": 0.00010868922935230049,
"loss": 2.5569,
"step": 2077
},
{
"epoch": 0.5931357136007956,
"grad_norm": 0.462890625,
"learning_rate": 0.00010855861908085383,
"loss": 2.5437,
"step": 2078
},
{
"epoch": 0.5934211494591214,
"grad_norm": 0.4296875,
"learning_rate": 0.00010842804281698937,
"loss": 2.554,
"step": 2079
},
{
"epoch": 0.593706585317447,
"grad_norm": 0.46875,
"learning_rate": 0.00010829750066786052,
"loss": 2.5834,
"step": 2080
},
{
"epoch": 0.5939920211757728,
"grad_norm": 0.4140625,
"learning_rate": 0.00010816699274059255,
"loss": 2.5947,
"step": 2081
},
{
"epoch": 0.5942774570340984,
"grad_norm": 0.470703125,
"learning_rate": 0.00010803651914228285,
"loss": 2.557,
"step": 2082
},
{
"epoch": 0.5945628928924241,
"grad_norm": 0.400390625,
"learning_rate": 0.00010790607998000048,
"loss": 2.5781,
"step": 2083
},
{
"epoch": 0.5948483287507498,
"grad_norm": 0.455078125,
"learning_rate": 0.00010777567536078623,
"loss": 2.57,
"step": 2084
},
{
"epoch": 0.5951337646090755,
"grad_norm": 0.42578125,
"learning_rate": 0.0001076453053916527,
"loss": 2.5555,
"step": 2085
},
{
"epoch": 0.5954192004674013,
"grad_norm": 0.4296875,
"learning_rate": 0.00010751497017958385,
"loss": 2.6032,
"step": 2086
},
{
"epoch": 0.5957046363257269,
"grad_norm": 0.5546875,
"learning_rate": 0.00010738466983153533,
"loss": 2.5711,
"step": 2087
},
{
"epoch": 0.5959900721840526,
"grad_norm": 0.439453125,
"learning_rate": 0.000107254404454434,
"loss": 2.5851,
"step": 2088
},
{
"epoch": 0.5962755080423783,
"grad_norm": 0.49609375,
"learning_rate": 0.00010712417415517808,
"loss": 2.5805,
"step": 2089
},
{
"epoch": 0.596560943900704,
"grad_norm": 0.451171875,
"learning_rate": 0.00010699397904063708,
"loss": 2.5809,
"step": 2090
},
{
"epoch": 0.5968463797590297,
"grad_norm": 0.57421875,
"learning_rate": 0.00010686381921765158,
"loss": 2.5796,
"step": 2091
},
{
"epoch": 0.5971318156173554,
"grad_norm": 0.462890625,
"learning_rate": 0.00010673369479303315,
"loss": 2.5641,
"step": 2092
},
{
"epoch": 0.597417251475681,
"grad_norm": 0.42578125,
"learning_rate": 0.00010660360587356438,
"loss": 2.5651,
"step": 2093
},
{
"epoch": 0.5977026873340068,
"grad_norm": 0.44921875,
"learning_rate": 0.00010647355256599877,
"loss": 2.5639,
"step": 2094
},
{
"epoch": 0.5979881231923325,
"grad_norm": 0.423828125,
"learning_rate": 0.00010634353497706037,
"loss": 2.5482,
"step": 2095
},
{
"epoch": 0.5982735590506582,
"grad_norm": 0.439453125,
"learning_rate": 0.0001062135532134442,
"loss": 2.5762,
"step": 2096
},
{
"epoch": 0.5985589949089839,
"grad_norm": 0.419921875,
"learning_rate": 0.0001060836073818157,
"loss": 2.573,
"step": 2097
},
{
"epoch": 0.5988444307673095,
"grad_norm": 0.4453125,
"learning_rate": 0.00010595369758881091,
"loss": 2.5582,
"step": 2098
},
{
"epoch": 0.5991298666256353,
"grad_norm": 0.455078125,
"learning_rate": 0.00010582382394103628,
"loss": 2.6,
"step": 2099
},
{
"epoch": 0.5994153024839609,
"grad_norm": 0.400390625,
"learning_rate": 0.0001056939865450686,
"loss": 2.573,
"step": 2100
},
{
"epoch": 0.5997007383422867,
"grad_norm": 0.419921875,
"learning_rate": 0.00010556418550745482,
"loss": 2.5422,
"step": 2101
},
{
"epoch": 0.5999861742006124,
"grad_norm": 0.427734375,
"learning_rate": 0.00010543442093471218,
"loss": 2.5682,
"step": 2102
},
{
"epoch": 0.600271610058938,
"grad_norm": 0.451171875,
"learning_rate": 0.00010530469293332797,
"loss": 2.563,
"step": 2103
},
{
"epoch": 0.6005570459172638,
"grad_norm": 0.41015625,
"learning_rate": 0.00010517500160975935,
"loss": 2.5584,
"step": 2104
},
{
"epoch": 0.6008424817755894,
"grad_norm": 0.4296875,
"learning_rate": 0.00010504534707043357,
"loss": 2.5646,
"step": 2105
},
{
"epoch": 0.6011279176339152,
"grad_norm": 0.447265625,
"learning_rate": 0.00010491572942174763,
"loss": 2.5812,
"step": 2106
},
{
"epoch": 0.6014133534922408,
"grad_norm": 0.46875,
"learning_rate": 0.00010478614877006813,
"loss": 2.5652,
"step": 2107
},
{
"epoch": 0.6016987893505665,
"grad_norm": 0.443359375,
"learning_rate": 0.00010465660522173144,
"loss": 2.5468,
"step": 2108
},
{
"epoch": 0.6019842252088922,
"grad_norm": 0.4140625,
"learning_rate": 0.00010452709888304347,
"loss": 2.5424,
"step": 2109
},
{
"epoch": 0.6022696610672179,
"grad_norm": 0.43359375,
"learning_rate": 0.0001043976298602796,
"loss": 2.579,
"step": 2110
},
{
"epoch": 0.6025550969255437,
"grad_norm": 0.45703125,
"learning_rate": 0.00010426819825968449,
"loss": 2.5618,
"step": 2111
},
{
"epoch": 0.6028405327838693,
"grad_norm": 0.421875,
"learning_rate": 0.00010413880418747215,
"loss": 2.5656,
"step": 2112
},
{
"epoch": 0.603125968642195,
"grad_norm": 0.4609375,
"learning_rate": 0.00010400944774982593,
"loss": 2.5724,
"step": 2113
},
{
"epoch": 0.6034114045005207,
"grad_norm": 0.435546875,
"learning_rate": 0.00010388012905289808,
"loss": 2.5452,
"step": 2114
},
{
"epoch": 0.6036968403588464,
"grad_norm": 0.41796875,
"learning_rate": 0.00010375084820280998,
"loss": 2.5538,
"step": 2115
},
{
"epoch": 0.603982276217172,
"grad_norm": 0.4296875,
"learning_rate": 0.00010362160530565197,
"loss": 2.5399,
"step": 2116
},
{
"epoch": 0.6042677120754978,
"grad_norm": 0.42578125,
"learning_rate": 0.00010349240046748324,
"loss": 2.5613,
"step": 2117
},
{
"epoch": 0.6045531479338235,
"grad_norm": 0.412109375,
"learning_rate": 0.00010336323379433165,
"loss": 2.5742,
"step": 2118
},
{
"epoch": 0.6048385837921492,
"grad_norm": 0.41015625,
"learning_rate": 0.00010323410539219388,
"loss": 2.5627,
"step": 2119
},
{
"epoch": 0.6051240196504749,
"grad_norm": 0.412109375,
"learning_rate": 0.00010310501536703507,
"loss": 2.5675,
"step": 2120
},
{
"epoch": 0.6054094555088005,
"grad_norm": 0.412109375,
"learning_rate": 0.00010297596382478906,
"loss": 2.5845,
"step": 2121
},
{
"epoch": 0.6056948913671263,
"grad_norm": 0.419921875,
"learning_rate": 0.00010284695087135791,
"loss": 2.5579,
"step": 2122
},
{
"epoch": 0.6059803272254519,
"grad_norm": 0.423828125,
"learning_rate": 0.00010271797661261215,
"loss": 2.5864,
"step": 2123
},
{
"epoch": 0.6062657630837777,
"grad_norm": 0.390625,
"learning_rate": 0.0001025890411543904,
"loss": 2.5851,
"step": 2124
},
{
"epoch": 0.6065511989421033,
"grad_norm": 0.412109375,
"learning_rate": 0.00010246014460249964,
"loss": 2.5753,
"step": 2125
},
{
"epoch": 0.606836634800429,
"grad_norm": 0.404296875,
"learning_rate": 0.00010233128706271475,
"loss": 2.5756,
"step": 2126
},
{
"epoch": 0.6071220706587548,
"grad_norm": 0.380859375,
"learning_rate": 0.00010220246864077875,
"loss": 2.5755,
"step": 2127
},
{
"epoch": 0.6074075065170804,
"grad_norm": 0.384765625,
"learning_rate": 0.00010207368944240234,
"loss": 2.5598,
"step": 2128
},
{
"epoch": 0.6076929423754062,
"grad_norm": 0.4140625,
"learning_rate": 0.00010194494957326434,
"loss": 2.564,
"step": 2129
},
{
"epoch": 0.6079783782337318,
"grad_norm": 0.388671875,
"learning_rate": 0.00010181624913901099,
"loss": 2.5546,
"step": 2130
},
{
"epoch": 0.6082638140920575,
"grad_norm": 0.38671875,
"learning_rate": 0.0001016875882452564,
"loss": 2.5709,
"step": 2131
},
{
"epoch": 0.6085492499503832,
"grad_norm": 0.42578125,
"learning_rate": 0.00010155896699758206,
"loss": 2.5293,
"step": 2132
},
{
"epoch": 0.6088346858087089,
"grad_norm": 0.384765625,
"learning_rate": 0.00010143038550153703,
"loss": 2.5746,
"step": 2133
},
{
"epoch": 0.6091201216670347,
"grad_norm": 0.45703125,
"learning_rate": 0.0001013018438626378,
"loss": 2.5632,
"step": 2134
},
{
"epoch": 0.6094055575253603,
"grad_norm": 0.408203125,
"learning_rate": 0.00010117334218636793,
"loss": 2.5465,
"step": 2135
},
{
"epoch": 0.609690993383686,
"grad_norm": 0.400390625,
"learning_rate": 0.00010104488057817839,
"loss": 2.5461,
"step": 2136
},
{
"epoch": 0.6099764292420117,
"grad_norm": 0.408203125,
"learning_rate": 0.00010091645914348724,
"loss": 2.5891,
"step": 2137
},
{
"epoch": 0.6102618651003374,
"grad_norm": 0.412109375,
"learning_rate": 0.00010078807798767953,
"loss": 2.5954,
"step": 2138
},
{
"epoch": 0.610547300958663,
"grad_norm": 0.4140625,
"learning_rate": 0.00010065973721610727,
"loss": 2.5611,
"step": 2139
},
{
"epoch": 0.6108327368169888,
"grad_norm": 0.392578125,
"learning_rate": 0.00010053143693408932,
"loss": 2.5958,
"step": 2140
},
{
"epoch": 0.6111181726753144,
"grad_norm": 0.41015625,
"learning_rate": 0.00010040317724691133,
"loss": 2.5734,
"step": 2141
},
{
"epoch": 0.6114036085336402,
"grad_norm": 0.40625,
"learning_rate": 0.00010027495825982558,
"loss": 2.5665,
"step": 2142
},
{
"epoch": 0.6116890443919659,
"grad_norm": 0.388671875,
"learning_rate": 0.00010014678007805106,
"loss": 2.5597,
"step": 2143
},
{
"epoch": 0.6119744802502916,
"grad_norm": 0.4140625,
"learning_rate": 0.00010001864280677316,
"loss": 2.5883,
"step": 2144
},
{
"epoch": 0.6122599161086173,
"grad_norm": 0.41015625,
"learning_rate": 9.989054655114383e-05,
"loss": 2.5357,
"step": 2145
},
{
"epoch": 0.6125453519669429,
"grad_norm": 0.40625,
"learning_rate": 9.976249141628124e-05,
"loss": 2.5692,
"step": 2146
},
{
"epoch": 0.6128307878252687,
"grad_norm": 0.4296875,
"learning_rate": 9.963447750726984e-05,
"loss": 2.5544,
"step": 2147
},
{
"epoch": 0.6131162236835943,
"grad_norm": 0.390625,
"learning_rate": 9.95065049291603e-05,
"loss": 2.5472,
"step": 2148
},
{
"epoch": 0.61340165954192,
"grad_norm": 0.3984375,
"learning_rate": 9.937857378696932e-05,
"loss": 2.6036,
"step": 2149
},
{
"epoch": 0.6136870954002458,
"grad_norm": 0.40234375,
"learning_rate": 9.925068418567967e-05,
"loss": 2.5645,
"step": 2150
},
{
"epoch": 0.6139725312585714,
"grad_norm": 0.396484375,
"learning_rate": 9.912283623023988e-05,
"loss": 2.5646,
"step": 2151
},
{
"epoch": 0.6142579671168972,
"grad_norm": 0.4140625,
"learning_rate": 9.899503002556442e-05,
"loss": 2.5792,
"step": 2152
},
{
"epoch": 0.6145434029752228,
"grad_norm": 0.39453125,
"learning_rate": 9.886726567653362e-05,
"loss": 2.5629,
"step": 2153
},
{
"epoch": 0.6148288388335486,
"grad_norm": 0.4375,
"learning_rate": 9.87395432879932e-05,
"loss": 2.5558,
"step": 2154
},
{
"epoch": 0.6151142746918742,
"grad_norm": 0.416015625,
"learning_rate": 9.861186296475458e-05,
"loss": 2.5663,
"step": 2155
},
{
"epoch": 0.6153997105501999,
"grad_norm": 0.390625,
"learning_rate": 9.84842248115947e-05,
"loss": 2.5347,
"step": 2156
},
{
"epoch": 0.6156851464085256,
"grad_norm": 0.3828125,
"learning_rate": 9.835662893325584e-05,
"loss": 2.5608,
"step": 2157
},
{
"epoch": 0.6159705822668513,
"grad_norm": 0.3984375,
"learning_rate": 9.822907543444553e-05,
"loss": 2.5695,
"step": 2158
},
{
"epoch": 0.616256018125177,
"grad_norm": 0.376953125,
"learning_rate": 9.810156441983665e-05,
"loss": 2.5549,
"step": 2159
},
{
"epoch": 0.6165414539835027,
"grad_norm": 0.41015625,
"learning_rate": 9.797409599406709e-05,
"loss": 2.5916,
"step": 2160
},
{
"epoch": 0.6168268898418284,
"grad_norm": 0.4140625,
"learning_rate": 9.784667026173993e-05,
"loss": 2.546,
"step": 2161
},
{
"epoch": 0.6171123257001541,
"grad_norm": 0.380859375,
"learning_rate": 9.771928732742313e-05,
"loss": 2.5728,
"step": 2162
},
{
"epoch": 0.6173977615584798,
"grad_norm": 0.376953125,
"learning_rate": 9.759194729564954e-05,
"loss": 2.5711,
"step": 2163
},
{
"epoch": 0.6176831974168054,
"grad_norm": 0.421875,
"learning_rate": 9.746465027091676e-05,
"loss": 2.5335,
"step": 2164
},
{
"epoch": 0.6179686332751312,
"grad_norm": 0.376953125,
"learning_rate": 9.733739635768714e-05,
"loss": 2.5583,
"step": 2165
},
{
"epoch": 0.6182540691334568,
"grad_norm": 0.404296875,
"learning_rate": 9.721018566038767e-05,
"loss": 2.537,
"step": 2166
},
{
"epoch": 0.6185395049917826,
"grad_norm": 0.421875,
"learning_rate": 9.708301828340993e-05,
"loss": 2.5576,
"step": 2167
},
{
"epoch": 0.6188249408501083,
"grad_norm": 0.388671875,
"learning_rate": 9.695589433110968e-05,
"loss": 2.5786,
"step": 2168
},
{
"epoch": 0.6191103767084339,
"grad_norm": 0.37890625,
"learning_rate": 9.682881390780749e-05,
"loss": 2.584,
"step": 2169
},
{
"epoch": 0.6193958125667597,
"grad_norm": 0.41796875,
"learning_rate": 9.67017771177878e-05,
"loss": 2.5681,
"step": 2170
},
{
"epoch": 0.6196812484250853,
"grad_norm": 0.392578125,
"learning_rate": 9.657478406529946e-05,
"loss": 2.553,
"step": 2171
},
{
"epoch": 0.6199666842834111,
"grad_norm": 0.390625,
"learning_rate": 9.644783485455537e-05,
"loss": 2.5665,
"step": 2172
},
{
"epoch": 0.6202521201417367,
"grad_norm": 0.39453125,
"learning_rate": 9.632092958973246e-05,
"loss": 2.5572,
"step": 2173
},
{
"epoch": 0.6205375560000624,
"grad_norm": 0.40234375,
"learning_rate": 9.61940683749716e-05,
"loss": 2.5576,
"step": 2174
},
{
"epoch": 0.6208229918583882,
"grad_norm": 0.3828125,
"learning_rate": 9.606725131437739e-05,
"loss": 2.5667,
"step": 2175
},
{
"epoch": 0.6211084277167138,
"grad_norm": 0.400390625,
"learning_rate": 9.594047851201855e-05,
"loss": 2.5688,
"step": 2176
},
{
"epoch": 0.6213938635750396,
"grad_norm": 0.38671875,
"learning_rate": 9.581375007192705e-05,
"loss": 2.5627,
"step": 2177
},
{
"epoch": 0.6216792994333652,
"grad_norm": 0.400390625,
"learning_rate": 9.568706609809872e-05,
"loss": 2.5918,
"step": 2178
},
{
"epoch": 0.6219647352916909,
"grad_norm": 0.396484375,
"learning_rate": 9.556042669449281e-05,
"loss": 2.5662,
"step": 2179
},
{
"epoch": 0.6222501711500166,
"grad_norm": 0.396484375,
"learning_rate": 9.543383196503206e-05,
"loss": 2.5345,
"step": 2180
},
{
"epoch": 0.6225356070083423,
"grad_norm": 0.40234375,
"learning_rate": 9.530728201360244e-05,
"loss": 2.5612,
"step": 2181
},
{
"epoch": 0.622821042866668,
"grad_norm": 0.390625,
"learning_rate": 9.518077694405322e-05,
"loss": 2.5691,
"step": 2182
},
{
"epoch": 0.6231064787249937,
"grad_norm": 0.40234375,
"learning_rate": 9.505431686019692e-05,
"loss": 2.5599,
"step": 2183
},
{
"epoch": 0.6233919145833194,
"grad_norm": 0.39453125,
"learning_rate": 9.492790186580906e-05,
"loss": 2.5384,
"step": 2184
},
{
"epoch": 0.6236773504416451,
"grad_norm": 0.388671875,
"learning_rate": 9.480153206462817e-05,
"loss": 2.5833,
"step": 2185
},
{
"epoch": 0.6239627862999708,
"grad_norm": 0.3828125,
"learning_rate": 9.467520756035575e-05,
"loss": 2.5582,
"step": 2186
},
{
"epoch": 0.6242482221582965,
"grad_norm": 0.390625,
"learning_rate": 9.454892845665603e-05,
"loss": 2.5327,
"step": 2187
},
{
"epoch": 0.6245336580166222,
"grad_norm": 0.41015625,
"learning_rate": 9.442269485715602e-05,
"loss": 2.5675,
"step": 2188
},
{
"epoch": 0.6248190938749478,
"grad_norm": 0.38671875,
"learning_rate": 9.429650686544546e-05,
"loss": 2.5706,
"step": 2189
},
{
"epoch": 0.6251045297332736,
"grad_norm": 0.41015625,
"learning_rate": 9.417036458507658e-05,
"loss": 2.5732,
"step": 2190
},
{
"epoch": 0.6253899655915993,
"grad_norm": 0.40234375,
"learning_rate": 9.404426811956404e-05,
"loss": 2.57,
"step": 2191
},
{
"epoch": 0.625675401449925,
"grad_norm": 0.40234375,
"learning_rate": 9.391821757238511e-05,
"loss": 2.5336,
"step": 2192
},
{
"epoch": 0.6259608373082507,
"grad_norm": 0.40625,
"learning_rate": 9.379221304697925e-05,
"loss": 2.5533,
"step": 2193
},
{
"epoch": 0.6262462731665763,
"grad_norm": 0.40234375,
"learning_rate": 9.366625464674811e-05,
"loss": 2.5648,
"step": 2194
},
{
"epoch": 0.6265317090249021,
"grad_norm": 0.40625,
"learning_rate": 9.354034247505556e-05,
"loss": 2.5672,
"step": 2195
},
{
"epoch": 0.6268171448832277,
"grad_norm": 0.40234375,
"learning_rate": 9.341447663522749e-05,
"loss": 2.5789,
"step": 2196
},
{
"epoch": 0.6271025807415535,
"grad_norm": 0.384765625,
"learning_rate": 9.328865723055185e-05,
"loss": 2.5557,
"step": 2197
},
{
"epoch": 0.6273880165998791,
"grad_norm": 0.431640625,
"learning_rate": 9.316288436427834e-05,
"loss": 2.5479,
"step": 2198
},
{
"epoch": 0.6276734524582048,
"grad_norm": 0.40234375,
"learning_rate": 9.30371581396186e-05,
"loss": 2.5853,
"step": 2199
},
{
"epoch": 0.6279588883165306,
"grad_norm": 0.380859375,
"learning_rate": 9.291147865974599e-05,
"loss": 2.588,
"step": 2200
},
{
"epoch": 0.6282443241748562,
"grad_norm": 0.37890625,
"learning_rate": 9.278584602779541e-05,
"loss": 2.5675,
"step": 2201
},
{
"epoch": 0.628529760033182,
"grad_norm": 0.396484375,
"learning_rate": 9.266026034686341e-05,
"loss": 2.59,
"step": 2202
},
{
"epoch": 0.6288151958915076,
"grad_norm": 0.44140625,
"learning_rate": 9.253472172000802e-05,
"loss": 2.5578,
"step": 2203
},
{
"epoch": 0.6291006317498333,
"grad_norm": 0.40234375,
"learning_rate": 9.240923025024853e-05,
"loss": 2.5348,
"step": 2204
},
{
"epoch": 0.629386067608159,
"grad_norm": 0.423828125,
"learning_rate": 9.228378604056568e-05,
"loss": 2.5759,
"step": 2205
},
{
"epoch": 0.6296715034664847,
"grad_norm": 0.416015625,
"learning_rate": 9.215838919390132e-05,
"loss": 2.5559,
"step": 2206
},
{
"epoch": 0.6299569393248104,
"grad_norm": 0.41015625,
"learning_rate": 9.203303981315847e-05,
"loss": 2.5611,
"step": 2207
},
{
"epoch": 0.6302423751831361,
"grad_norm": 0.41015625,
"learning_rate": 9.190773800120126e-05,
"loss": 2.5746,
"step": 2208
},
{
"epoch": 0.6305278110414618,
"grad_norm": 0.396484375,
"learning_rate": 9.178248386085474e-05,
"loss": 2.5519,
"step": 2209
},
{
"epoch": 0.6308132468997875,
"grad_norm": 0.408203125,
"learning_rate": 9.165727749490477e-05,
"loss": 2.5576,
"step": 2210
},
{
"epoch": 0.6310986827581132,
"grad_norm": 0.408203125,
"learning_rate": 9.15321190060981e-05,
"loss": 2.5854,
"step": 2211
},
{
"epoch": 0.6313841186164388,
"grad_norm": 0.404296875,
"learning_rate": 9.140700849714216e-05,
"loss": 2.5661,
"step": 2212
},
{
"epoch": 0.6316695544747646,
"grad_norm": 0.41015625,
"learning_rate": 9.128194607070498e-05,
"loss": 2.5572,
"step": 2213
},
{
"epoch": 0.6319549903330902,
"grad_norm": 0.404296875,
"learning_rate": 9.115693182941518e-05,
"loss": 2.5889,
"step": 2214
},
{
"epoch": 0.632240426191416,
"grad_norm": 0.421875,
"learning_rate": 9.103196587586172e-05,
"loss": 2.5474,
"step": 2215
},
{
"epoch": 0.6325258620497417,
"grad_norm": 0.412109375,
"learning_rate": 9.090704831259422e-05,
"loss": 2.5664,
"step": 2216
},
{
"epoch": 0.6328112979080673,
"grad_norm": 0.376953125,
"learning_rate": 9.078217924212224e-05,
"loss": 2.5648,
"step": 2217
},
{
"epoch": 0.6330967337663931,
"grad_norm": 0.412109375,
"learning_rate": 9.065735876691578e-05,
"loss": 2.5675,
"step": 2218
},
{
"epoch": 0.6333821696247187,
"grad_norm": 0.39453125,
"learning_rate": 9.053258698940484e-05,
"loss": 2.5783,
"step": 2219
},
{
"epoch": 0.6336676054830445,
"grad_norm": 0.4140625,
"learning_rate": 9.040786401197957e-05,
"loss": 2.561,
"step": 2220
},
{
"epoch": 0.6339530413413701,
"grad_norm": 0.390625,
"learning_rate": 9.028318993698993e-05,
"loss": 2.5814,
"step": 2221
},
{
"epoch": 0.6342384771996958,
"grad_norm": 0.421875,
"learning_rate": 9.015856486674587e-05,
"loss": 2.6124,
"step": 2222
},
{
"epoch": 0.6345239130580216,
"grad_norm": 0.458984375,
"learning_rate": 9.003398890351704e-05,
"loss": 2.5395,
"step": 2223
},
{
"epoch": 0.6348093489163472,
"grad_norm": 0.400390625,
"learning_rate": 8.99094621495329e-05,
"loss": 2.5417,
"step": 2224
},
{
"epoch": 0.635094784774673,
"grad_norm": 0.388671875,
"learning_rate": 8.978498470698244e-05,
"loss": 2.5751,
"step": 2225
},
{
"epoch": 0.6353802206329986,
"grad_norm": 0.439453125,
"learning_rate": 8.966055667801422e-05,
"loss": 2.5614,
"step": 2226
},
{
"epoch": 0.6356656564913243,
"grad_norm": 0.423828125,
"learning_rate": 8.95361781647362e-05,
"loss": 2.5633,
"step": 2227
},
{
"epoch": 0.63595109234965,
"grad_norm": 0.396484375,
"learning_rate": 8.941184926921576e-05,
"loss": 2.5668,
"step": 2228
},
{
"epoch": 0.6362365282079757,
"grad_norm": 0.384765625,
"learning_rate": 8.928757009347956e-05,
"loss": 2.5793,
"step": 2229
},
{
"epoch": 0.6365219640663013,
"grad_norm": 0.373046875,
"learning_rate": 8.916334073951345e-05,
"loss": 2.5548,
"step": 2230
},
{
"epoch": 0.6368073999246271,
"grad_norm": 0.419921875,
"learning_rate": 8.90391613092623e-05,
"loss": 2.5783,
"step": 2231
},
{
"epoch": 0.6370928357829528,
"grad_norm": 0.419921875,
"learning_rate": 8.891503190463024e-05,
"loss": 2.5809,
"step": 2232
},
{
"epoch": 0.6373782716412785,
"grad_norm": 0.390625,
"learning_rate": 8.879095262748018e-05,
"loss": 2.5614,
"step": 2233
},
{
"epoch": 0.6376637074996042,
"grad_norm": 0.41796875,
"learning_rate": 8.866692357963387e-05,
"loss": 2.5739,
"step": 2234
},
{
"epoch": 0.6379491433579298,
"grad_norm": 0.416015625,
"learning_rate": 8.854294486287188e-05,
"loss": 2.5764,
"step": 2235
},
{
"epoch": 0.6382345792162556,
"grad_norm": 0.4375,
"learning_rate": 8.84190165789336e-05,
"loss": 2.5702,
"step": 2236
},
{
"epoch": 0.6385200150745812,
"grad_norm": 0.40625,
"learning_rate": 8.829513882951686e-05,
"loss": 2.5682,
"step": 2237
},
{
"epoch": 0.638805450932907,
"grad_norm": 0.423828125,
"learning_rate": 8.8171311716278e-05,
"loss": 2.5557,
"step": 2238
},
{
"epoch": 0.6390908867912326,
"grad_norm": 0.42578125,
"learning_rate": 8.804753534083208e-05,
"loss": 2.5917,
"step": 2239
},
{
"epoch": 0.6393763226495583,
"grad_norm": 0.390625,
"learning_rate": 8.79238098047522e-05,
"loss": 2.5776,
"step": 2240
},
{
"epoch": 0.6396617585078841,
"grad_norm": 0.3984375,
"learning_rate": 8.780013520956996e-05,
"loss": 2.5412,
"step": 2241
},
{
"epoch": 0.6399471943662097,
"grad_norm": 0.423828125,
"learning_rate": 8.767651165677502e-05,
"loss": 2.572,
"step": 2242
},
{
"epoch": 0.6402326302245355,
"grad_norm": 0.388671875,
"learning_rate": 8.755293924781523e-05,
"loss": 2.5363,
"step": 2243
},
{
"epoch": 0.6405180660828611,
"grad_norm": 0.390625,
"learning_rate": 8.742941808409647e-05,
"loss": 2.5623,
"step": 2244
},
{
"epoch": 0.6408035019411868,
"grad_norm": 0.404296875,
"learning_rate": 8.730594826698253e-05,
"loss": 2.551,
"step": 2245
},
{
"epoch": 0.6410889377995125,
"grad_norm": 0.37109375,
"learning_rate": 8.718252989779496e-05,
"loss": 2.5181,
"step": 2246
},
{
"epoch": 0.6413743736578382,
"grad_norm": 0.396484375,
"learning_rate": 8.705916307781344e-05,
"loss": 2.5543,
"step": 2247
},
{
"epoch": 0.641659809516164,
"grad_norm": 0.392578125,
"learning_rate": 8.6935847908275e-05,
"loss": 2.5636,
"step": 2248
},
{
"epoch": 0.6419452453744896,
"grad_norm": 0.416015625,
"learning_rate": 8.681258449037438e-05,
"loss": 2.5439,
"step": 2249
},
{
"epoch": 0.6422306812328153,
"grad_norm": 0.396484375,
"learning_rate": 8.668937292526394e-05,
"loss": 2.5287,
"step": 2250
},
{
"epoch": 0.6422306812328153,
"eval_loss": 2.4652860164642334,
"eval_runtime": 6001.1587,
"eval_samples_per_second": 10.712,
"eval_steps_per_second": 10.712,
"step": 2250
},
{
"epoch": 0.642516117091141,
"grad_norm": 0.400390625,
"learning_rate": 8.656621331405339e-05,
"loss": 2.5401,
"step": 2251
},
{
"epoch": 0.6428015529494667,
"grad_norm": 0.373046875,
"learning_rate": 8.644310575780979e-05,
"loss": 2.5709,
"step": 2252
},
{
"epoch": 0.6430869888077924,
"grad_norm": 0.37890625,
"learning_rate": 8.632005035755766e-05,
"loss": 2.6213,
"step": 2253
},
{
"epoch": 0.6433724246661181,
"grad_norm": 0.38671875,
"learning_rate": 8.619704721427843e-05,
"loss": 2.5512,
"step": 2254
},
{
"epoch": 0.6436578605244437,
"grad_norm": 0.376953125,
"learning_rate": 8.607409642891091e-05,
"loss": 2.563,
"step": 2255
},
{
"epoch": 0.6439432963827695,
"grad_norm": 0.39453125,
"learning_rate": 8.595119810235088e-05,
"loss": 2.5438,
"step": 2256
},
{
"epoch": 0.6442287322410952,
"grad_norm": 0.38671875,
"learning_rate": 8.582835233545093e-05,
"loss": 2.5563,
"step": 2257
},
{
"epoch": 0.6445141680994209,
"grad_norm": 0.38671875,
"learning_rate": 8.570555922902074e-05,
"loss": 2.5278,
"step": 2258
},
{
"epoch": 0.6447996039577466,
"grad_norm": 0.388671875,
"learning_rate": 8.558281888382659e-05,
"loss": 2.5753,
"step": 2259
},
{
"epoch": 0.6450850398160722,
"grad_norm": 0.380859375,
"learning_rate": 8.546013140059148e-05,
"loss": 2.5751,
"step": 2260
},
{
"epoch": 0.645370475674398,
"grad_norm": 0.37890625,
"learning_rate": 8.53374968799952e-05,
"loss": 2.5553,
"step": 2261
},
{
"epoch": 0.6456559115327236,
"grad_norm": 0.3828125,
"learning_rate": 8.521491542267386e-05,
"loss": 2.5534,
"step": 2262
},
{
"epoch": 0.6459413473910494,
"grad_norm": 0.37890625,
"learning_rate": 8.509238712922014e-05,
"loss": 2.5781,
"step": 2263
},
{
"epoch": 0.6462267832493751,
"grad_norm": 0.365234375,
"learning_rate": 8.496991210018319e-05,
"loss": 2.5595,
"step": 2264
},
{
"epoch": 0.6465122191077007,
"grad_norm": 0.390625,
"learning_rate": 8.484749043606824e-05,
"loss": 2.5502,
"step": 2265
},
{
"epoch": 0.6467976549660265,
"grad_norm": 0.3671875,
"learning_rate": 8.472512223733679e-05,
"loss": 2.5458,
"step": 2266
},
{
"epoch": 0.6470830908243521,
"grad_norm": 0.375,
"learning_rate": 8.460280760440664e-05,
"loss": 2.5653,
"step": 2267
},
{
"epoch": 0.6473685266826779,
"grad_norm": 0.361328125,
"learning_rate": 8.448054663765135e-05,
"loss": 2.5727,
"step": 2268
},
{
"epoch": 0.6476539625410035,
"grad_norm": 0.390625,
"learning_rate": 8.435833943740064e-05,
"loss": 2.5665,
"step": 2269
},
{
"epoch": 0.6479393983993292,
"grad_norm": 0.390625,
"learning_rate": 8.423618610394004e-05,
"loss": 2.5411,
"step": 2270
},
{
"epoch": 0.6482248342576549,
"grad_norm": 0.375,
"learning_rate": 8.411408673751096e-05,
"loss": 2.5636,
"step": 2271
},
{
"epoch": 0.6485102701159806,
"grad_norm": 0.369140625,
"learning_rate": 8.399204143831036e-05,
"loss": 2.5729,
"step": 2272
},
{
"epoch": 0.6487957059743064,
"grad_norm": 0.37890625,
"learning_rate": 8.387005030649102e-05,
"loss": 2.5837,
"step": 2273
},
{
"epoch": 0.649081141832632,
"grad_norm": 0.375,
"learning_rate": 8.374811344216105e-05,
"loss": 2.5646,
"step": 2274
},
{
"epoch": 0.6493665776909577,
"grad_norm": 0.380859375,
"learning_rate": 8.362623094538428e-05,
"loss": 2.5886,
"step": 2275
},
{
"epoch": 0.6496520135492834,
"grad_norm": 0.39453125,
"learning_rate": 8.350440291617974e-05,
"loss": 2.5494,
"step": 2276
},
{
"epoch": 0.6499374494076091,
"grad_norm": 0.400390625,
"learning_rate": 8.338262945452176e-05,
"loss": 2.5577,
"step": 2277
},
{
"epoch": 0.6502228852659347,
"grad_norm": 0.369140625,
"learning_rate": 8.326091066033998e-05,
"loss": 2.5796,
"step": 2278
},
{
"epoch": 0.6505083211242605,
"grad_norm": 0.376953125,
"learning_rate": 8.313924663351926e-05,
"loss": 2.574,
"step": 2279
},
{
"epoch": 0.6507937569825862,
"grad_norm": 0.38671875,
"learning_rate": 8.301763747389925e-05,
"loss": 2.5544,
"step": 2280
},
{
"epoch": 0.6510791928409119,
"grad_norm": 0.36328125,
"learning_rate": 8.289608328127483e-05,
"loss": 2.5358,
"step": 2281
},
{
"epoch": 0.6513646286992376,
"grad_norm": 0.38671875,
"learning_rate": 8.277458415539569e-05,
"loss": 2.5567,
"step": 2282
},
{
"epoch": 0.6516500645575632,
"grad_norm": 0.375,
"learning_rate": 8.265314019596617e-05,
"loss": 2.5566,
"step": 2283
},
{
"epoch": 0.651935500415889,
"grad_norm": 0.369140625,
"learning_rate": 8.253175150264565e-05,
"loss": 2.5591,
"step": 2284
},
{
"epoch": 0.6522209362742146,
"grad_norm": 0.375,
"learning_rate": 8.241041817504791e-05,
"loss": 2.5519,
"step": 2285
},
{
"epoch": 0.6525063721325404,
"grad_norm": 0.380859375,
"learning_rate": 8.228914031274128e-05,
"loss": 2.5378,
"step": 2286
},
{
"epoch": 0.652791807990866,
"grad_norm": 0.392578125,
"learning_rate": 8.21679180152489e-05,
"loss": 2.5576,
"step": 2287
},
{
"epoch": 0.6530772438491917,
"grad_norm": 0.361328125,
"learning_rate": 8.204675138204794e-05,
"loss": 2.5636,
"step": 2288
},
{
"epoch": 0.6533626797075175,
"grad_norm": 0.37109375,
"learning_rate": 8.192564051257001e-05,
"loss": 2.5682,
"step": 2289
},
{
"epoch": 0.6536481155658431,
"grad_norm": 0.376953125,
"learning_rate": 8.180458550620109e-05,
"loss": 2.5616,
"step": 2290
},
{
"epoch": 0.6539335514241689,
"grad_norm": 0.3671875,
"learning_rate": 8.168358646228115e-05,
"loss": 2.5503,
"step": 2291
},
{
"epoch": 0.6542189872824945,
"grad_norm": 0.3828125,
"learning_rate": 8.156264348010425e-05,
"loss": 2.548,
"step": 2292
},
{
"epoch": 0.6545044231408202,
"grad_norm": 0.365234375,
"learning_rate": 8.144175665891858e-05,
"loss": 2.5327,
"step": 2293
},
{
"epoch": 0.6547898589991459,
"grad_norm": 0.369140625,
"learning_rate": 8.132092609792608e-05,
"loss": 2.5491,
"step": 2294
},
{
"epoch": 0.6550752948574716,
"grad_norm": 0.373046875,
"learning_rate": 8.120015189628259e-05,
"loss": 2.5576,
"step": 2295
},
{
"epoch": 0.6553607307157974,
"grad_norm": 0.375,
"learning_rate": 8.107943415309786e-05,
"loss": 2.5687,
"step": 2296
},
{
"epoch": 0.655646166574123,
"grad_norm": 0.388671875,
"learning_rate": 8.095877296743497e-05,
"loss": 2.5506,
"step": 2297
},
{
"epoch": 0.6559316024324487,
"grad_norm": 0.361328125,
"learning_rate": 8.083816843831091e-05,
"loss": 2.5609,
"step": 2298
},
{
"epoch": 0.6562170382907744,
"grad_norm": 0.35546875,
"learning_rate": 8.071762066469598e-05,
"loss": 2.5515,
"step": 2299
},
{
"epoch": 0.6565024741491001,
"grad_norm": 0.3671875,
"learning_rate": 8.059712974551392e-05,
"loss": 2.5587,
"step": 2300
},
{
"epoch": 0.6567879100074258,
"grad_norm": 0.384765625,
"learning_rate": 8.047669577964197e-05,
"loss": 2.5523,
"step": 2301
},
{
"epoch": 0.6570733458657515,
"grad_norm": 0.384765625,
"learning_rate": 8.03563188659104e-05,
"loss": 2.5321,
"step": 2302
},
{
"epoch": 0.6573587817240771,
"grad_norm": 0.36328125,
"learning_rate": 8.023599910310287e-05,
"loss": 2.5848,
"step": 2303
},
{
"epoch": 0.6576442175824029,
"grad_norm": 0.353515625,
"learning_rate": 8.011573658995606e-05,
"loss": 2.539,
"step": 2304
},
{
"epoch": 0.6579296534407286,
"grad_norm": 0.384765625,
"learning_rate": 7.999553142515969e-05,
"loss": 2.5545,
"step": 2305
},
{
"epoch": 0.6582150892990543,
"grad_norm": 0.373046875,
"learning_rate": 7.987538370735624e-05,
"loss": 2.5481,
"step": 2306
},
{
"epoch": 0.65850052515738,
"grad_norm": 0.373046875,
"learning_rate": 7.975529353514141e-05,
"loss": 2.5889,
"step": 2307
},
{
"epoch": 0.6587859610157056,
"grad_norm": 0.37109375,
"learning_rate": 7.963526100706337e-05,
"loss": 2.5113,
"step": 2308
},
{
"epoch": 0.6590713968740314,
"grad_norm": 0.361328125,
"learning_rate": 7.951528622162297e-05,
"loss": 2.5789,
"step": 2309
},
{
"epoch": 0.659356832732357,
"grad_norm": 0.36328125,
"learning_rate": 7.9395369277274e-05,
"loss": 2.546,
"step": 2310
},
{
"epoch": 0.6596422685906828,
"grad_norm": 0.3671875,
"learning_rate": 7.927551027242252e-05,
"loss": 2.5322,
"step": 2311
},
{
"epoch": 0.6599277044490084,
"grad_norm": 0.384765625,
"learning_rate": 7.9155709305427e-05,
"loss": 2.5277,
"step": 2312
},
{
"epoch": 0.6602131403073341,
"grad_norm": 0.384765625,
"learning_rate": 7.90359664745985e-05,
"loss": 2.5684,
"step": 2313
},
{
"epoch": 0.6604985761656599,
"grad_norm": 0.369140625,
"learning_rate": 7.891628187820021e-05,
"loss": 2.5712,
"step": 2314
},
{
"epoch": 0.6607840120239855,
"grad_norm": 0.384765625,
"learning_rate": 7.87966556144475e-05,
"loss": 2.5458,
"step": 2315
},
{
"epoch": 0.6610694478823113,
"grad_norm": 0.40234375,
"learning_rate": 7.867708778150812e-05,
"loss": 2.572,
"step": 2316
},
{
"epoch": 0.6613548837406369,
"grad_norm": 0.376953125,
"learning_rate": 7.855757847750151e-05,
"loss": 2.553,
"step": 2317
},
{
"epoch": 0.6616403195989626,
"grad_norm": 0.38671875,
"learning_rate": 7.843812780049935e-05,
"loss": 2.5738,
"step": 2318
},
{
"epoch": 0.6619257554572883,
"grad_norm": 0.375,
"learning_rate": 7.831873584852522e-05,
"loss": 2.5652,
"step": 2319
},
{
"epoch": 0.662211191315614,
"grad_norm": 0.37890625,
"learning_rate": 7.819940271955425e-05,
"loss": 2.5447,
"step": 2320
},
{
"epoch": 0.6624966271739398,
"grad_norm": 0.375,
"learning_rate": 7.808012851151362e-05,
"loss": 2.5698,
"step": 2321
},
{
"epoch": 0.6627820630322654,
"grad_norm": 0.3828125,
"learning_rate": 7.796091332228193e-05,
"loss": 2.54,
"step": 2322
},
{
"epoch": 0.6630674988905911,
"grad_norm": 0.3515625,
"learning_rate": 7.784175724968939e-05,
"loss": 2.5497,
"step": 2323
},
{
"epoch": 0.6633529347489168,
"grad_norm": 0.376953125,
"learning_rate": 7.772266039151781e-05,
"loss": 2.5507,
"step": 2324
},
{
"epoch": 0.6636383706072425,
"grad_norm": 3.140625,
"learning_rate": 7.760362284550024e-05,
"loss": 2.5712,
"step": 2325
},
{
"epoch": 0.6639238064655681,
"grad_norm": 0.67578125,
"learning_rate": 7.748464470932117e-05,
"loss": 2.5554,
"step": 2326
},
{
"epoch": 0.6642092423238939,
"grad_norm": 1.328125,
"learning_rate": 7.73657260806164e-05,
"loss": 2.5577,
"step": 2327
},
{
"epoch": 0.6644946781822195,
"grad_norm": 0.38671875,
"learning_rate": 7.724686705697274e-05,
"loss": 2.5744,
"step": 2328
},
{
"epoch": 0.6647801140405453,
"grad_norm": 0.431640625,
"learning_rate": 7.712806773592811e-05,
"loss": 2.547,
"step": 2329
},
{
"epoch": 0.665065549898871,
"grad_norm": 0.400390625,
"learning_rate": 7.700932821497157e-05,
"loss": 2.558,
"step": 2330
},
{
"epoch": 0.6653509857571966,
"grad_norm": 0.39453125,
"learning_rate": 7.689064859154299e-05,
"loss": 2.5383,
"step": 2331
},
{
"epoch": 0.6656364216155224,
"grad_norm": 0.3671875,
"learning_rate": 7.677202896303307e-05,
"loss": 2.6,
"step": 2332
},
{
"epoch": 0.665921857473848,
"grad_norm": 0.3828125,
"learning_rate": 7.665346942678335e-05,
"loss": 2.5926,
"step": 2333
},
{
"epoch": 0.6662072933321738,
"grad_norm": 0.384765625,
"learning_rate": 7.653497008008611e-05,
"loss": 2.5573,
"step": 2334
},
{
"epoch": 0.6664927291904994,
"grad_norm": 0.3828125,
"learning_rate": 7.641653102018402e-05,
"loss": 2.5838,
"step": 2335
},
{
"epoch": 0.6667781650488251,
"grad_norm": 0.380859375,
"learning_rate": 7.629815234427057e-05,
"loss": 2.5812,
"step": 2336
},
{
"epoch": 0.6670636009071509,
"grad_norm": 0.41015625,
"learning_rate": 7.617983414948937e-05,
"loss": 2.5533,
"step": 2337
},
{
"epoch": 0.6673490367654765,
"grad_norm": 0.376953125,
"learning_rate": 7.606157653293476e-05,
"loss": 2.5459,
"step": 2338
},
{
"epoch": 0.6676344726238023,
"grad_norm": 0.419921875,
"learning_rate": 7.594337959165107e-05,
"loss": 2.5619,
"step": 2339
},
{
"epoch": 0.6679199084821279,
"grad_norm": 0.380859375,
"learning_rate": 7.582524342263292e-05,
"loss": 2.5708,
"step": 2340
},
{
"epoch": 0.6682053443404536,
"grad_norm": 0.392578125,
"learning_rate": 7.570716812282512e-05,
"loss": 2.5465,
"step": 2341
},
{
"epoch": 0.6684907801987793,
"grad_norm": 0.388671875,
"learning_rate": 7.558915378912257e-05,
"loss": 2.5456,
"step": 2342
},
{
"epoch": 0.668776216057105,
"grad_norm": 0.3828125,
"learning_rate": 7.547120051836996e-05,
"loss": 2.5814,
"step": 2343
},
{
"epoch": 0.6690616519154307,
"grad_norm": 0.3984375,
"learning_rate": 7.535330840736209e-05,
"loss": 2.5684,
"step": 2344
},
{
"epoch": 0.6693470877737564,
"grad_norm": 0.357421875,
"learning_rate": 7.523547755284337e-05,
"loss": 2.5622,
"step": 2345
},
{
"epoch": 0.6696325236320821,
"grad_norm": 0.392578125,
"learning_rate": 7.511770805150802e-05,
"loss": 2.5668,
"step": 2346
},
{
"epoch": 0.6699179594904078,
"grad_norm": 0.390625,
"learning_rate": 7.500000000000002e-05,
"loss": 2.5299,
"step": 2347
},
{
"epoch": 0.6702033953487335,
"grad_norm": 0.384765625,
"learning_rate": 7.488235349491278e-05,
"loss": 2.546,
"step": 2348
},
{
"epoch": 0.6704888312070592,
"grad_norm": 0.388671875,
"learning_rate": 7.47647686327891e-05,
"loss": 2.5488,
"step": 2349
},
{
"epoch": 0.6707742670653849,
"grad_norm": 0.419921875,
"learning_rate": 7.464724551012161e-05,
"loss": 2.5425,
"step": 2350
},
{
"epoch": 0.6710597029237105,
"grad_norm": 0.365234375,
"learning_rate": 7.45297842233519e-05,
"loss": 2.5346,
"step": 2351
},
{
"epoch": 0.6713451387820363,
"grad_norm": 0.373046875,
"learning_rate": 7.441238486887083e-05,
"loss": 2.5254,
"step": 2352
},
{
"epoch": 0.671630574640362,
"grad_norm": 0.380859375,
"learning_rate": 7.42950475430187e-05,
"loss": 2.5561,
"step": 2353
},
{
"epoch": 0.6719160104986877,
"grad_norm": 0.376953125,
"learning_rate": 7.417777234208463e-05,
"loss": 2.5601,
"step": 2354
},
{
"epoch": 0.6722014463570134,
"grad_norm": 0.3671875,
"learning_rate": 7.406055936230687e-05,
"loss": 2.5617,
"step": 2355
},
{
"epoch": 0.672486882215339,
"grad_norm": 0.39453125,
"learning_rate": 7.394340869987267e-05,
"loss": 2.5633,
"step": 2356
},
{
"epoch": 0.6727723180736648,
"grad_norm": 0.380859375,
"learning_rate": 7.382632045091803e-05,
"loss": 2.5703,
"step": 2357
},
{
"epoch": 0.6730577539319904,
"grad_norm": 0.37109375,
"learning_rate": 7.37092947115278e-05,
"loss": 2.5611,
"step": 2358
},
{
"epoch": 0.6733431897903162,
"grad_norm": 0.369140625,
"learning_rate": 7.359233157773557e-05,
"loss": 2.5762,
"step": 2359
},
{
"epoch": 0.6736286256486418,
"grad_norm": 0.373046875,
"learning_rate": 7.347543114552343e-05,
"loss": 2.5665,
"step": 2360
},
{
"epoch": 0.6739140615069675,
"grad_norm": 0.40234375,
"learning_rate": 7.335859351082217e-05,
"loss": 2.548,
"step": 2361
},
{
"epoch": 0.6741994973652933,
"grad_norm": 0.365234375,
"learning_rate": 7.324181876951092e-05,
"loss": 2.5389,
"step": 2362
},
{
"epoch": 0.6744849332236189,
"grad_norm": 0.390625,
"learning_rate": 7.312510701741717e-05,
"loss": 2.5481,
"step": 2363
},
{
"epoch": 0.6747703690819447,
"grad_norm": 0.3671875,
"learning_rate": 7.300845835031693e-05,
"loss": 2.5571,
"step": 2364
},
{
"epoch": 0.6750558049402703,
"grad_norm": 0.3828125,
"learning_rate": 7.28918728639342e-05,
"loss": 2.5809,
"step": 2365
},
{
"epoch": 0.675341240798596,
"grad_norm": 0.384765625,
"learning_rate": 7.277535065394127e-05,
"loss": 2.5644,
"step": 2366
},
{
"epoch": 0.6756266766569217,
"grad_norm": 0.359375,
"learning_rate": 7.265889181595853e-05,
"loss": 2.5799,
"step": 2367
},
{
"epoch": 0.6759121125152474,
"grad_norm": 0.373046875,
"learning_rate": 7.254249644555429e-05,
"loss": 2.5631,
"step": 2368
},
{
"epoch": 0.6761975483735732,
"grad_norm": 0.36328125,
"learning_rate": 7.242616463824469e-05,
"loss": 2.5673,
"step": 2369
},
{
"epoch": 0.6764829842318988,
"grad_norm": 0.37109375,
"learning_rate": 7.230989648949396e-05,
"loss": 2.5697,
"step": 2370
},
{
"epoch": 0.6767684200902245,
"grad_norm": 0.36328125,
"learning_rate": 7.219369209471387e-05,
"loss": 2.569,
"step": 2371
},
{
"epoch": 0.6770538559485502,
"grad_norm": 0.357421875,
"learning_rate": 7.207755154926386e-05,
"loss": 2.5493,
"step": 2372
},
{
"epoch": 0.6773392918068759,
"grad_norm": 0.357421875,
"learning_rate": 7.196147494845127e-05,
"loss": 2.5515,
"step": 2373
},
{
"epoch": 0.6776247276652015,
"grad_norm": 0.396484375,
"learning_rate": 7.184546238753064e-05,
"loss": 2.5449,
"step": 2374
},
{
"epoch": 0.6779101635235273,
"grad_norm": 0.36328125,
"learning_rate": 7.172951396170402e-05,
"loss": 2.5657,
"step": 2375
},
{
"epoch": 0.6781955993818529,
"grad_norm": 0.376953125,
"learning_rate": 7.1613629766121e-05,
"loss": 2.5615,
"step": 2376
},
{
"epoch": 0.6784810352401787,
"grad_norm": 0.39453125,
"learning_rate": 7.149780989587825e-05,
"loss": 2.5787,
"step": 2377
},
{
"epoch": 0.6787664710985044,
"grad_norm": 0.359375,
"learning_rate": 7.138205444601985e-05,
"loss": 2.5632,
"step": 2378
},
{
"epoch": 0.67905190695683,
"grad_norm": 0.375,
"learning_rate": 7.126636351153684e-05,
"loss": 2.5594,
"step": 2379
},
{
"epoch": 0.6793373428151558,
"grad_norm": 0.373046875,
"learning_rate": 7.115073718736735e-05,
"loss": 2.55,
"step": 2380
},
{
"epoch": 0.6796227786734814,
"grad_norm": 0.357421875,
"learning_rate": 7.10351755683966e-05,
"loss": 2.5493,
"step": 2381
},
{
"epoch": 0.6799082145318072,
"grad_norm": 0.3671875,
"learning_rate": 7.09196787494567e-05,
"loss": 2.54,
"step": 2382
},
{
"epoch": 0.6801936503901328,
"grad_norm": 0.35546875,
"learning_rate": 7.08042468253264e-05,
"loss": 2.5681,
"step": 2383
},
{
"epoch": 0.6804790862484585,
"grad_norm": 0.375,
"learning_rate": 7.068887989073143e-05,
"loss": 2.5505,
"step": 2384
},
{
"epoch": 0.6807645221067842,
"grad_norm": 0.388671875,
"learning_rate": 7.057357804034404e-05,
"loss": 2.5489,
"step": 2385
},
{
"epoch": 0.6810499579651099,
"grad_norm": 0.373046875,
"learning_rate": 7.045834136878308e-05,
"loss": 2.5669,
"step": 2386
},
{
"epoch": 0.6813353938234357,
"grad_norm": 0.373046875,
"learning_rate": 7.0343169970614e-05,
"loss": 2.5354,
"step": 2387
},
{
"epoch": 0.6816208296817613,
"grad_norm": 0.359375,
"learning_rate": 7.022806394034856e-05,
"loss": 2.5571,
"step": 2388
},
{
"epoch": 0.681906265540087,
"grad_norm": 0.369140625,
"learning_rate": 7.0113023372445e-05,
"loss": 2.5556,
"step": 2389
},
{
"epoch": 0.6821917013984127,
"grad_norm": 0.36328125,
"learning_rate": 6.999804836130784e-05,
"loss": 2.5822,
"step": 2390
},
{
"epoch": 0.6824771372567384,
"grad_norm": 0.365234375,
"learning_rate": 6.988313900128769e-05,
"loss": 2.5923,
"step": 2391
},
{
"epoch": 0.682762573115064,
"grad_norm": 0.384765625,
"learning_rate": 6.97682953866813e-05,
"loss": 2.5303,
"step": 2392
},
{
"epoch": 0.6830480089733898,
"grad_norm": 0.37109375,
"learning_rate": 6.965351761173165e-05,
"loss": 2.5794,
"step": 2393
},
{
"epoch": 0.6833334448317155,
"grad_norm": 0.35546875,
"learning_rate": 6.953880577062745e-05,
"loss": 2.582,
"step": 2394
},
{
"epoch": 0.6836188806900412,
"grad_norm": 0.37109375,
"learning_rate": 6.94241599575034e-05,
"loss": 2.5485,
"step": 2395
},
{
"epoch": 0.6839043165483669,
"grad_norm": 0.361328125,
"learning_rate": 6.930958026644005e-05,
"loss": 2.5524,
"step": 2396
},
{
"epoch": 0.6841897524066926,
"grad_norm": 0.36328125,
"learning_rate": 6.919506679146372e-05,
"loss": 2.5754,
"step": 2397
},
{
"epoch": 0.6844751882650183,
"grad_norm": 0.357421875,
"learning_rate": 6.908061962654626e-05,
"loss": 2.5647,
"step": 2398
},
{
"epoch": 0.6847606241233439,
"grad_norm": 0.373046875,
"learning_rate": 6.896623886560528e-05,
"loss": 2.567,
"step": 2399
},
{
"epoch": 0.6850460599816697,
"grad_norm": 0.36328125,
"learning_rate": 6.885192460250366e-05,
"loss": 2.5596,
"step": 2400
},
{
"epoch": 0.6853314958399953,
"grad_norm": 0.40234375,
"learning_rate": 6.873767693105e-05,
"loss": 2.5652,
"step": 2401
},
{
"epoch": 0.685616931698321,
"grad_norm": 0.369140625,
"learning_rate": 6.8623495944998e-05,
"loss": 2.5612,
"step": 2402
},
{
"epoch": 0.6859023675566468,
"grad_norm": 0.37109375,
"learning_rate": 6.850938173804672e-05,
"loss": 2.5595,
"step": 2403
},
{
"epoch": 0.6861878034149724,
"grad_norm": 0.380859375,
"learning_rate": 6.839533440384051e-05,
"loss": 2.5805,
"step": 2404
},
{
"epoch": 0.6864732392732982,
"grad_norm": 0.353515625,
"learning_rate": 6.82813540359688e-05,
"loss": 2.5742,
"step": 2405
},
{
"epoch": 0.6867586751316238,
"grad_norm": 0.365234375,
"learning_rate": 6.816744072796592e-05,
"loss": 2.5801,
"step": 2406
},
{
"epoch": 0.6870441109899496,
"grad_norm": 0.365234375,
"learning_rate": 6.805359457331144e-05,
"loss": 2.5545,
"step": 2407
},
{
"epoch": 0.6873295468482752,
"grad_norm": 0.369140625,
"learning_rate": 6.793981566542957e-05,
"loss": 2.553,
"step": 2408
},
{
"epoch": 0.6876149827066009,
"grad_norm": 0.365234375,
"learning_rate": 6.78261040976894e-05,
"loss": 2.5477,
"step": 2409
},
{
"epoch": 0.6879004185649267,
"grad_norm": 0.36328125,
"learning_rate": 6.771245996340491e-05,
"loss": 2.5584,
"step": 2410
},
{
"epoch": 0.6881858544232523,
"grad_norm": 0.4453125,
"learning_rate": 6.759888335583458e-05,
"loss": 2.5786,
"step": 2411
},
{
"epoch": 0.688471290281578,
"grad_norm": 0.34765625,
"learning_rate": 6.748537436818142e-05,
"loss": 2.5663,
"step": 2412
},
{
"epoch": 0.6887567261399037,
"grad_norm": 0.38671875,
"learning_rate": 6.737193309359324e-05,
"loss": 2.5402,
"step": 2413
},
{
"epoch": 0.6890421619982294,
"grad_norm": 0.353515625,
"learning_rate": 6.7258559625162e-05,
"loss": 2.5748,
"step": 2414
},
{
"epoch": 0.6893275978565551,
"grad_norm": 0.357421875,
"learning_rate": 6.714525405592412e-05,
"loss": 2.5759,
"step": 2415
},
{
"epoch": 0.6896130337148808,
"grad_norm": 0.3828125,
"learning_rate": 6.703201647886034e-05,
"loss": 2.5636,
"step": 2416
},
{
"epoch": 0.6898984695732064,
"grad_norm": 0.4765625,
"learning_rate": 6.691884698689548e-05,
"loss": 2.5573,
"step": 2417
},
{
"epoch": 0.6901839054315322,
"grad_norm": 0.369140625,
"learning_rate": 6.680574567289864e-05,
"loss": 2.5802,
"step": 2418
},
{
"epoch": 0.6904693412898579,
"grad_norm": 0.373046875,
"learning_rate": 6.66927126296829e-05,
"loss": 2.5497,
"step": 2419
},
{
"epoch": 0.6907547771481836,
"grad_norm": 0.36328125,
"learning_rate": 6.657974795000525e-05,
"loss": 2.5806,
"step": 2420
},
{
"epoch": 0.6910402130065093,
"grad_norm": 0.37109375,
"learning_rate": 6.646685172656667e-05,
"loss": 2.5485,
"step": 2421
},
{
"epoch": 0.6913256488648349,
"grad_norm": 0.37109375,
"learning_rate": 6.6354024052012e-05,
"loss": 2.5518,
"step": 2422
},
{
"epoch": 0.6916110847231607,
"grad_norm": 0.373046875,
"learning_rate": 6.62412650189297e-05,
"loss": 2.5628,
"step": 2423
},
{
"epoch": 0.6918965205814863,
"grad_norm": 0.349609375,
"learning_rate": 6.612857471985203e-05,
"loss": 2.5364,
"step": 2424
},
{
"epoch": 0.6921819564398121,
"grad_norm": 0.365234375,
"learning_rate": 6.601595324725474e-05,
"loss": 2.5879,
"step": 2425
},
{
"epoch": 0.6924673922981378,
"grad_norm": 0.353515625,
"learning_rate": 6.590340069355713e-05,
"loss": 2.5652,
"step": 2426
},
{
"epoch": 0.6927528281564634,
"grad_norm": 0.37109375,
"learning_rate": 6.579091715112201e-05,
"loss": 2.544,
"step": 2427
},
{
"epoch": 0.6930382640147892,
"grad_norm": 0.384765625,
"learning_rate": 6.567850271225543e-05,
"loss": 2.5717,
"step": 2428
},
{
"epoch": 0.6933236998731148,
"grad_norm": 0.37109375,
"learning_rate": 6.556615746920685e-05,
"loss": 2.5632,
"step": 2429
},
{
"epoch": 0.6936091357314406,
"grad_norm": 0.3515625,
"learning_rate": 6.545388151416896e-05,
"loss": 2.544,
"step": 2430
},
{
"epoch": 0.6938945715897662,
"grad_norm": 0.36328125,
"learning_rate": 6.534167493927748e-05,
"loss": 2.5697,
"step": 2431
},
{
"epoch": 0.6941800074480919,
"grad_norm": 0.35546875,
"learning_rate": 6.522953783661121e-05,
"loss": 2.5455,
"step": 2432
},
{
"epoch": 0.6944654433064176,
"grad_norm": 0.404296875,
"learning_rate": 6.511747029819207e-05,
"loss": 2.5844,
"step": 2433
},
{
"epoch": 0.6947508791647433,
"grad_norm": 0.36328125,
"learning_rate": 6.500547241598478e-05,
"loss": 2.5579,
"step": 2434
},
{
"epoch": 0.6950363150230691,
"grad_norm": 0.3828125,
"learning_rate": 6.489354428189683e-05,
"loss": 2.5542,
"step": 2435
},
{
"epoch": 0.6953217508813947,
"grad_norm": 0.3671875,
"learning_rate": 6.478168598777864e-05,
"loss": 2.5787,
"step": 2436
},
{
"epoch": 0.6956071867397204,
"grad_norm": 0.39453125,
"learning_rate": 6.466989762542332e-05,
"loss": 2.5676,
"step": 2437
},
{
"epoch": 0.6958926225980461,
"grad_norm": 0.3671875,
"learning_rate": 6.455817928656636e-05,
"loss": 2.5601,
"step": 2438
},
{
"epoch": 0.6961780584563718,
"grad_norm": 0.33984375,
"learning_rate": 6.444653106288612e-05,
"loss": 2.5721,
"step": 2439
},
{
"epoch": 0.6964634943146975,
"grad_norm": 0.42578125,
"learning_rate": 6.433495304600306e-05,
"loss": 2.5427,
"step": 2440
},
{
"epoch": 0.6967489301730232,
"grad_norm": 0.361328125,
"learning_rate": 6.422344532748039e-05,
"loss": 2.5505,
"step": 2441
},
{
"epoch": 0.6970343660313489,
"grad_norm": 0.384765625,
"learning_rate": 6.411200799882338e-05,
"loss": 2.5491,
"step": 2442
},
{
"epoch": 0.6973198018896746,
"grad_norm": 0.36328125,
"learning_rate": 6.400064115147955e-05,
"loss": 2.5645,
"step": 2443
},
{
"epoch": 0.6976052377480003,
"grad_norm": 0.34765625,
"learning_rate": 6.38893448768387e-05,
"loss": 2.5374,
"step": 2444
},
{
"epoch": 0.697890673606326,
"grad_norm": 0.3515625,
"learning_rate": 6.377811926623273e-05,
"loss": 2.5343,
"step": 2445
},
{
"epoch": 0.6981761094646517,
"grad_norm": 0.345703125,
"learning_rate": 6.366696441093536e-05,
"loss": 2.6022,
"step": 2446
},
{
"epoch": 0.6984615453229773,
"grad_norm": 0.365234375,
"learning_rate": 6.355588040216248e-05,
"loss": 2.5745,
"step": 2447
},
{
"epoch": 0.6987469811813031,
"grad_norm": 0.390625,
"learning_rate": 6.344486733107168e-05,
"loss": 2.5623,
"step": 2448
},
{
"epoch": 0.6990324170396287,
"grad_norm": 0.353515625,
"learning_rate": 6.333392528876233e-05,
"loss": 2.567,
"step": 2449
},
{
"epoch": 0.6993178528979545,
"grad_norm": 0.359375,
"learning_rate": 6.32230543662757e-05,
"loss": 2.5734,
"step": 2450
},
{
"epoch": 0.6996032887562802,
"grad_norm": 0.38671875,
"learning_rate": 6.311225465459442e-05,
"loss": 2.5358,
"step": 2451
},
{
"epoch": 0.6998887246146058,
"grad_norm": 0.369140625,
"learning_rate": 6.300152624464296e-05,
"loss": 2.5494,
"step": 2452
},
{
"epoch": 0.7001741604729316,
"grad_norm": 0.3515625,
"learning_rate": 6.289086922728712e-05,
"loss": 2.5602,
"step": 2453
},
{
"epoch": 0.7004595963312572,
"grad_norm": 0.3515625,
"learning_rate": 6.278028369333413e-05,
"loss": 2.5788,
"step": 2454
},
{
"epoch": 0.700745032189583,
"grad_norm": 0.392578125,
"learning_rate": 6.266976973353252e-05,
"loss": 2.5591,
"step": 2455
},
{
"epoch": 0.7010304680479086,
"grad_norm": 0.3671875,
"learning_rate": 6.255932743857226e-05,
"loss": 2.5517,
"step": 2456
},
{
"epoch": 0.7013159039062343,
"grad_norm": 0.353515625,
"learning_rate": 6.244895689908426e-05,
"loss": 2.5502,
"step": 2457
},
{
"epoch": 0.70160133976456,
"grad_norm": 0.373046875,
"learning_rate": 6.233865820564079e-05,
"loss": 2.5815,
"step": 2458
},
{
"epoch": 0.7018867756228857,
"grad_norm": 0.353515625,
"learning_rate": 6.222843144875492e-05,
"loss": 2.5633,
"step": 2459
},
{
"epoch": 0.7021722114812115,
"grad_norm": 0.373046875,
"learning_rate": 6.211827671888098e-05,
"loss": 2.5513,
"step": 2460
},
{
"epoch": 0.7024576473395371,
"grad_norm": 0.380859375,
"learning_rate": 6.200819410641385e-05,
"loss": 2.569,
"step": 2461
},
{
"epoch": 0.7027430831978628,
"grad_norm": 0.37109375,
"learning_rate": 6.189818370168956e-05,
"loss": 2.559,
"step": 2462
},
{
"epoch": 0.7030285190561885,
"grad_norm": 0.369140625,
"learning_rate": 6.17882455949846e-05,
"loss": 2.5625,
"step": 2463
},
{
"epoch": 0.7033139549145142,
"grad_norm": 0.359375,
"learning_rate": 6.16783798765164e-05,
"loss": 2.552,
"step": 2464
},
{
"epoch": 0.7035993907728398,
"grad_norm": 0.365234375,
"learning_rate": 6.156858663644277e-05,
"loss": 2.5329,
"step": 2465
},
{
"epoch": 0.7038848266311656,
"grad_norm": 0.33984375,
"learning_rate": 6.145886596486208e-05,
"loss": 2.5371,
"step": 2466
},
{
"epoch": 0.7041702624894913,
"grad_norm": 0.337890625,
"learning_rate": 6.134921795181324e-05,
"loss": 2.561,
"step": 2467
},
{
"epoch": 0.704455698347817,
"grad_norm": 0.34765625,
"learning_rate": 6.123964268727554e-05,
"loss": 2.5607,
"step": 2468
},
{
"epoch": 0.7047411342061427,
"grad_norm": 0.3515625,
"learning_rate": 6.113014026116841e-05,
"loss": 2.5781,
"step": 2469
},
{
"epoch": 0.7050265700644683,
"grad_norm": 0.369140625,
"learning_rate": 6.102071076335173e-05,
"loss": 2.5742,
"step": 2470
},
{
"epoch": 0.7053120059227941,
"grad_norm": 0.341796875,
"learning_rate": 6.091135428362536e-05,
"loss": 2.5736,
"step": 2471
},
{
"epoch": 0.7055974417811197,
"grad_norm": 0.36328125,
"learning_rate": 6.0802070911729246e-05,
"loss": 2.5795,
"step": 2472
},
{
"epoch": 0.7058828776394455,
"grad_norm": 0.357421875,
"learning_rate": 6.06928607373435e-05,
"loss": 2.5563,
"step": 2473
},
{
"epoch": 0.7061683134977711,
"grad_norm": 0.357421875,
"learning_rate": 6.058372385008801e-05,
"loss": 2.5287,
"step": 2474
},
{
"epoch": 0.7064537493560968,
"grad_norm": 0.34765625,
"learning_rate": 6.047466033952245e-05,
"loss": 2.5752,
"step": 2475
},
{
"epoch": 0.7067391852144226,
"grad_norm": 0.34765625,
"learning_rate": 6.036567029514665e-05,
"loss": 2.5511,
"step": 2476
},
{
"epoch": 0.7070246210727482,
"grad_norm": 0.357421875,
"learning_rate": 6.025675380639976e-05,
"loss": 2.5685,
"step": 2477
},
{
"epoch": 0.707310056931074,
"grad_norm": 0.357421875,
"learning_rate": 6.0147910962660684e-05,
"loss": 2.577,
"step": 2478
},
{
"epoch": 0.7075954927893996,
"grad_norm": 0.3671875,
"learning_rate": 6.003914185324802e-05,
"loss": 2.5451,
"step": 2479
},
{
"epoch": 0.7078809286477253,
"grad_norm": 0.349609375,
"learning_rate": 5.993044656741965e-05,
"loss": 2.5405,
"step": 2480
},
{
"epoch": 0.708166364506051,
"grad_norm": 0.34765625,
"learning_rate": 5.982182519437311e-05,
"loss": 2.5569,
"step": 2481
},
{
"epoch": 0.7084518003643767,
"grad_norm": 0.373046875,
"learning_rate": 5.971327782324508e-05,
"loss": 2.5454,
"step": 2482
},
{
"epoch": 0.7087372362227025,
"grad_norm": 0.369140625,
"learning_rate": 5.960480454311155e-05,
"loss": 2.5725,
"step": 2483
},
{
"epoch": 0.7090226720810281,
"grad_norm": 0.34375,
"learning_rate": 5.949640544298779e-05,
"loss": 2.5612,
"step": 2484
},
{
"epoch": 0.7093081079393538,
"grad_norm": 0.3359375,
"learning_rate": 5.938808061182823e-05,
"loss": 2.5581,
"step": 2485
},
{
"epoch": 0.7095935437976795,
"grad_norm": 0.34765625,
"learning_rate": 5.927983013852614e-05,
"loss": 2.5476,
"step": 2486
},
{
"epoch": 0.7098789796560052,
"grad_norm": 0.359375,
"learning_rate": 5.917165411191405e-05,
"loss": 2.5592,
"step": 2487
},
{
"epoch": 0.7101644155143308,
"grad_norm": 0.36328125,
"learning_rate": 5.906355262076317e-05,
"loss": 2.5649,
"step": 2488
},
{
"epoch": 0.7104498513726566,
"grad_norm": 0.3515625,
"learning_rate": 5.895552575378361e-05,
"loss": 2.5849,
"step": 2489
},
{
"epoch": 0.7107352872309822,
"grad_norm": 0.34765625,
"learning_rate": 5.8847573599624335e-05,
"loss": 2.5812,
"step": 2490
},
{
"epoch": 0.711020723089308,
"grad_norm": 0.365234375,
"learning_rate": 5.8739696246872853e-05,
"loss": 2.5425,
"step": 2491
},
{
"epoch": 0.7113061589476337,
"grad_norm": 0.353515625,
"learning_rate": 5.863189378405541e-05,
"loss": 2.554,
"step": 2492
},
{
"epoch": 0.7115915948059593,
"grad_norm": 0.361328125,
"learning_rate": 5.8524166299636785e-05,
"loss": 2.5374,
"step": 2493
},
{
"epoch": 0.7118770306642851,
"grad_norm": 0.353515625,
"learning_rate": 5.841651388202015e-05,
"loss": 2.5079,
"step": 2494
},
{
"epoch": 0.7121624665226107,
"grad_norm": 0.380859375,
"learning_rate": 5.8308936619547076e-05,
"loss": 2.5421,
"step": 2495
},
{
"epoch": 0.7124479023809365,
"grad_norm": 0.376953125,
"learning_rate": 5.820143460049759e-05,
"loss": 2.5617,
"step": 2496
},
{
"epoch": 0.7127333382392621,
"grad_norm": 0.3515625,
"learning_rate": 5.809400791308978e-05,
"loss": 2.5253,
"step": 2497
},
{
"epoch": 0.7130187740975878,
"grad_norm": 0.34765625,
"learning_rate": 5.798665664548015e-05,
"loss": 2.5518,
"step": 2498
},
{
"epoch": 0.7133042099559136,
"grad_norm": 0.369140625,
"learning_rate": 5.787938088576305e-05,
"loss": 2.5575,
"step": 2499
},
{
"epoch": 0.7135896458142392,
"grad_norm": 0.359375,
"learning_rate": 5.777218072197113e-05,
"loss": 2.5604,
"step": 2500
},
{
"epoch": 0.7135896458142392,
"eval_loss": 2.4628705978393555,
"eval_runtime": 5982.5105,
"eval_samples_per_second": 10.746,
"eval_steps_per_second": 10.746,
"step": 2500
}
],
"logging_steps": 1,
"max_steps": 3503,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"total_flos": 9.70632734441472e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}