Files
CareBot_Medical_multi-llama…/trainer_state.json
2024-10-16 18:01:15 +08:00

34714 lines
773 KiB
JSON

{
"best_metric": 0.8676137924194336,
"best_model_checkpoint": "/share/project/zhaolulu/LLama-factory/saves/llama3-8B/full/med_llama3_v4/checkpoint-4900",
"epoch": 1.9995919200163232,
"eval_steps": 100,
"global_step": 4900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 339.91975405718694,
"learning_rate": 1e-07,
"loss": 1.7522,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 385.0343149275365,
"learning_rate": 2e-07,
"loss": 1.6698,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 217.16990109368848,
"learning_rate": 3e-07,
"loss": 1.477,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 59.020222478434036,
"learning_rate": 4e-07,
"loss": 1.7423,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 84.30213602570623,
"learning_rate": 5e-07,
"loss": 1.4437,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 190.93685227534522,
"learning_rate": 6e-07,
"loss": 1.4204,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 427.9251219742702,
"learning_rate": 7e-07,
"loss": 2.0805,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 472.2283030148613,
"learning_rate": 8e-07,
"loss": 1.7254,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 211.7388409142181,
"learning_rate": 9e-07,
"loss": 1.4883,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 458.30024463064365,
"learning_rate": 1e-06,
"loss": 1.574,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 141.00698111144882,
"learning_rate": 1.1e-06,
"loss": 1.4417,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 167.99840817001552,
"learning_rate": 1.2e-06,
"loss": 1.487,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 181.18818741954686,
"learning_rate": 1.3e-06,
"loss": 1.3476,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 107.95793944057903,
"learning_rate": 1.4e-06,
"loss": 1.315,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 70.68193840944636,
"learning_rate": 1.5e-06,
"loss": 1.3462,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 28.194353482015078,
"learning_rate": 1.6e-06,
"loss": 1.1409,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 10.09210224370722,
"learning_rate": 1.6999999999999998e-06,
"loss": 1.2671,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 58.40530962890393,
"learning_rate": 1.8e-06,
"loss": 1.2577,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 73.76364225357823,
"learning_rate": 1.8999999999999998e-06,
"loss": 1.1093,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 70.78009134224173,
"learning_rate": 2e-06,
"loss": 1.2377,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 55.47167428271892,
"learning_rate": 1.999999908153673e-06,
"loss": 1.1889,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 83.1042688968126,
"learning_rate": 1.9999996326147086e-06,
"loss": 1.1271,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 90.56701121867007,
"learning_rate": 1.999999173383157e-06,
"loss": 1.2201,
"step": 23
},
{
"epoch": 0.01,
"grad_norm": 7.141759754496498,
"learning_rate": 1.9999985304591036e-06,
"loss": 1.1423,
"step": 24
},
{
"epoch": 0.01,
"grad_norm": 33.72224032314947,
"learning_rate": 1.9999977038426654e-06,
"loss": 1.1821,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 71.88893468427116,
"learning_rate": 1.9999966935339955e-06,
"loss": 1.1234,
"step": 26
},
{
"epoch": 0.01,
"grad_norm": 70.56651989079585,
"learning_rate": 1.9999954995332777e-06,
"loss": 1.1789,
"step": 27
},
{
"epoch": 0.01,
"grad_norm": 23.98480836394711,
"learning_rate": 1.9999941218407332e-06,
"loss": 1.1098,
"step": 28
},
{
"epoch": 0.01,
"grad_norm": 51.405460686969995,
"learning_rate": 1.9999925604566143e-06,
"loss": 1.0898,
"step": 29
},
{
"epoch": 0.01,
"grad_norm": 43.28455092451016,
"learning_rate": 1.9999908153812073e-06,
"loss": 1.1261,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 62.349308709470144,
"learning_rate": 1.9999888866148333e-06,
"loss": 1.135,
"step": 31
},
{
"epoch": 0.01,
"grad_norm": 28.988538021846505,
"learning_rate": 1.9999867741578463e-06,
"loss": 1.1441,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 6.448672947172309,
"learning_rate": 1.999984478010635e-06,
"loss": 1.0653,
"step": 33
},
{
"epoch": 0.01,
"grad_norm": 42.95720616527255,
"learning_rate": 1.9999819981736206e-06,
"loss": 1.1115,
"step": 34
},
{
"epoch": 0.01,
"grad_norm": 87.46380844583713,
"learning_rate": 1.999979334647259e-06,
"loss": 1.157,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 194.61534308676295,
"learning_rate": 1.9999764874320387e-06,
"loss": 1.109,
"step": 36
},
{
"epoch": 0.02,
"grad_norm": 21.60235952627466,
"learning_rate": 1.999973456528484e-06,
"loss": 1.2009,
"step": 37
},
{
"epoch": 0.02,
"grad_norm": 22.13549530557716,
"learning_rate": 1.9999702419371503e-06,
"loss": 1.0869,
"step": 38
},
{
"epoch": 0.02,
"grad_norm": 7.949980871389036,
"learning_rate": 1.9999668436586287e-06,
"loss": 1.0941,
"step": 39
},
{
"epoch": 0.02,
"grad_norm": 8.399491775940783,
"learning_rate": 1.9999632616935437e-06,
"loss": 1.1252,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 8.201930401657817,
"learning_rate": 1.999959496042553e-06,
"loss": 1.1132,
"step": 41
},
{
"epoch": 0.02,
"grad_norm": 7.530019717545176,
"learning_rate": 1.999955546706348e-06,
"loss": 1.0995,
"step": 42
},
{
"epoch": 0.02,
"grad_norm": 32.714163362270355,
"learning_rate": 1.999951413685655e-06,
"loss": 1.1313,
"step": 43
},
{
"epoch": 0.02,
"grad_norm": 19.108603557564876,
"learning_rate": 1.9999470969812325e-06,
"loss": 1.1093,
"step": 44
},
{
"epoch": 0.02,
"grad_norm": 11.81911034783207,
"learning_rate": 1.9999425965938735e-06,
"loss": 1.1335,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 5.921218810407937,
"learning_rate": 1.9999379125244053e-06,
"loss": 1.1145,
"step": 46
},
{
"epoch": 0.02,
"grad_norm": 7.3950025841959395,
"learning_rate": 1.9999330447736876e-06,
"loss": 1.1806,
"step": 47
},
{
"epoch": 0.02,
"grad_norm": 23.027552607825008,
"learning_rate": 1.9999279933426146e-06,
"loss": 1.0825,
"step": 48
},
{
"epoch": 0.02,
"grad_norm": 30.587276075612916,
"learning_rate": 1.999922758232115e-06,
"loss": 1.1427,
"step": 49
},
{
"epoch": 0.02,
"grad_norm": 28.379564435971453,
"learning_rate": 1.9999173394431494e-06,
"loss": 1.106,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 12.020799160789908,
"learning_rate": 1.999911736976714e-06,
"loss": 1.128,
"step": 51
},
{
"epoch": 0.02,
"grad_norm": 13.949664947580269,
"learning_rate": 1.9999059508338378e-06,
"loss": 1.081,
"step": 52
},
{
"epoch": 0.02,
"grad_norm": 33.33448751130386,
"learning_rate": 1.999899981015583e-06,
"loss": 1.1035,
"step": 53
},
{
"epoch": 0.02,
"grad_norm": 3.9668368456755276,
"learning_rate": 1.999893827523047e-06,
"loss": 1.1161,
"step": 54
},
{
"epoch": 0.02,
"grad_norm": 20.305312141455982,
"learning_rate": 1.99988749035736e-06,
"loss": 1.1553,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 42.06911241416845,
"learning_rate": 1.999880969519686e-06,
"loss": 1.108,
"step": 56
},
{
"epoch": 0.02,
"grad_norm": 6.678007842323346,
"learning_rate": 1.999874265011222e-06,
"loss": 1.1029,
"step": 57
},
{
"epoch": 0.02,
"grad_norm": 6.32880115475331,
"learning_rate": 1.9998673768332017e-06,
"loss": 1.0707,
"step": 58
},
{
"epoch": 0.02,
"grad_norm": 17.235633351451884,
"learning_rate": 1.999860304986888e-06,
"loss": 1.0253,
"step": 59
},
{
"epoch": 0.02,
"grad_norm": 19.33072299321923,
"learning_rate": 1.9998530494735816e-06,
"loss": 1.0884,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 5.909475220704763,
"learning_rate": 1.9998456102946145e-06,
"loss": 1.0653,
"step": 61
},
{
"epoch": 0.03,
"grad_norm": 5.084332808704594,
"learning_rate": 1.999837987451353e-06,
"loss": 1.101,
"step": 62
},
{
"epoch": 0.03,
"grad_norm": 13.287813144023774,
"learning_rate": 1.9998301809451982e-06,
"loss": 1.0504,
"step": 63
},
{
"epoch": 0.03,
"grad_norm": 12.409945338059611,
"learning_rate": 1.999822190777584e-06,
"loss": 1.0961,
"step": 64
},
{
"epoch": 0.03,
"grad_norm": 5.494743088917732,
"learning_rate": 1.999814016949977e-06,
"loss": 1.0706,
"step": 65
},
{
"epoch": 0.03,
"grad_norm": 12.184734807851129,
"learning_rate": 1.9998056594638803e-06,
"loss": 1.0565,
"step": 66
},
{
"epoch": 0.03,
"grad_norm": 6.4866628053859365,
"learning_rate": 1.999797118320828e-06,
"loss": 1.0499,
"step": 67
},
{
"epoch": 0.03,
"grad_norm": 12.637078131039903,
"learning_rate": 1.999788393522389e-06,
"loss": 1.1124,
"step": 68
},
{
"epoch": 0.03,
"grad_norm": 9.277140232978086,
"learning_rate": 1.999779485070167e-06,
"loss": 1.1095,
"step": 69
},
{
"epoch": 0.03,
"grad_norm": 22.528400717613323,
"learning_rate": 1.9997703929657968e-06,
"loss": 1.064,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 10.581564196644136,
"learning_rate": 1.9997611172109504e-06,
"loss": 1.0353,
"step": 71
},
{
"epoch": 0.03,
"grad_norm": 5.926890124864172,
"learning_rate": 1.99975165780733e-06,
"loss": 1.0512,
"step": 72
},
{
"epoch": 0.03,
"grad_norm": 6.171417728369577,
"learning_rate": 1.9997420147566747e-06,
"loss": 1.0599,
"step": 73
},
{
"epoch": 0.03,
"grad_norm": 12.67195708325269,
"learning_rate": 1.999732188060755e-06,
"loss": 1.1049,
"step": 74
},
{
"epoch": 0.03,
"grad_norm": 6.074697341875566,
"learning_rate": 1.999722177721376e-06,
"loss": 1.0347,
"step": 75
},
{
"epoch": 0.03,
"grad_norm": 9.0236033834056,
"learning_rate": 1.9997119837403767e-06,
"loss": 1.0783,
"step": 76
},
{
"epoch": 0.03,
"grad_norm": 5.651529031068592,
"learning_rate": 1.9997016061196296e-06,
"loss": 1.0758,
"step": 77
},
{
"epoch": 0.03,
"grad_norm": 14.259058356363868,
"learning_rate": 1.9996910448610414e-06,
"loss": 1.0895,
"step": 78
},
{
"epoch": 0.03,
"grad_norm": 7.231264496851399,
"learning_rate": 1.9996802999665513e-06,
"loss": 1.0394,
"step": 79
},
{
"epoch": 0.03,
"grad_norm": 7.269817091285968,
"learning_rate": 1.9996693714381338e-06,
"loss": 1.0803,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 5.2617373695090395,
"learning_rate": 1.999658259277796e-06,
"loss": 1.0132,
"step": 81
},
{
"epoch": 0.03,
"grad_norm": 13.672718154039373,
"learning_rate": 1.999646963487579e-06,
"loss": 1.1079,
"step": 82
},
{
"epoch": 0.03,
"grad_norm": 6.250980638558097,
"learning_rate": 1.999635484069558e-06,
"loss": 1.0292,
"step": 83
},
{
"epoch": 0.03,
"grad_norm": 6.548687052977075,
"learning_rate": 1.999623821025842e-06,
"loss": 1.1051,
"step": 84
},
{
"epoch": 0.03,
"grad_norm": 6.130769440584885,
"learning_rate": 1.9996119743585727e-06,
"loss": 1.0528,
"step": 85
},
{
"epoch": 0.04,
"grad_norm": 13.560517645761712,
"learning_rate": 1.999599944069927e-06,
"loss": 1.07,
"step": 86
},
{
"epoch": 0.04,
"grad_norm": 7.871620100565465,
"learning_rate": 1.999587730162114e-06,
"loss": 1.0526,
"step": 87
},
{
"epoch": 0.04,
"grad_norm": 18.819029888323147,
"learning_rate": 1.9995753326373785e-06,
"loss": 1.1093,
"step": 88
},
{
"epoch": 0.04,
"grad_norm": 13.923392710523737,
"learning_rate": 1.999562751497996e-06,
"loss": 1.0381,
"step": 89
},
{
"epoch": 0.04,
"grad_norm": 6.749464764255322,
"learning_rate": 1.999549986746279e-06,
"loss": 0.9928,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 6.714025707712751,
"learning_rate": 1.9995370383845724e-06,
"loss": 0.9976,
"step": 91
},
{
"epoch": 0.04,
"grad_norm": 4.382497266457875,
"learning_rate": 1.999523906415254e-06,
"loss": 1.034,
"step": 92
},
{
"epoch": 0.04,
"grad_norm": 14.345247340942596,
"learning_rate": 1.999510590840736e-06,
"loss": 1.0392,
"step": 93
},
{
"epoch": 0.04,
"grad_norm": 7.358299918379953,
"learning_rate": 1.9994970916634646e-06,
"loss": 1.0161,
"step": 94
},
{
"epoch": 0.04,
"grad_norm": 4.838441471161105,
"learning_rate": 1.99948340888592e-06,
"loss": 1.0389,
"step": 95
},
{
"epoch": 0.04,
"grad_norm": 20.855315724495103,
"learning_rate": 1.9994695425106147e-06,
"loss": 1.1264,
"step": 96
},
{
"epoch": 0.04,
"grad_norm": 16.180006753000555,
"learning_rate": 1.9994554925400966e-06,
"loss": 1.0911,
"step": 97
},
{
"epoch": 0.04,
"grad_norm": 5.7938406577909785,
"learning_rate": 1.9994412589769456e-06,
"loss": 1.0681,
"step": 98
},
{
"epoch": 0.04,
"grad_norm": 4.80411945021199,
"learning_rate": 1.999426841823778e-06,
"loss": 1.0186,
"step": 99
},
{
"epoch": 0.04,
"grad_norm": 7.378249146507519,
"learning_rate": 1.99941224108324e-06,
"loss": 1.0443,
"step": 100
},
{
"epoch": 0.04,
"eval_loss": 1.0393818616867065,
"eval_runtime": 463.2227,
"eval_samples_per_second": 75.238,
"eval_steps_per_second": 4.704,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 8.284806265507699,
"learning_rate": 1.9993974567580157e-06,
"loss": 1.0648,
"step": 101
},
{
"epoch": 0.04,
"grad_norm": 3.915561894093318,
"learning_rate": 1.9993824888508195e-06,
"loss": 1.0175,
"step": 102
},
{
"epoch": 0.04,
"grad_norm": 12.02920913117933,
"learning_rate": 1.9993673373644015e-06,
"loss": 1.056,
"step": 103
},
{
"epoch": 0.04,
"grad_norm": 3.7614712833569777,
"learning_rate": 1.9993520023015446e-06,
"loss": 1.0324,
"step": 104
},
{
"epoch": 0.04,
"grad_norm": 7.977090642238815,
"learning_rate": 1.999336483665066e-06,
"loss": 1.0381,
"step": 105
},
{
"epoch": 0.04,
"grad_norm": 6.883775735139576,
"learning_rate": 1.999320781457816e-06,
"loss": 1.0401,
"step": 106
},
{
"epoch": 0.04,
"grad_norm": 7.646427102046584,
"learning_rate": 1.9993048956826795e-06,
"loss": 1.0177,
"step": 107
},
{
"epoch": 0.04,
"grad_norm": 6.91150732826457,
"learning_rate": 1.9992888263425744e-06,
"loss": 1.074,
"step": 108
},
{
"epoch": 0.04,
"grad_norm": 4.575391311039725,
"learning_rate": 1.9992725734404525e-06,
"loss": 0.9857,
"step": 109
},
{
"epoch": 0.04,
"grad_norm": 5.305802626023059,
"learning_rate": 1.999256136979299e-06,
"loss": 1.027,
"step": 110
},
{
"epoch": 0.05,
"grad_norm": 11.219582821884845,
"learning_rate": 1.9992395169621333e-06,
"loss": 1.0052,
"step": 111
},
{
"epoch": 0.05,
"grad_norm": 4.984065290204285,
"learning_rate": 1.999222713392009e-06,
"loss": 1.0373,
"step": 112
},
{
"epoch": 0.05,
"grad_norm": 5.692677315997426,
"learning_rate": 1.999205726272012e-06,
"loss": 0.989,
"step": 113
},
{
"epoch": 0.05,
"grad_norm": 7.198981699558153,
"learning_rate": 1.9991885556052634e-06,
"loss": 0.9909,
"step": 114
},
{
"epoch": 0.05,
"grad_norm": 3.7226013688839408,
"learning_rate": 1.9991712013949163e-06,
"loss": 1.0239,
"step": 115
},
{
"epoch": 0.05,
"grad_norm": 5.728359812177413,
"learning_rate": 1.9991536636441597e-06,
"loss": 1.0061,
"step": 116
},
{
"epoch": 0.05,
"grad_norm": 11.35359130813659,
"learning_rate": 1.999135942356214e-06,
"loss": 1.0194,
"step": 117
},
{
"epoch": 0.05,
"grad_norm": 5.316447076634276,
"learning_rate": 1.9991180375343356e-06,
"loss": 1.0421,
"step": 118
},
{
"epoch": 0.05,
"grad_norm": 7.369540039365404,
"learning_rate": 1.9990999491818133e-06,
"loss": 1.0394,
"step": 119
},
{
"epoch": 0.05,
"grad_norm": 4.862219267085779,
"learning_rate": 1.999081677301969e-06,
"loss": 1.0128,
"step": 120
},
{
"epoch": 0.05,
"grad_norm": 9.521672579462718,
"learning_rate": 1.99906322189816e-06,
"loss": 1.0344,
"step": 121
},
{
"epoch": 0.05,
"grad_norm": 41.51761530959716,
"learning_rate": 1.9990445829737753e-06,
"loss": 1.0083,
"step": 122
},
{
"epoch": 0.05,
"grad_norm": 5.021581999604391,
"learning_rate": 1.99902576053224e-06,
"loss": 1.0108,
"step": 123
},
{
"epoch": 0.05,
"grad_norm": 5.572278752142267,
"learning_rate": 1.999006754577011e-06,
"loss": 1.077,
"step": 124
},
{
"epoch": 0.05,
"grad_norm": 7.535242509289648,
"learning_rate": 1.9989875651115796e-06,
"loss": 1.0032,
"step": 125
},
{
"epoch": 0.05,
"grad_norm": 4.94458833365794,
"learning_rate": 1.998968192139471e-06,
"loss": 1.0327,
"step": 126
},
{
"epoch": 0.05,
"grad_norm": 5.22597604280909,
"learning_rate": 1.9989486356642436e-06,
"loss": 1.0607,
"step": 127
},
{
"epoch": 0.05,
"grad_norm": 5.600608857861259,
"learning_rate": 1.99892889568949e-06,
"loss": 1.0237,
"step": 128
},
{
"epoch": 0.05,
"grad_norm": 14.930712474516863,
"learning_rate": 1.9989089722188356e-06,
"loss": 1.0281,
"step": 129
},
{
"epoch": 0.05,
"grad_norm": 17.148793559272317,
"learning_rate": 1.9988888652559414e-06,
"loss": 1.0197,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 18.206267641662425,
"learning_rate": 1.9988685748045e-06,
"loss": 0.9993,
"step": 131
},
{
"epoch": 0.05,
"grad_norm": 6.534798574412053,
"learning_rate": 1.9988481008682387e-06,
"loss": 1.0076,
"step": 132
},
{
"epoch": 0.05,
"grad_norm": 9.602550701839428,
"learning_rate": 1.9988274434509188e-06,
"loss": 1.0448,
"step": 133
},
{
"epoch": 0.05,
"grad_norm": 10.087693635682243,
"learning_rate": 1.9988066025563345e-06,
"loss": 0.9993,
"step": 134
},
{
"epoch": 0.06,
"grad_norm": 8.814548896552493,
"learning_rate": 1.9987855781883147e-06,
"loss": 0.9999,
"step": 135
},
{
"epoch": 0.06,
"grad_norm": 8.51015431028383,
"learning_rate": 1.9987643703507207e-06,
"loss": 0.9729,
"step": 136
},
{
"epoch": 0.06,
"grad_norm": 12.060883713716391,
"learning_rate": 1.998742979047449e-06,
"loss": 0.9874,
"step": 137
},
{
"epoch": 0.06,
"grad_norm": 4.873858233081547,
"learning_rate": 1.998721404282428e-06,
"loss": 1.0466,
"step": 138
},
{
"epoch": 0.06,
"grad_norm": 3.863514204785544,
"learning_rate": 1.9986996460596216e-06,
"loss": 0.9653,
"step": 139
},
{
"epoch": 0.06,
"grad_norm": 6.10799679276538,
"learning_rate": 1.9986777043830266e-06,
"loss": 0.9817,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 8.999621669803073,
"learning_rate": 1.998655579256673e-06,
"loss": 0.9738,
"step": 141
},
{
"epoch": 0.06,
"grad_norm": 20.018784670228868,
"learning_rate": 1.998633270684626e-06,
"loss": 0.9826,
"step": 142
},
{
"epoch": 0.06,
"grad_norm": 5.541213022309585,
"learning_rate": 1.9986107786709824e-06,
"loss": 1.0134,
"step": 143
},
{
"epoch": 0.06,
"grad_norm": 3.733539788566174,
"learning_rate": 1.9985881032198745e-06,
"loss": 0.989,
"step": 144
},
{
"epoch": 0.06,
"grad_norm": 16.211818146232364,
"learning_rate": 1.9985652443354673e-06,
"loss": 1.0362,
"step": 145
},
{
"epoch": 0.06,
"grad_norm": 4.689598582888043,
"learning_rate": 1.99854220202196e-06,
"loss": 0.9797,
"step": 146
},
{
"epoch": 0.06,
"grad_norm": 4.99220033727988,
"learning_rate": 1.998518976283585e-06,
"loss": 0.9428,
"step": 147
},
{
"epoch": 0.06,
"grad_norm": 7.074682301599579,
"learning_rate": 1.9984955671246097e-06,
"loss": 0.9988,
"step": 148
},
{
"epoch": 0.06,
"grad_norm": 11.907831865175005,
"learning_rate": 1.9984719745493324e-06,
"loss": 1.0118,
"step": 149
},
{
"epoch": 0.06,
"grad_norm": 9.612315864921625,
"learning_rate": 1.998448198562089e-06,
"loss": 1.0058,
"step": 150
},
{
"epoch": 0.06,
"grad_norm": 5.344494388995307,
"learning_rate": 1.998424239167245e-06,
"loss": 1.0188,
"step": 151
},
{
"epoch": 0.06,
"grad_norm": 4.10363616849046,
"learning_rate": 1.9984000963692027e-06,
"loss": 1.0433,
"step": 152
},
{
"epoch": 0.06,
"grad_norm": 11.294603557505567,
"learning_rate": 1.9983757701723972e-06,
"loss": 0.9879,
"step": 153
},
{
"epoch": 0.06,
"grad_norm": 6.98576151211412,
"learning_rate": 1.998351260581296e-06,
"loss": 0.9485,
"step": 154
},
{
"epoch": 0.06,
"grad_norm": 7.4577844382968355,
"learning_rate": 1.9983265676004023e-06,
"loss": 0.9972,
"step": 155
},
{
"epoch": 0.06,
"grad_norm": 8.604037897186739,
"learning_rate": 1.998301691234251e-06,
"loss": 0.9427,
"step": 156
},
{
"epoch": 0.06,
"grad_norm": 6.408193325685539,
"learning_rate": 1.998276631487413e-06,
"loss": 0.9922,
"step": 157
},
{
"epoch": 0.06,
"grad_norm": 9.43705518120863,
"learning_rate": 1.9982513883644904e-06,
"loss": 0.9998,
"step": 158
},
{
"epoch": 0.06,
"grad_norm": 35.04273750105757,
"learning_rate": 1.9982259618701208e-06,
"loss": 0.9812,
"step": 159
},
{
"epoch": 0.07,
"grad_norm": 22.248660872474947,
"learning_rate": 1.998200352008975e-06,
"loss": 1.0115,
"step": 160
},
{
"epoch": 0.07,
"grad_norm": 14.3385256736133,
"learning_rate": 1.998174558785757e-06,
"loss": 1.0827,
"step": 161
},
{
"epoch": 0.07,
"grad_norm": 4.137417763517529,
"learning_rate": 1.9981485822052046e-06,
"loss": 0.9995,
"step": 162
},
{
"epoch": 0.07,
"grad_norm": 6.228720133807642,
"learning_rate": 1.99812242227209e-06,
"loss": 0.9785,
"step": 163
},
{
"epoch": 0.07,
"grad_norm": 4.832487979382411,
"learning_rate": 1.9980960789912186e-06,
"loss": 0.9773,
"step": 164
},
{
"epoch": 0.07,
"grad_norm": 4.006756832890176,
"learning_rate": 1.998069552367429e-06,
"loss": 0.9861,
"step": 165
},
{
"epoch": 0.07,
"grad_norm": 6.460011618427777,
"learning_rate": 1.9980428424055945e-06,
"loss": 0.9414,
"step": 166
},
{
"epoch": 0.07,
"grad_norm": 4.7866976769350735,
"learning_rate": 1.9980159491106207e-06,
"loss": 0.9809,
"step": 167
},
{
"epoch": 0.07,
"grad_norm": 3.916152096917494,
"learning_rate": 1.997988872487449e-06,
"loss": 0.9902,
"step": 168
},
{
"epoch": 0.07,
"grad_norm": 8.214419981077427,
"learning_rate": 1.9979616125410515e-06,
"loss": 0.974,
"step": 169
},
{
"epoch": 0.07,
"grad_norm": 7.119466860214886,
"learning_rate": 1.9979341692764374e-06,
"loss": 0.9428,
"step": 170
},
{
"epoch": 0.07,
"grad_norm": 10.127173069574503,
"learning_rate": 1.997906542698647e-06,
"loss": 1.0167,
"step": 171
},
{
"epoch": 0.07,
"grad_norm": 19.822838531990307,
"learning_rate": 1.9978787328127543e-06,
"loss": 0.9128,
"step": 172
},
{
"epoch": 0.07,
"grad_norm": 4.365137845589861,
"learning_rate": 1.997850739623869e-06,
"loss": 0.9935,
"step": 173
},
{
"epoch": 0.07,
"grad_norm": 5.1353522793875115,
"learning_rate": 1.997822563137133e-06,
"loss": 0.9882,
"step": 174
},
{
"epoch": 0.07,
"grad_norm": 3.6520473672366895,
"learning_rate": 1.997794203357722e-06,
"loss": 0.9423,
"step": 175
},
{
"epoch": 0.07,
"grad_norm": 8.01170223964648,
"learning_rate": 1.997765660290845e-06,
"loss": 0.9969,
"step": 176
},
{
"epoch": 0.07,
"grad_norm": 6.809970583916374,
"learning_rate": 1.9977369339417456e-06,
"loss": 0.9234,
"step": 177
},
{
"epoch": 0.07,
"grad_norm": 7.370299940330493,
"learning_rate": 1.997708024315701e-06,
"loss": 0.9789,
"step": 178
},
{
"epoch": 0.07,
"grad_norm": 5.975009472235388,
"learning_rate": 1.9976789314180208e-06,
"loss": 0.9797,
"step": 179
},
{
"epoch": 0.07,
"grad_norm": 4.852514165814264,
"learning_rate": 1.9976496552540497e-06,
"loss": 0.9021,
"step": 180
},
{
"epoch": 0.07,
"grad_norm": 8.069885413543581,
"learning_rate": 1.9976201958291655e-06,
"loss": 0.9225,
"step": 181
},
{
"epoch": 0.07,
"grad_norm": 4.215297859208096,
"learning_rate": 1.9975905531487797e-06,
"loss": 0.9991,
"step": 182
},
{
"epoch": 0.07,
"grad_norm": 4.909701623044126,
"learning_rate": 1.9975607272183377e-06,
"loss": 0.991,
"step": 183
},
{
"epoch": 0.08,
"grad_norm": 3.58899251973664,
"learning_rate": 1.997530718043317e-06,
"loss": 0.94,
"step": 184
},
{
"epoch": 0.08,
"grad_norm": 3.681492461334373,
"learning_rate": 1.9975005256292316e-06,
"loss": 0.9294,
"step": 185
},
{
"epoch": 0.08,
"grad_norm": 4.310978692950136,
"learning_rate": 1.9974701499816273e-06,
"loss": 0.9502,
"step": 186
},
{
"epoch": 0.08,
"grad_norm": 4.939663613018935,
"learning_rate": 1.997439591106083e-06,
"loss": 1.0303,
"step": 187
},
{
"epoch": 0.08,
"grad_norm": 3.524235129079609,
"learning_rate": 1.9974088490082136e-06,
"loss": 1.0421,
"step": 188
},
{
"epoch": 0.08,
"grad_norm": 12.294087847866352,
"learning_rate": 1.9973779236936647e-06,
"loss": 1.0107,
"step": 189
},
{
"epoch": 0.08,
"grad_norm": 4.687128691098791,
"learning_rate": 1.997346815168118e-06,
"loss": 0.9213,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 6.234426099871191,
"learning_rate": 1.9973155234372875e-06,
"loss": 0.976,
"step": 191
},
{
"epoch": 0.08,
"grad_norm": 2.786080313731807,
"learning_rate": 1.9972840485069218e-06,
"loss": 0.9638,
"step": 192
},
{
"epoch": 0.08,
"grad_norm": 5.611298737262752,
"learning_rate": 1.9972523903828015e-06,
"loss": 1.0001,
"step": 193
},
{
"epoch": 0.08,
"grad_norm": 5.2942870590170985,
"learning_rate": 1.997220549070743e-06,
"loss": 1.002,
"step": 194
},
{
"epoch": 0.08,
"grad_norm": 6.390078665152651,
"learning_rate": 1.997188524576595e-06,
"loss": 0.9676,
"step": 195
},
{
"epoch": 0.08,
"grad_norm": 7.576123306960086,
"learning_rate": 1.99715631690624e-06,
"loss": 0.9629,
"step": 196
},
{
"epoch": 0.08,
"grad_norm": 5.252530976148378,
"learning_rate": 1.9971239260655947e-06,
"loss": 0.9471,
"step": 197
},
{
"epoch": 0.08,
"grad_norm": 3.8464766629015497,
"learning_rate": 1.997091352060609e-06,
"loss": 0.9812,
"step": 198
},
{
"epoch": 0.08,
"grad_norm": 9.647651856811143,
"learning_rate": 1.997058594897266e-06,
"loss": 1.03,
"step": 199
},
{
"epoch": 0.08,
"grad_norm": 3.5151632832487567,
"learning_rate": 1.997025654581583e-06,
"loss": 0.9316,
"step": 200
},
{
"epoch": 0.08,
"eval_loss": 0.9661003351211548,
"eval_runtime": 463.3531,
"eval_samples_per_second": 75.217,
"eval_steps_per_second": 4.703,
"step": 200
},
{
"epoch": 0.08,
"grad_norm": 2.9385504843531964,
"learning_rate": 1.996992531119612e-06,
"loss": 0.9487,
"step": 201
},
{
"epoch": 0.08,
"grad_norm": 3.765050749804257,
"learning_rate": 1.996959224517436e-06,
"loss": 0.9886,
"step": 202
},
{
"epoch": 0.08,
"grad_norm": 3.534070963322837,
"learning_rate": 1.9969257347811743e-06,
"loss": 0.9738,
"step": 203
},
{
"epoch": 0.08,
"grad_norm": 4.5657531950578285,
"learning_rate": 1.996892061916978e-06,
"loss": 0.9813,
"step": 204
},
{
"epoch": 0.08,
"grad_norm": 3.635121564493459,
"learning_rate": 1.996858205931033e-06,
"loss": 0.992,
"step": 205
},
{
"epoch": 0.08,
"grad_norm": 8.436749294176517,
"learning_rate": 1.996824166829558e-06,
"loss": 0.9305,
"step": 206
},
{
"epoch": 0.08,
"grad_norm": 5.906065241020118,
"learning_rate": 1.9967899446188063e-06,
"loss": 0.9896,
"step": 207
},
{
"epoch": 0.08,
"grad_norm": 3.3358490024807343,
"learning_rate": 1.996755539305064e-06,
"loss": 0.9283,
"step": 208
},
{
"epoch": 0.09,
"grad_norm": 4.409523619368786,
"learning_rate": 1.996720950894651e-06,
"loss": 0.9844,
"step": 209
},
{
"epoch": 0.09,
"grad_norm": 4.441090681598085,
"learning_rate": 1.996686179393921e-06,
"loss": 0.9613,
"step": 210
},
{
"epoch": 0.09,
"grad_norm": 3.7138159789312732,
"learning_rate": 1.996651224809261e-06,
"loss": 0.9121,
"step": 211
},
{
"epoch": 0.09,
"grad_norm": 3.95780209582353,
"learning_rate": 1.9966160871470924e-06,
"loss": 0.9307,
"step": 212
},
{
"epoch": 0.09,
"grad_norm": 3.59329408458561,
"learning_rate": 1.9965807664138696e-06,
"loss": 1.0759,
"step": 213
},
{
"epoch": 0.09,
"grad_norm": 8.940457236555044,
"learning_rate": 1.9965452626160805e-06,
"loss": 0.9942,
"step": 214
},
{
"epoch": 0.09,
"grad_norm": 9.35776711556365,
"learning_rate": 1.996509575760247e-06,
"loss": 0.9711,
"step": 215
},
{
"epoch": 0.09,
"grad_norm": 4.228339531497114,
"learning_rate": 1.996473705852925e-06,
"loss": 0.9737,
"step": 216
},
{
"epoch": 0.09,
"grad_norm": 4.108715627029094,
"learning_rate": 1.9964376529007025e-06,
"loss": 0.9499,
"step": 217
},
{
"epoch": 0.09,
"grad_norm": 3.2539883650771237,
"learning_rate": 1.996401416910203e-06,
"loss": 0.9612,
"step": 218
},
{
"epoch": 0.09,
"grad_norm": 3.941119836702424,
"learning_rate": 1.9963649978880827e-06,
"loss": 0.9738,
"step": 219
},
{
"epoch": 0.09,
"grad_norm": 3.9658748948789135,
"learning_rate": 1.9963283958410313e-06,
"loss": 0.9105,
"step": 220
},
{
"epoch": 0.09,
"grad_norm": 3.9568412196786698,
"learning_rate": 1.996291610775772e-06,
"loss": 0.9887,
"step": 221
},
{
"epoch": 0.09,
"grad_norm": 3.9145202306771965,
"learning_rate": 1.996254642699063e-06,
"loss": 0.9479,
"step": 222
},
{
"epoch": 0.09,
"grad_norm": 4.2520865655430615,
"learning_rate": 1.996217491617694e-06,
"loss": 0.9032,
"step": 223
},
{
"epoch": 0.09,
"grad_norm": 3.4105491667107604,
"learning_rate": 1.9961801575384897e-06,
"loss": 0.9667,
"step": 224
},
{
"epoch": 0.09,
"grad_norm": 3.4724662745613943,
"learning_rate": 1.9961426404683083e-06,
"loss": 0.9408,
"step": 225
},
{
"epoch": 0.09,
"grad_norm": 4.56984114598252,
"learning_rate": 1.9961049404140415e-06,
"loss": 1.0256,
"step": 226
},
{
"epoch": 0.09,
"grad_norm": 4.502345398746718,
"learning_rate": 1.9960670573826138e-06,
"loss": 1.0003,
"step": 227
},
{
"epoch": 0.09,
"grad_norm": 6.932855159749422,
"learning_rate": 1.9960289913809847e-06,
"loss": 0.9451,
"step": 228
},
{
"epoch": 0.09,
"grad_norm": 4.771399396007914,
"learning_rate": 1.9959907424161466e-06,
"loss": 1.0094,
"step": 229
},
{
"epoch": 0.09,
"grad_norm": 8.625246060785901,
"learning_rate": 1.9959523104951255e-06,
"loss": 0.9794,
"step": 230
},
{
"epoch": 0.09,
"grad_norm": 6.482126686528787,
"learning_rate": 1.9959136956249808e-06,
"loss": 0.9464,
"step": 231
},
{
"epoch": 0.09,
"grad_norm": 3.6404205546910484,
"learning_rate": 1.995874897812806e-06,
"loss": 0.9443,
"step": 232
},
{
"epoch": 0.1,
"grad_norm": 3.149098347754618,
"learning_rate": 1.9958359170657282e-06,
"loss": 0.9516,
"step": 233
},
{
"epoch": 0.1,
"grad_norm": 2.905971754362851,
"learning_rate": 1.9957967533909074e-06,
"loss": 0.9282,
"step": 234
},
{
"epoch": 0.1,
"grad_norm": 4.088784428658776,
"learning_rate": 1.995757406795538e-06,
"loss": 0.9573,
"step": 235
},
{
"epoch": 0.1,
"grad_norm": 5.3815959761356815,
"learning_rate": 1.9957178772868475e-06,
"loss": 0.9026,
"step": 236
},
{
"epoch": 0.1,
"grad_norm": 8.616971669230187,
"learning_rate": 1.995678164872097e-06,
"loss": 0.9264,
"step": 237
},
{
"epoch": 0.1,
"grad_norm": 4.6132568133245595,
"learning_rate": 1.995638269558582e-06,
"loss": 0.9241,
"step": 238
},
{
"epoch": 0.1,
"grad_norm": 3.510706446964944,
"learning_rate": 1.9955981913536302e-06,
"loss": 0.9543,
"step": 239
},
{
"epoch": 0.1,
"grad_norm": 3.2769463024507584,
"learning_rate": 1.9955579302646047e-06,
"loss": 0.947,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 3.977024105822628,
"learning_rate": 1.9955174862989e-06,
"loss": 0.9595,
"step": 241
},
{
"epoch": 0.1,
"grad_norm": 3.4412121236558737,
"learning_rate": 1.995476859463946e-06,
"loss": 0.9549,
"step": 242
},
{
"epoch": 0.1,
"grad_norm": 3.830427695200563,
"learning_rate": 1.9954360497672054e-06,
"loss": 0.9179,
"step": 243
},
{
"epoch": 0.1,
"grad_norm": 5.6013422726566455,
"learning_rate": 1.9953950572161753e-06,
"loss": 0.8908,
"step": 244
},
{
"epoch": 0.1,
"grad_norm": 9.083698286259871,
"learning_rate": 1.9953538818183845e-06,
"loss": 0.9821,
"step": 245
},
{
"epoch": 0.1,
"grad_norm": 6.829439279932517,
"learning_rate": 1.9953125235813975e-06,
"loss": 0.9668,
"step": 246
},
{
"epoch": 0.1,
"grad_norm": 6.259338572313049,
"learning_rate": 1.9952709825128113e-06,
"loss": 0.9603,
"step": 247
},
{
"epoch": 0.1,
"grad_norm": 3.832689908837347,
"learning_rate": 1.9952292586202564e-06,
"loss": 1.0133,
"step": 248
},
{
"epoch": 0.1,
"grad_norm": 5.302321664501716,
"learning_rate": 1.9951873519113976e-06,
"loss": 0.9383,
"step": 249
},
{
"epoch": 0.1,
"grad_norm": 4.570749587876467,
"learning_rate": 1.995145262393933e-06,
"loss": 0.9816,
"step": 250
},
{
"epoch": 0.1,
"grad_norm": 4.288180523782628,
"learning_rate": 1.995102990075593e-06,
"loss": 0.9279,
"step": 251
},
{
"epoch": 0.1,
"grad_norm": 3.4977744455565603,
"learning_rate": 1.995060534964144e-06,
"loss": 0.9062,
"step": 252
},
{
"epoch": 0.1,
"grad_norm": 3.459879593653582,
"learning_rate": 1.995017897067384e-06,
"loss": 0.9768,
"step": 253
},
{
"epoch": 0.1,
"grad_norm": 3.5774671457662954,
"learning_rate": 1.994975076393146e-06,
"loss": 0.9022,
"step": 254
},
{
"epoch": 0.1,
"grad_norm": 3.5145587310160735,
"learning_rate": 1.994932072949295e-06,
"loss": 0.99,
"step": 255
},
{
"epoch": 0.1,
"grad_norm": 5.7235289695123175,
"learning_rate": 1.9948888867437304e-06,
"loss": 0.9668,
"step": 256
},
{
"epoch": 0.1,
"grad_norm": 5.448951861195586,
"learning_rate": 1.9948455177843857e-06,
"loss": 0.9047,
"step": 257
},
{
"epoch": 0.11,
"grad_norm": 3.3390789555363773,
"learning_rate": 1.9948019660792273e-06,
"loss": 0.9102,
"step": 258
},
{
"epoch": 0.11,
"grad_norm": 5.901568154708476,
"learning_rate": 1.9947582316362553e-06,
"loss": 0.9639,
"step": 259
},
{
"epoch": 0.11,
"grad_norm": 5.688845411328821,
"learning_rate": 1.994714314463504e-06,
"loss": 0.8932,
"step": 260
},
{
"epoch": 0.11,
"grad_norm": 4.075791217266395,
"learning_rate": 1.9946702145690396e-06,
"loss": 0.9439,
"step": 261
},
{
"epoch": 0.11,
"grad_norm": 4.129028535291,
"learning_rate": 1.994625931960963e-06,
"loss": 1.0053,
"step": 262
},
{
"epoch": 0.11,
"grad_norm": 6.058408963629934,
"learning_rate": 1.9945814666474096e-06,
"loss": 0.9469,
"step": 263
},
{
"epoch": 0.11,
"grad_norm": 6.393405304501353,
"learning_rate": 1.9945368186365466e-06,
"loss": 0.9399,
"step": 264
},
{
"epoch": 0.11,
"grad_norm": 6.073676639857798,
"learning_rate": 1.9944919879365757e-06,
"loss": 0.9417,
"step": 265
},
{
"epoch": 0.11,
"grad_norm": 3.1186079188541544,
"learning_rate": 1.9944469745557314e-06,
"loss": 0.9974,
"step": 266
},
{
"epoch": 0.11,
"grad_norm": 6.095932936614379,
"learning_rate": 1.994401778502283e-06,
"loss": 0.9245,
"step": 267
},
{
"epoch": 0.11,
"grad_norm": 3.9927046082834905,
"learning_rate": 1.994356399784533e-06,
"loss": 0.9491,
"step": 268
},
{
"epoch": 0.11,
"grad_norm": 8.59663446244483,
"learning_rate": 1.9943108384108167e-06,
"loss": 0.9836,
"step": 269
},
{
"epoch": 0.11,
"grad_norm": 3.686022879208509,
"learning_rate": 1.9942650943895034e-06,
"loss": 0.9559,
"step": 270
},
{
"epoch": 0.11,
"grad_norm": 4.383383576178454,
"learning_rate": 1.994219167728995e-06,
"loss": 1.0379,
"step": 271
},
{
"epoch": 0.11,
"grad_norm": 9.579388951537883,
"learning_rate": 1.9941730584377295e-06,
"loss": 0.9952,
"step": 272
},
{
"epoch": 0.11,
"grad_norm": 3.154318001896529,
"learning_rate": 1.994126766524176e-06,
"loss": 0.9019,
"step": 273
},
{
"epoch": 0.11,
"grad_norm": 4.382606465350842,
"learning_rate": 1.9940802919968386e-06,
"loss": 0.9501,
"step": 274
},
{
"epoch": 0.11,
"grad_norm": 2.9333529359561097,
"learning_rate": 1.994033634864253e-06,
"loss": 0.9862,
"step": 275
},
{
"epoch": 0.11,
"grad_norm": 4.959892987229748,
"learning_rate": 1.993986795134991e-06,
"loss": 0.9472,
"step": 276
},
{
"epoch": 0.11,
"grad_norm": 4.489954498664958,
"learning_rate": 1.9939397728176565e-06,
"loss": 0.977,
"step": 277
},
{
"epoch": 0.11,
"grad_norm": 3.7655089181443926,
"learning_rate": 1.993892567920887e-06,
"loss": 0.9771,
"step": 278
},
{
"epoch": 0.11,
"grad_norm": 4.039445445445999,
"learning_rate": 1.9938451804533535e-06,
"loss": 0.9986,
"step": 279
},
{
"epoch": 0.11,
"grad_norm": 4.0790740541386805,
"learning_rate": 1.9937976104237607e-06,
"loss": 1.006,
"step": 280
},
{
"epoch": 0.11,
"grad_norm": 3.2266784977347087,
"learning_rate": 1.9937498578408475e-06,
"loss": 0.9471,
"step": 281
},
{
"epoch": 0.12,
"grad_norm": 5.673632405054337,
"learning_rate": 1.9937019227133848e-06,
"loss": 0.9683,
"step": 282
},
{
"epoch": 0.12,
"grad_norm": 5.890179125236931,
"learning_rate": 1.9936538050501788e-06,
"loss": 0.9094,
"step": 283
},
{
"epoch": 0.12,
"grad_norm": 4.407183363738669,
"learning_rate": 1.993605504860068e-06,
"loss": 1.0398,
"step": 284
},
{
"epoch": 0.12,
"grad_norm": 4.467816298238836,
"learning_rate": 1.9935570221519243e-06,
"loss": 0.9075,
"step": 285
},
{
"epoch": 0.12,
"grad_norm": 3.4668205405112267,
"learning_rate": 1.993508356934654e-06,
"loss": 0.9214,
"step": 286
},
{
"epoch": 0.12,
"grad_norm": 7.923273076473722,
"learning_rate": 1.9934595092171975e-06,
"loss": 0.8714,
"step": 287
},
{
"epoch": 0.12,
"grad_norm": 3.6937783921375686,
"learning_rate": 1.993410479008526e-06,
"loss": 0.9416,
"step": 288
},
{
"epoch": 0.12,
"grad_norm": 6.666324359701347,
"learning_rate": 1.9933612663176472e-06,
"loss": 0.9558,
"step": 289
},
{
"epoch": 0.12,
"grad_norm": 6.628637193584086,
"learning_rate": 1.993311871153601e-06,
"loss": 0.9829,
"step": 290
},
{
"epoch": 0.12,
"grad_norm": 6.759423982048169,
"learning_rate": 1.9932622935254602e-06,
"loss": 0.9458,
"step": 291
},
{
"epoch": 0.12,
"grad_norm": 18.306800116680012,
"learning_rate": 1.9932125334423328e-06,
"loss": 0.9367,
"step": 292
},
{
"epoch": 0.12,
"grad_norm": 3.816159722205097,
"learning_rate": 1.9931625909133584e-06,
"loss": 0.8722,
"step": 293
},
{
"epoch": 0.12,
"grad_norm": 3.448148075552452,
"learning_rate": 1.993112465947712e-06,
"loss": 0.9458,
"step": 294
},
{
"epoch": 0.12,
"grad_norm": 4.030606318136953,
"learning_rate": 1.993062158554601e-06,
"loss": 0.97,
"step": 295
},
{
"epoch": 0.12,
"grad_norm": 3.6420524382328194,
"learning_rate": 1.9930116687432657e-06,
"loss": 0.9783,
"step": 296
},
{
"epoch": 0.12,
"grad_norm": 4.717145343030925,
"learning_rate": 1.9929609965229816e-06,
"loss": 1.0023,
"step": 297
},
{
"epoch": 0.12,
"grad_norm": 4.719677774568812,
"learning_rate": 1.9929101419030564e-06,
"loss": 0.8967,
"step": 298
},
{
"epoch": 0.12,
"grad_norm": 3.7664123677086176,
"learning_rate": 1.992859104892832e-06,
"loss": 0.9597,
"step": 299
},
{
"epoch": 0.12,
"grad_norm": 2.9586481204619033,
"learning_rate": 1.9928078855016836e-06,
"loss": 0.9413,
"step": 300
},
{
"epoch": 0.12,
"eval_loss": 0.9467526078224182,
"eval_runtime": 463.6493,
"eval_samples_per_second": 75.169,
"eval_steps_per_second": 4.7,
"step": 300
},
{
"epoch": 0.12,
"grad_norm": 3.1281465411859273,
"learning_rate": 1.992756483739019e-06,
"loss": 0.9679,
"step": 301
},
{
"epoch": 0.12,
"grad_norm": 4.9334980166343545,
"learning_rate": 1.992704899614281e-06,
"loss": 0.9893,
"step": 302
},
{
"epoch": 0.12,
"grad_norm": 2.7633317230588625,
"learning_rate": 1.992653133136945e-06,
"loss": 0.9347,
"step": 303
},
{
"epoch": 0.12,
"grad_norm": 2.9660108129618883,
"learning_rate": 1.992601184316521e-06,
"loss": 0.9566,
"step": 304
},
{
"epoch": 0.12,
"grad_norm": 4.492415727819998,
"learning_rate": 1.99254905316255e-06,
"loss": 0.962,
"step": 305
},
{
"epoch": 0.12,
"grad_norm": 3.9156326847724907,
"learning_rate": 1.9924967396846094e-06,
"loss": 0.9861,
"step": 306
},
{
"epoch": 0.13,
"grad_norm": 3.8088389015080146,
"learning_rate": 1.992444243892308e-06,
"loss": 0.9842,
"step": 307
},
{
"epoch": 0.13,
"grad_norm": 3.1110156500141457,
"learning_rate": 1.99239156579529e-06,
"loss": 0.9517,
"step": 308
},
{
"epoch": 0.13,
"grad_norm": 4.138709755084549,
"learning_rate": 1.9923387054032305e-06,
"loss": 0.9641,
"step": 309
},
{
"epoch": 0.13,
"grad_norm": 7.7968182487325945,
"learning_rate": 1.9922856627258406e-06,
"loss": 0.9515,
"step": 310
},
{
"epoch": 0.13,
"grad_norm": 4.011173634825208,
"learning_rate": 1.9922324377728633e-06,
"loss": 0.956,
"step": 311
},
{
"epoch": 0.13,
"grad_norm": 3.480694274111223,
"learning_rate": 1.992179030554076e-06,
"loss": 1.0134,
"step": 312
},
{
"epoch": 0.13,
"grad_norm": 3.478272539260623,
"learning_rate": 1.9921254410792893e-06,
"loss": 0.9548,
"step": 313
},
{
"epoch": 0.13,
"grad_norm": 3.8261032166950657,
"learning_rate": 1.992071669358347e-06,
"loss": 1.0562,
"step": 314
},
{
"epoch": 0.13,
"grad_norm": 3.783821284258041,
"learning_rate": 1.9920177154011266e-06,
"loss": 0.9381,
"step": 315
},
{
"epoch": 0.13,
"grad_norm": 3.108527132585836,
"learning_rate": 1.9919635792175387e-06,
"loss": 0.9985,
"step": 316
},
{
"epoch": 0.13,
"grad_norm": 3.737178450877116,
"learning_rate": 1.9919092608175283e-06,
"loss": 0.9196,
"step": 317
},
{
"epoch": 0.13,
"grad_norm": 3.37697712329418,
"learning_rate": 1.9918547602110727e-06,
"loss": 0.9623,
"step": 318
},
{
"epoch": 0.13,
"grad_norm": 4.018688952178953,
"learning_rate": 1.9918000774081842e-06,
"loss": 0.9357,
"step": 319
},
{
"epoch": 0.13,
"grad_norm": 3.4672395655560244,
"learning_rate": 1.9917452124189065e-06,
"loss": 0.8905,
"step": 320
},
{
"epoch": 0.13,
"grad_norm": 3.347655106733803,
"learning_rate": 1.9916901652533185e-06,
"loss": 0.9546,
"step": 321
},
{
"epoch": 0.13,
"grad_norm": 3.1703093370155435,
"learning_rate": 1.991634935921532e-06,
"loss": 0.9199,
"step": 322
},
{
"epoch": 0.13,
"grad_norm": 3.2316298954158675,
"learning_rate": 1.991579524433692e-06,
"loss": 0.8889,
"step": 323
},
{
"epoch": 0.13,
"grad_norm": 3.9850307461382504,
"learning_rate": 1.991523930799977e-06,
"loss": 1.0068,
"step": 324
},
{
"epoch": 0.13,
"grad_norm": 3.5436130598908098,
"learning_rate": 1.9914681550306e-06,
"loss": 0.9794,
"step": 325
},
{
"epoch": 0.13,
"grad_norm": 3.363846136726895,
"learning_rate": 1.9914121971358054e-06,
"loss": 0.9474,
"step": 326
},
{
"epoch": 0.13,
"grad_norm": 4.204239838208441,
"learning_rate": 1.9913560571258736e-06,
"loss": 0.9527,
"step": 327
},
{
"epoch": 0.13,
"grad_norm": 10.558683831583629,
"learning_rate": 1.9912997350111155e-06,
"loss": 0.9989,
"step": 328
},
{
"epoch": 0.13,
"grad_norm": 3.0835092017925017,
"learning_rate": 1.991243230801879e-06,
"loss": 0.921,
"step": 329
},
{
"epoch": 0.13,
"grad_norm": 3.4660468815991767,
"learning_rate": 1.991186544508541e-06,
"loss": 0.8477,
"step": 330
},
{
"epoch": 0.14,
"grad_norm": 3.282548162442814,
"learning_rate": 1.991129676141517e-06,
"loss": 0.9955,
"step": 331
},
{
"epoch": 0.14,
"grad_norm": 5.5427239520446205,
"learning_rate": 1.991072625711252e-06,
"loss": 0.9152,
"step": 332
},
{
"epoch": 0.14,
"grad_norm": 3.5463144097391255,
"learning_rate": 1.9910153932282253e-06,
"loss": 0.9529,
"step": 333
},
{
"epoch": 0.14,
"grad_norm": 8.098197076633276,
"learning_rate": 1.990957978702951e-06,
"loss": 0.9377,
"step": 334
},
{
"epoch": 0.14,
"grad_norm": 3.3762931067403827,
"learning_rate": 1.9909003821459753e-06,
"loss": 0.9669,
"step": 335
},
{
"epoch": 0.14,
"grad_norm": 3.583660539566059,
"learning_rate": 1.990842603567878e-06,
"loss": 0.877,
"step": 336
},
{
"epoch": 0.14,
"grad_norm": 3.9089987450978314,
"learning_rate": 1.9907846429792735e-06,
"loss": 0.8441,
"step": 337
},
{
"epoch": 0.14,
"grad_norm": 6.229266590069221,
"learning_rate": 1.9907265003908077e-06,
"loss": 0.9788,
"step": 338
},
{
"epoch": 0.14,
"grad_norm": 4.695728593663099,
"learning_rate": 1.9906681758131615e-06,
"loss": 0.9516,
"step": 339
},
{
"epoch": 0.14,
"grad_norm": 3.6997930834225166,
"learning_rate": 1.990609669257049e-06,
"loss": 0.9864,
"step": 340
},
{
"epoch": 0.14,
"grad_norm": 4.044485772615036,
"learning_rate": 1.9905509807332164e-06,
"loss": 0.91,
"step": 341
},
{
"epoch": 0.14,
"grad_norm": 3.2415925558702305,
"learning_rate": 1.9904921102524457e-06,
"loss": 0.9784,
"step": 342
},
{
"epoch": 0.14,
"grad_norm": 6.160317931732714,
"learning_rate": 1.99043305782555e-06,
"loss": 0.9659,
"step": 343
},
{
"epoch": 0.14,
"grad_norm": 3.9049298212631687,
"learning_rate": 1.990373823463377e-06,
"loss": 1.0315,
"step": 344
},
{
"epoch": 0.14,
"grad_norm": 3.847690562386799,
"learning_rate": 1.9903144071768076e-06,
"loss": 0.9837,
"step": 345
},
{
"epoch": 0.14,
"grad_norm": 3.4252464651952153,
"learning_rate": 1.9902548089767562e-06,
"loss": 0.9347,
"step": 346
},
{
"epoch": 0.14,
"grad_norm": 4.054438157317686,
"learning_rate": 1.990195028874171e-06,
"loss": 0.9786,
"step": 347
},
{
"epoch": 0.14,
"grad_norm": 2.8632750603815813,
"learning_rate": 1.9901350668800322e-06,
"loss": 1.0071,
"step": 348
},
{
"epoch": 0.14,
"grad_norm": 6.038880332827908,
"learning_rate": 1.990074923005355e-06,
"loss": 0.9082,
"step": 349
},
{
"epoch": 0.14,
"grad_norm": 6.799164088149487,
"learning_rate": 1.9900145972611876e-06,
"loss": 1.0261,
"step": 350
},
{
"epoch": 0.14,
"grad_norm": 4.2566930579585485,
"learning_rate": 1.9899540896586107e-06,
"loss": 0.9593,
"step": 351
},
{
"epoch": 0.14,
"grad_norm": 13.634453166279975,
"learning_rate": 1.98989340020874e-06,
"loss": 0.9414,
"step": 352
},
{
"epoch": 0.14,
"grad_norm": 5.431466946501957,
"learning_rate": 1.989832528922723e-06,
"loss": 0.9521,
"step": 353
},
{
"epoch": 0.14,
"grad_norm": 4.040360890022179,
"learning_rate": 1.9897714758117413e-06,
"loss": 1.008,
"step": 354
},
{
"epoch": 0.14,
"grad_norm": 2.9959455294509265,
"learning_rate": 1.9897102408870104e-06,
"loss": 0.958,
"step": 355
},
{
"epoch": 0.15,
"grad_norm": 5.861673054503183,
"learning_rate": 1.9896488241597784e-06,
"loss": 0.9191,
"step": 356
},
{
"epoch": 0.15,
"grad_norm": 4.834339377580445,
"learning_rate": 1.989587225641327e-06,
"loss": 0.9018,
"step": 357
},
{
"epoch": 0.15,
"grad_norm": 4.685738550310006,
"learning_rate": 1.9895254453429713e-06,
"loss": 0.989,
"step": 358
},
{
"epoch": 0.15,
"grad_norm": 4.588722209719664,
"learning_rate": 1.9894634832760607e-06,
"loss": 1.0173,
"step": 359
},
{
"epoch": 0.15,
"grad_norm": 5.45015095535037,
"learning_rate": 1.989401339451976e-06,
"loss": 0.9193,
"step": 360
},
{
"epoch": 0.15,
"grad_norm": 3.1584774921782484,
"learning_rate": 1.9893390138821337e-06,
"loss": 0.9566,
"step": 361
},
{
"epoch": 0.15,
"grad_norm": 2.9385233627943146,
"learning_rate": 1.9892765065779817e-06,
"loss": 1.025,
"step": 362
},
{
"epoch": 0.15,
"grad_norm": 4.294399736359187,
"learning_rate": 1.989213817551002e-06,
"loss": 0.9355,
"step": 363
},
{
"epoch": 0.15,
"grad_norm": 5.639920182246751,
"learning_rate": 1.9891509468127113e-06,
"loss": 0.964,
"step": 364
},
{
"epoch": 0.15,
"grad_norm": 3.3199781164090547,
"learning_rate": 1.9890878943746576e-06,
"loss": 0.9835,
"step": 365
},
{
"epoch": 0.15,
"grad_norm": 2.525244120677975,
"learning_rate": 1.989024660248423e-06,
"loss": 0.9161,
"step": 366
},
{
"epoch": 0.15,
"grad_norm": 3.708716930676622,
"learning_rate": 1.988961244445624e-06,
"loss": 0.9174,
"step": 367
},
{
"epoch": 0.15,
"grad_norm": 3.458728582106158,
"learning_rate": 1.9888976469779087e-06,
"loss": 0.9586,
"step": 368
},
{
"epoch": 0.15,
"grad_norm": 2.9724997117925005,
"learning_rate": 1.98883386785696e-06,
"loss": 0.9566,
"step": 369
},
{
"epoch": 0.15,
"grad_norm": 4.625465005060366,
"learning_rate": 1.988769907094493e-06,
"loss": 0.9024,
"step": 370
},
{
"epoch": 0.15,
"grad_norm": 3.0186102463979534,
"learning_rate": 1.988705764702258e-06,
"loss": 0.8939,
"step": 371
},
{
"epoch": 0.15,
"grad_norm": 2.700988789493622,
"learning_rate": 1.9886414406920364e-06,
"loss": 0.9062,
"step": 372
},
{
"epoch": 0.15,
"grad_norm": 3.2166365994174058,
"learning_rate": 1.9885769350756445e-06,
"loss": 0.9093,
"step": 373
},
{
"epoch": 0.15,
"grad_norm": 4.481931521174414,
"learning_rate": 1.9885122478649318e-06,
"loss": 0.9761,
"step": 374
},
{
"epoch": 0.15,
"grad_norm": 3.665225939621108,
"learning_rate": 1.9884473790717804e-06,
"loss": 0.8867,
"step": 375
},
{
"epoch": 0.15,
"grad_norm": 3.5451117021565612,
"learning_rate": 1.988382328708106e-06,
"loss": 0.8956,
"step": 376
},
{
"epoch": 0.15,
"grad_norm": 7.35507416624539,
"learning_rate": 1.988317096785859e-06,
"loss": 0.8945,
"step": 377
},
{
"epoch": 0.15,
"grad_norm": 6.261163362510146,
"learning_rate": 1.9882516833170206e-06,
"loss": 0.8805,
"step": 378
},
{
"epoch": 0.15,
"grad_norm": 3.181032304523185,
"learning_rate": 1.988186088313608e-06,
"loss": 0.9995,
"step": 379
},
{
"epoch": 0.16,
"grad_norm": 4.980980674245214,
"learning_rate": 1.9881203117876697e-06,
"loss": 0.9569,
"step": 380
},
{
"epoch": 0.16,
"grad_norm": 3.4888383766381574,
"learning_rate": 1.988054353751288e-06,
"loss": 1.0082,
"step": 381
},
{
"epoch": 0.16,
"grad_norm": 3.7945633395200242,
"learning_rate": 1.9879882142165805e-06,
"loss": 0.9999,
"step": 382
},
{
"epoch": 0.16,
"grad_norm": 2.821944500960536,
"learning_rate": 1.9879218931956953e-06,
"loss": 0.9885,
"step": 383
},
{
"epoch": 0.16,
"grad_norm": 3.523874109725945,
"learning_rate": 1.987855390700815e-06,
"loss": 0.9548,
"step": 384
},
{
"epoch": 0.16,
"grad_norm": 3.6373750696191975,
"learning_rate": 1.9877887067441563e-06,
"loss": 0.9806,
"step": 385
},
{
"epoch": 0.16,
"grad_norm": 4.209319437715957,
"learning_rate": 1.9877218413379682e-06,
"loss": 0.9431,
"step": 386
},
{
"epoch": 0.16,
"grad_norm": 4.118036551726974,
"learning_rate": 1.987654794494533e-06,
"loss": 1.0157,
"step": 387
},
{
"epoch": 0.16,
"grad_norm": 3.1106326953713777,
"learning_rate": 1.9875875662261678e-06,
"loss": 1.0005,
"step": 388
},
{
"epoch": 0.16,
"grad_norm": 3.7001411367047443,
"learning_rate": 1.987520156545221e-06,
"loss": 0.9547,
"step": 389
},
{
"epoch": 0.16,
"grad_norm": 3.532622234042634,
"learning_rate": 1.9874525654640756e-06,
"loss": 0.9394,
"step": 390
},
{
"epoch": 0.16,
"grad_norm": 2.710157534535334,
"learning_rate": 1.9873847929951474e-06,
"loss": 0.956,
"step": 391
},
{
"epoch": 0.16,
"grad_norm": 3.354640373456238,
"learning_rate": 1.987316839150886e-06,
"loss": 0.9444,
"step": 392
},
{
"epoch": 0.16,
"grad_norm": 5.074983600793948,
"learning_rate": 1.987248703943774e-06,
"loss": 0.9761,
"step": 393
},
{
"epoch": 0.16,
"grad_norm": 3.0294872900523075,
"learning_rate": 1.9871803873863266e-06,
"loss": 0.975,
"step": 394
},
{
"epoch": 0.16,
"grad_norm": 4.9120944551798935,
"learning_rate": 1.9871118894910943e-06,
"loss": 0.8971,
"step": 395
},
{
"epoch": 0.16,
"grad_norm": 3.821862309276276,
"learning_rate": 1.9870432102706584e-06,
"loss": 0.9286,
"step": 396
},
{
"epoch": 0.16,
"grad_norm": 4.005242089461416,
"learning_rate": 1.9869743497376355e-06,
"loss": 0.8894,
"step": 397
},
{
"epoch": 0.16,
"grad_norm": 2.918280752004309,
"learning_rate": 1.9869053079046747e-06,
"loss": 0.9608,
"step": 398
},
{
"epoch": 0.16,
"grad_norm": 4.8020545872846885,
"learning_rate": 1.9868360847844586e-06,
"loss": 0.9719,
"step": 399
},
{
"epoch": 0.16,
"grad_norm": 3.133669377671361,
"learning_rate": 1.9867666803897025e-06,
"loss": 0.9685,
"step": 400
},
{
"epoch": 0.16,
"eval_loss": 0.9418269991874695,
"eval_runtime": 464.4075,
"eval_samples_per_second": 75.046,
"eval_steps_per_second": 4.692,
"step": 400
},
{
"epoch": 0.16,
"grad_norm": 3.1853230460544366,
"learning_rate": 1.986697094733156e-06,
"loss": 0.9906,
"step": 401
},
{
"epoch": 0.16,
"grad_norm": 3.3295631633136202,
"learning_rate": 1.986627327827601e-06,
"loss": 0.9165,
"step": 402
},
{
"epoch": 0.16,
"grad_norm": 3.2943146163586188,
"learning_rate": 1.986557379685854e-06,
"loss": 0.9173,
"step": 403
},
{
"epoch": 0.16,
"grad_norm": 3.198284628470984,
"learning_rate": 1.9864872503207626e-06,
"loss": 1.004,
"step": 404
},
{
"epoch": 0.17,
"grad_norm": 3.9515727550469495,
"learning_rate": 1.98641693974521e-06,
"loss": 0.9495,
"step": 405
},
{
"epoch": 0.17,
"grad_norm": 4.686963754718287,
"learning_rate": 1.9863464479721117e-06,
"loss": 0.905,
"step": 406
},
{
"epoch": 0.17,
"grad_norm": 5.091354512777394,
"learning_rate": 1.9862757750144168e-06,
"loss": 0.9911,
"step": 407
},
{
"epoch": 0.17,
"grad_norm": 3.1314287931835514,
"learning_rate": 1.9862049208851064e-06,
"loss": 1.0036,
"step": 408
},
{
"epoch": 0.17,
"grad_norm": 3.4684709273178367,
"learning_rate": 1.9861338855971967e-06,
"loss": 0.928,
"step": 409
},
{
"epoch": 0.17,
"grad_norm": 4.118643435175958,
"learning_rate": 1.9860626691637365e-06,
"loss": 0.89,
"step": 410
},
{
"epoch": 0.17,
"grad_norm": 7.006670120793721,
"learning_rate": 1.9859912715978065e-06,
"loss": 0.9018,
"step": 411
},
{
"epoch": 0.17,
"grad_norm": 3.621638071278063,
"learning_rate": 1.985919692912523e-06,
"loss": 0.9166,
"step": 412
},
{
"epoch": 0.17,
"grad_norm": 4.787134044539406,
"learning_rate": 1.985847933121035e-06,
"loss": 0.879,
"step": 413
},
{
"epoch": 0.17,
"grad_norm": 5.683467063667276,
"learning_rate": 1.9857759922365228e-06,
"loss": 0.9464,
"step": 414
},
{
"epoch": 0.17,
"grad_norm": 3.990599743424488,
"learning_rate": 1.9857038702722023e-06,
"loss": 0.8801,
"step": 415
},
{
"epoch": 0.17,
"grad_norm": 4.9484880924012495,
"learning_rate": 1.9856315672413213e-06,
"loss": 0.8944,
"step": 416
},
{
"epoch": 0.17,
"grad_norm": 4.148798063312885,
"learning_rate": 1.985559083157162e-06,
"loss": 0.928,
"step": 417
},
{
"epoch": 0.17,
"grad_norm": 4.796518833653735,
"learning_rate": 1.9854864180330385e-06,
"loss": 0.9192,
"step": 418
},
{
"epoch": 0.17,
"grad_norm": 3.7603675256425815,
"learning_rate": 1.9854135718822995e-06,
"loss": 0.9457,
"step": 419
},
{
"epoch": 0.17,
"grad_norm": 6.216146922276296,
"learning_rate": 1.985340544718326e-06,
"loss": 0.9337,
"step": 420
},
{
"epoch": 0.17,
"grad_norm": 3.925978251287307,
"learning_rate": 1.985267336554532e-06,
"loss": 0.9943,
"step": 421
},
{
"epoch": 0.17,
"grad_norm": 3.005877615130435,
"learning_rate": 1.985193947404366e-06,
"loss": 0.9412,
"step": 422
},
{
"epoch": 0.17,
"grad_norm": 3.4720786999177466,
"learning_rate": 1.9851203772813094e-06,
"loss": 0.8902,
"step": 423
},
{
"epoch": 0.17,
"grad_norm": 2.7736551606620616,
"learning_rate": 1.9850466261988753e-06,
"loss": 0.8833,
"step": 424
},
{
"epoch": 0.17,
"grad_norm": 3.5439327106557603,
"learning_rate": 1.9849726941706122e-06,
"loss": 0.9127,
"step": 425
},
{
"epoch": 0.17,
"grad_norm": 3.7696323829604266,
"learning_rate": 1.9848985812101007e-06,
"loss": 0.9354,
"step": 426
},
{
"epoch": 0.17,
"grad_norm": 6.927152353449846,
"learning_rate": 1.9848242873309546e-06,
"loss": 0.9095,
"step": 427
},
{
"epoch": 0.17,
"grad_norm": 21.227215789936643,
"learning_rate": 1.984749812546821e-06,
"loss": 0.9318,
"step": 428
},
{
"epoch": 0.18,
"grad_norm": 7.514504604544716,
"learning_rate": 1.984675156871381e-06,
"loss": 1.0057,
"step": 429
},
{
"epoch": 0.18,
"grad_norm": 3.0751556503357618,
"learning_rate": 1.984600320318347e-06,
"loss": 0.9136,
"step": 430
},
{
"epoch": 0.18,
"grad_norm": 3.6936282545629218,
"learning_rate": 1.9845253029014674e-06,
"loss": 1.0066,
"step": 431
},
{
"epoch": 0.18,
"grad_norm": 3.300362549106743,
"learning_rate": 1.9844501046345218e-06,
"loss": 0.8871,
"step": 432
},
{
"epoch": 0.18,
"grad_norm": 3.5090883694191572,
"learning_rate": 1.9843747255313236e-06,
"loss": 0.9754,
"step": 433
},
{
"epoch": 0.18,
"grad_norm": 3.4999404966729437,
"learning_rate": 1.984299165605719e-06,
"loss": 1.0135,
"step": 434
},
{
"epoch": 0.18,
"grad_norm": 4.2169780257798335,
"learning_rate": 1.9842234248715883e-06,
"loss": 0.9659,
"step": 435
},
{
"epoch": 0.18,
"grad_norm": 5.092498951627064,
"learning_rate": 1.9841475033428445e-06,
"loss": 0.9787,
"step": 436
},
{
"epoch": 0.18,
"grad_norm": 3.224606983878831,
"learning_rate": 1.9840714010334333e-06,
"loss": 0.9346,
"step": 437
},
{
"epoch": 0.18,
"grad_norm": 4.254190644019953,
"learning_rate": 1.9839951179573344e-06,
"loss": 0.8981,
"step": 438
},
{
"epoch": 0.18,
"grad_norm": 9.611881176543749,
"learning_rate": 1.983918654128561e-06,
"loss": 0.8859,
"step": 439
},
{
"epoch": 0.18,
"grad_norm": 3.161995951538404,
"learning_rate": 1.9838420095611583e-06,
"loss": 0.902,
"step": 440
},
{
"epoch": 0.18,
"grad_norm": 3.7991752979717535,
"learning_rate": 1.983765184269205e-06,
"loss": 0.8907,
"step": 441
},
{
"epoch": 0.18,
"grad_norm": 3.2277927294960187,
"learning_rate": 1.9836881782668144e-06,
"loss": 0.9024,
"step": 442
},
{
"epoch": 0.18,
"grad_norm": 4.5176021704784,
"learning_rate": 1.9836109915681314e-06,
"loss": 0.9412,
"step": 443
},
{
"epoch": 0.18,
"grad_norm": 3.7291118080214547,
"learning_rate": 1.983533624187334e-06,
"loss": 0.9026,
"step": 444
},
{
"epoch": 0.18,
"grad_norm": 6.10897976368194,
"learning_rate": 1.9834560761386356e-06,
"loss": 0.9276,
"step": 445
},
{
"epoch": 0.18,
"grad_norm": 4.006139482791361,
"learning_rate": 1.98337834743628e-06,
"loss": 0.8902,
"step": 446
},
{
"epoch": 0.18,
"grad_norm": 4.9701540165038836,
"learning_rate": 1.9833004380945452e-06,
"loss": 0.9567,
"step": 447
},
{
"epoch": 0.18,
"grad_norm": 3.07344098565774,
"learning_rate": 1.9832223481277432e-06,
"loss": 0.9518,
"step": 448
},
{
"epoch": 0.18,
"grad_norm": 8.59801806887501,
"learning_rate": 1.9831440775502184e-06,
"loss": 0.9617,
"step": 449
},
{
"epoch": 0.18,
"grad_norm": 3.2835418430769314,
"learning_rate": 1.9830656263763488e-06,
"loss": 0.9003,
"step": 450
},
{
"epoch": 0.18,
"grad_norm": 4.8957561286782685,
"learning_rate": 1.9829869946205448e-06,
"loss": 0.8957,
"step": 451
},
{
"epoch": 0.18,
"grad_norm": 5.192801214333014,
"learning_rate": 1.9829081822972505e-06,
"loss": 0.9904,
"step": 452
},
{
"epoch": 0.18,
"grad_norm": 3.242286975727523,
"learning_rate": 1.9828291894209434e-06,
"loss": 0.9724,
"step": 453
},
{
"epoch": 0.19,
"grad_norm": 3.6244301793102958,
"learning_rate": 1.982750016006134e-06,
"loss": 0.9641,
"step": 454
},
{
"epoch": 0.19,
"grad_norm": 2.7967776702640714,
"learning_rate": 1.9826706620673656e-06,
"loss": 0.9622,
"step": 455
},
{
"epoch": 0.19,
"grad_norm": 6.353346624347498,
"learning_rate": 1.982591127619215e-06,
"loss": 0.9375,
"step": 456
},
{
"epoch": 0.19,
"grad_norm": 3.3933061668821884,
"learning_rate": 1.9825114126762923e-06,
"loss": 0.8885,
"step": 457
},
{
"epoch": 0.19,
"grad_norm": 3.453958583016644,
"learning_rate": 1.98243151725324e-06,
"loss": 0.9527,
"step": 458
},
{
"epoch": 0.19,
"grad_norm": 3.6112906695173073,
"learning_rate": 1.982351441364735e-06,
"loss": 0.9569,
"step": 459
},
{
"epoch": 0.19,
"grad_norm": 2.9690242834982747,
"learning_rate": 1.9822711850254864e-06,
"loss": 0.9427,
"step": 460
},
{
"epoch": 0.19,
"grad_norm": 4.2945276639512935,
"learning_rate": 1.9821907482502366e-06,
"loss": 0.9569,
"step": 461
},
{
"epoch": 0.19,
"grad_norm": 3.86365155474614,
"learning_rate": 1.9821101310537612e-06,
"loss": 0.8982,
"step": 462
},
{
"epoch": 0.19,
"grad_norm": 2.941138088711051,
"learning_rate": 1.982029333450869e-06,
"loss": 1.011,
"step": 463
},
{
"epoch": 0.19,
"grad_norm": 4.315928364376279,
"learning_rate": 1.9819483554564023e-06,
"loss": 0.9448,
"step": 464
},
{
"epoch": 0.19,
"grad_norm": 3.105446038470642,
"learning_rate": 1.981867197085236e-06,
"loss": 0.9198,
"step": 465
},
{
"epoch": 0.19,
"grad_norm": 3.047080358679292,
"learning_rate": 1.9817858583522776e-06,
"loss": 0.8697,
"step": 466
},
{
"epoch": 0.19,
"grad_norm": 3.4456935559706876,
"learning_rate": 1.9817043392724695e-06,
"loss": 0.9695,
"step": 467
},
{
"epoch": 0.19,
"grad_norm": 3.149543245354265,
"learning_rate": 1.9816226398607852e-06,
"loss": 0.9635,
"step": 468
},
{
"epoch": 0.19,
"grad_norm": 4.606130664147586,
"learning_rate": 1.9815407601322333e-06,
"loss": 0.9529,
"step": 469
},
{
"epoch": 0.19,
"grad_norm": 3.4744199608813116,
"learning_rate": 1.9814587001018533e-06,
"loss": 0.9365,
"step": 470
},
{
"epoch": 0.19,
"grad_norm": 3.545039914361793,
"learning_rate": 1.98137645978472e-06,
"loss": 0.9246,
"step": 471
},
{
"epoch": 0.19,
"grad_norm": 4.130720034658656,
"learning_rate": 1.98129403919594e-06,
"loss": 0.8838,
"step": 472
},
{
"epoch": 0.19,
"grad_norm": 3.28702194097568,
"learning_rate": 1.9812114383506534e-06,
"loss": 0.9856,
"step": 473
},
{
"epoch": 0.19,
"grad_norm": 2.890224487143984,
"learning_rate": 1.981128657264033e-06,
"loss": 0.9173,
"step": 474
},
{
"epoch": 0.19,
"grad_norm": 3.018183462401382,
"learning_rate": 1.9810456959512858e-06,
"loss": 0.9048,
"step": 475
},
{
"epoch": 0.19,
"grad_norm": 4.221773708771155,
"learning_rate": 1.9809625544276505e-06,
"loss": 0.9215,
"step": 476
},
{
"epoch": 0.19,
"grad_norm": 3.3163804735436417,
"learning_rate": 1.9808792327084e-06,
"loss": 0.9799,
"step": 477
},
{
"epoch": 0.2,
"grad_norm": 3.030768763396988,
"learning_rate": 1.98079573080884e-06,
"loss": 0.9365,
"step": 478
},
{
"epoch": 0.2,
"grad_norm": 4.045214382131696,
"learning_rate": 1.9807120487443086e-06,
"loss": 0.8504,
"step": 479
},
{
"epoch": 0.2,
"grad_norm": 4.408018127510554,
"learning_rate": 1.980628186530178e-06,
"loss": 0.9341,
"step": 480
},
{
"epoch": 0.2,
"grad_norm": 3.2394030028978684,
"learning_rate": 1.9805441441818533e-06,
"loss": 0.9661,
"step": 481
},
{
"epoch": 0.2,
"grad_norm": 3.8754035812662297,
"learning_rate": 1.980459921714772e-06,
"loss": 0.8939,
"step": 482
},
{
"epoch": 0.2,
"grad_norm": 5.954576426765161,
"learning_rate": 1.9803755191444047e-06,
"loss": 0.9817,
"step": 483
},
{
"epoch": 0.2,
"grad_norm": 3.1305681512561314,
"learning_rate": 1.9802909364862567e-06,
"loss": 0.934,
"step": 484
},
{
"epoch": 0.2,
"grad_norm": 4.60992623323212,
"learning_rate": 1.9802061737558646e-06,
"loss": 0.9528,
"step": 485
},
{
"epoch": 0.2,
"grad_norm": 3.1212333827352943,
"learning_rate": 1.9801212309687986e-06,
"loss": 1.0024,
"step": 486
},
{
"epoch": 0.2,
"grad_norm": 3.1621880905918713,
"learning_rate": 1.9800361081406627e-06,
"loss": 0.9028,
"step": 487
},
{
"epoch": 0.2,
"grad_norm": 3.576181334119721,
"learning_rate": 1.9799508052870923e-06,
"loss": 0.929,
"step": 488
},
{
"epoch": 0.2,
"grad_norm": 3.1554513619371805,
"learning_rate": 1.979865322423758e-06,
"loss": 0.9548,
"step": 489
},
{
"epoch": 0.2,
"grad_norm": 3.342506343346046,
"learning_rate": 1.979779659566361e-06,
"loss": 0.9514,
"step": 490
},
{
"epoch": 0.2,
"grad_norm": 4.175634159426199,
"learning_rate": 1.9796938167306386e-06,
"loss": 0.9982,
"step": 491
},
{
"epoch": 0.2,
"grad_norm": 4.25781668567399,
"learning_rate": 1.9796077939323582e-06,
"loss": 0.9166,
"step": 492
},
{
"epoch": 0.2,
"grad_norm": 3.381617853087264,
"learning_rate": 1.979521591187322e-06,
"loss": 0.9192,
"step": 493
},
{
"epoch": 0.2,
"grad_norm": 3.8108799965776265,
"learning_rate": 1.979435208511365e-06,
"loss": 0.8524,
"step": 494
},
{
"epoch": 0.2,
"grad_norm": 4.037446943311405,
"learning_rate": 1.979348645920355e-06,
"loss": 0.8854,
"step": 495
},
{
"epoch": 0.2,
"grad_norm": 2.6839396423738915,
"learning_rate": 1.979261903430193e-06,
"loss": 0.9854,
"step": 496
},
{
"epoch": 0.2,
"grad_norm": 3.130181796195427,
"learning_rate": 1.9791749810568124e-06,
"loss": 0.9592,
"step": 497
},
{
"epoch": 0.2,
"grad_norm": 2.9313294012776336,
"learning_rate": 1.9790878788161807e-06,
"loss": 0.9462,
"step": 498
},
{
"epoch": 0.2,
"grad_norm": 6.194147693772205,
"learning_rate": 1.979000596724298e-06,
"loss": 0.9279,
"step": 499
},
{
"epoch": 0.2,
"grad_norm": 2.655893027887663,
"learning_rate": 1.9789131347971967e-06,
"loss": 0.9134,
"step": 500
},
{
"epoch": 0.2,
"eval_loss": 0.9338716268539429,
"eval_runtime": 465.4964,
"eval_samples_per_second": 74.871,
"eval_steps_per_second": 4.681,
"step": 500
},
{
"epoch": 0.2,
"grad_norm": 3.2342170239001113,
"learning_rate": 1.9788254930509436e-06,
"loss": 0.9478,
"step": 501
},
{
"epoch": 0.2,
"grad_norm": 3.0444654758327987,
"learning_rate": 1.978737671501638e-06,
"loss": 0.9653,
"step": 502
},
{
"epoch": 0.21,
"grad_norm": 3.0071246342708284,
"learning_rate": 1.9786496701654115e-06,
"loss": 0.9118,
"step": 503
},
{
"epoch": 0.21,
"grad_norm": 2.971433466056043,
"learning_rate": 1.978561489058429e-06,
"loss": 0.8965,
"step": 504
},
{
"epoch": 0.21,
"grad_norm": 3.3027030824420223,
"learning_rate": 1.97847312819689e-06,
"loss": 0.936,
"step": 505
},
{
"epoch": 0.21,
"grad_norm": 4.098738088363002,
"learning_rate": 1.9783845875970245e-06,
"loss": 0.9627,
"step": 506
},
{
"epoch": 0.21,
"grad_norm": 4.066411783256414,
"learning_rate": 1.9782958672750974e-06,
"loss": 0.9092,
"step": 507
},
{
"epoch": 0.21,
"grad_norm": 3.03941753372783,
"learning_rate": 1.9782069672474064e-06,
"loss": 0.9186,
"step": 508
},
{
"epoch": 0.21,
"grad_norm": 2.701428039136341,
"learning_rate": 1.9781178875302803e-06,
"loss": 0.9725,
"step": 509
},
{
"epoch": 0.21,
"grad_norm": 3.171555589084459,
"learning_rate": 1.9780286281400836e-06,
"loss": 0.8783,
"step": 510
},
{
"epoch": 0.21,
"grad_norm": 3.5965722853210784,
"learning_rate": 1.9779391890932125e-06,
"loss": 0.8187,
"step": 511
},
{
"epoch": 0.21,
"grad_norm": 2.9417291863336246,
"learning_rate": 1.977849570406096e-06,
"loss": 0.949,
"step": 512
},
{
"epoch": 0.21,
"grad_norm": 3.89960679303917,
"learning_rate": 1.977759772095196e-06,
"loss": 0.9413,
"step": 513
},
{
"epoch": 0.21,
"grad_norm": 3.0157786923207115,
"learning_rate": 1.9776697941770088e-06,
"loss": 0.8481,
"step": 514
},
{
"epoch": 0.21,
"grad_norm": 3.5721775734910786,
"learning_rate": 1.9775796366680623e-06,
"loss": 0.9162,
"step": 515
},
{
"epoch": 0.21,
"grad_norm": 3.214075835720345,
"learning_rate": 1.977489299584917e-06,
"loss": 0.9137,
"step": 516
},
{
"epoch": 0.21,
"grad_norm": 3.8189570478424657,
"learning_rate": 1.977398782944168e-06,
"loss": 0.9288,
"step": 517
},
{
"epoch": 0.21,
"grad_norm": 3.1898060568965696,
"learning_rate": 1.9773080867624427e-06,
"loss": 0.9292,
"step": 518
},
{
"epoch": 0.21,
"grad_norm": 3.229537178004683,
"learning_rate": 1.9772172110564003e-06,
"loss": 0.894,
"step": 519
},
{
"epoch": 0.21,
"grad_norm": 3.110726520343392,
"learning_rate": 1.9771261558427353e-06,
"loss": 0.876,
"step": 520
},
{
"epoch": 0.21,
"grad_norm": 3.0665102513063305,
"learning_rate": 1.977034921138173e-06,
"loss": 0.9841,
"step": 521
},
{
"epoch": 0.21,
"grad_norm": 2.8946573957026387,
"learning_rate": 1.9769435069594725e-06,
"loss": 0.9344,
"step": 522
},
{
"epoch": 0.21,
"grad_norm": 3.6624395824638296,
"learning_rate": 1.976851913323426e-06,
"loss": 0.9149,
"step": 523
},
{
"epoch": 0.21,
"grad_norm": 2.8247979649532424,
"learning_rate": 1.9767601402468594e-06,
"loss": 0.9788,
"step": 524
},
{
"epoch": 0.21,
"grad_norm": 2.744528042735639,
"learning_rate": 1.9766681877466294e-06,
"loss": 0.9484,
"step": 525
},
{
"epoch": 0.21,
"grad_norm": 2.7183843564142616,
"learning_rate": 1.976576055839628e-06,
"loss": 0.9107,
"step": 526
},
{
"epoch": 0.22,
"grad_norm": 3.467573296279892,
"learning_rate": 1.976483744542779e-06,
"loss": 0.8963,
"step": 527
},
{
"epoch": 0.22,
"grad_norm": 3.4817755702656497,
"learning_rate": 1.976391253873039e-06,
"loss": 0.9245,
"step": 528
},
{
"epoch": 0.22,
"grad_norm": 3.351806333798753,
"learning_rate": 1.976298583847398e-06,
"loss": 0.9229,
"step": 529
},
{
"epoch": 0.22,
"grad_norm": 4.022204442954504,
"learning_rate": 1.976205734482879e-06,
"loss": 0.9628,
"step": 530
},
{
"epoch": 0.22,
"grad_norm": 8.752985520492723,
"learning_rate": 1.9761127057965373e-06,
"loss": 0.9786,
"step": 531
},
{
"epoch": 0.22,
"grad_norm": 3.310797741725481,
"learning_rate": 1.976019497805462e-06,
"loss": 0.8944,
"step": 532
},
{
"epoch": 0.22,
"grad_norm": 3.0964286741347684,
"learning_rate": 1.9759261105267745e-06,
"loss": 0.8765,
"step": 533
},
{
"epoch": 0.22,
"grad_norm": 3.803348426045089,
"learning_rate": 1.9758325439776295e-06,
"loss": 0.9353,
"step": 534
},
{
"epoch": 0.22,
"grad_norm": 4.335889910202956,
"learning_rate": 1.9757387981752146e-06,
"loss": 0.9108,
"step": 535
},
{
"epoch": 0.22,
"grad_norm": 3.4415299584337022,
"learning_rate": 1.9756448731367495e-06,
"loss": 0.9311,
"step": 536
},
{
"epoch": 0.22,
"grad_norm": 3.0724224154884303,
"learning_rate": 1.975550768879488e-06,
"loss": 0.9195,
"step": 537
},
{
"epoch": 0.22,
"grad_norm": 3.7011016100114653,
"learning_rate": 1.975456485420717e-06,
"loss": 0.8745,
"step": 538
},
{
"epoch": 0.22,
"grad_norm": 3.9190133357361687,
"learning_rate": 1.975362022777755e-06,
"loss": 0.9462,
"step": 539
},
{
"epoch": 0.22,
"grad_norm": 7.101206111241604,
"learning_rate": 1.975267380967954e-06,
"loss": 0.9033,
"step": 540
},
{
"epoch": 0.22,
"grad_norm": 3.0630096141245673,
"learning_rate": 1.975172560008699e-06,
"loss": 0.9244,
"step": 541
},
{
"epoch": 0.22,
"grad_norm": 2.7631165969577176,
"learning_rate": 1.9750775599174086e-06,
"loss": 0.9061,
"step": 542
},
{
"epoch": 0.22,
"grad_norm": 4.542237574232739,
"learning_rate": 1.9749823807115333e-06,
"loss": 0.9931,
"step": 543
},
{
"epoch": 0.22,
"grad_norm": 3.3890777395765053,
"learning_rate": 1.9748870224085563e-06,
"loss": 0.8999,
"step": 544
},
{
"epoch": 0.22,
"grad_norm": 3.8000199798572822,
"learning_rate": 1.9747914850259942e-06,
"loss": 0.976,
"step": 545
},
{
"epoch": 0.22,
"grad_norm": 5.169265756365973,
"learning_rate": 1.9746957685813973e-06,
"loss": 0.9985,
"step": 546
},
{
"epoch": 0.22,
"grad_norm": 3.477263661890311,
"learning_rate": 1.9745998730923477e-06,
"loss": 0.9242,
"step": 547
},
{
"epoch": 0.22,
"grad_norm": 2.944473292104183,
"learning_rate": 1.9745037985764605e-06,
"loss": 0.9108,
"step": 548
},
{
"epoch": 0.22,
"grad_norm": 2.885191058325181,
"learning_rate": 1.974407545051384e-06,
"loss": 0.925,
"step": 549
},
{
"epoch": 0.22,
"grad_norm": 3.946290119516155,
"learning_rate": 1.9743111125347988e-06,
"loss": 0.933,
"step": 550
},
{
"epoch": 0.22,
"grad_norm": 2.9732576909315,
"learning_rate": 1.9742145010444197e-06,
"loss": 0.9317,
"step": 551
},
{
"epoch": 0.23,
"grad_norm": 3.729976468166404,
"learning_rate": 1.974117710597993e-06,
"loss": 0.8939,
"step": 552
},
{
"epoch": 0.23,
"grad_norm": 8.055036046006403,
"learning_rate": 1.9740207412132984e-06,
"loss": 1.0179,
"step": 553
},
{
"epoch": 0.23,
"grad_norm": 3.809079757052819,
"learning_rate": 1.973923592908149e-06,
"loss": 0.9291,
"step": 554
},
{
"epoch": 0.23,
"grad_norm": 3.104251558517355,
"learning_rate": 1.9738262657003895e-06,
"loss": 0.8751,
"step": 555
},
{
"epoch": 0.23,
"grad_norm": 2.7976066066821725,
"learning_rate": 1.9737287596078987e-06,
"loss": 0.9316,
"step": 556
},
{
"epoch": 0.23,
"grad_norm": 3.12404725847787,
"learning_rate": 1.9736310746485874e-06,
"loss": 0.9443,
"step": 557
},
{
"epoch": 0.23,
"grad_norm": 3.18328904389599,
"learning_rate": 1.9735332108403996e-06,
"loss": 0.8683,
"step": 558
},
{
"epoch": 0.23,
"grad_norm": 3.0602421150553094,
"learning_rate": 1.9734351682013123e-06,
"loss": 0.9173,
"step": 559
},
{
"epoch": 0.23,
"grad_norm": 3.4912776637847998,
"learning_rate": 1.9733369467493355e-06,
"loss": 0.9008,
"step": 560
},
{
"epoch": 0.23,
"grad_norm": 3.55514879726441,
"learning_rate": 1.9732385465025115e-06,
"loss": 0.9633,
"step": 561
},
{
"epoch": 0.23,
"grad_norm": 2.9719172446597297,
"learning_rate": 1.9731399674789154e-06,
"loss": 0.9757,
"step": 562
},
{
"epoch": 0.23,
"grad_norm": 4.200556645814142,
"learning_rate": 1.9730412096966557e-06,
"loss": 0.909,
"step": 563
},
{
"epoch": 0.23,
"grad_norm": 3.3682324521147007,
"learning_rate": 1.972942273173874e-06,
"loss": 1.0293,
"step": 564
},
{
"epoch": 0.23,
"grad_norm": 2.9333942347146347,
"learning_rate": 1.9728431579287434e-06,
"loss": 0.9026,
"step": 565
},
{
"epoch": 0.23,
"grad_norm": 2.995406385994797,
"learning_rate": 1.972743863979471e-06,
"loss": 0.9844,
"step": 566
},
{
"epoch": 0.23,
"grad_norm": 3.19152580590956,
"learning_rate": 1.972644391344296e-06,
"loss": 0.9541,
"step": 567
},
{
"epoch": 0.23,
"grad_norm": 3.171966151320119,
"learning_rate": 1.9725447400414917e-06,
"loss": 0.8812,
"step": 568
},
{
"epoch": 0.23,
"grad_norm": 3.538107661834801,
"learning_rate": 1.9724449100893624e-06,
"loss": 0.9003,
"step": 569
},
{
"epoch": 0.23,
"grad_norm": 2.6626769321123094,
"learning_rate": 1.9723449015062465e-06,
"loss": 0.9615,
"step": 570
},
{
"epoch": 0.23,
"grad_norm": 2.591554945500685,
"learning_rate": 1.972244714310515e-06,
"loss": 0.9366,
"step": 571
},
{
"epoch": 0.23,
"grad_norm": 3.037178629545735,
"learning_rate": 1.9721443485205713e-06,
"loss": 0.9367,
"step": 572
},
{
"epoch": 0.23,
"grad_norm": 3.3448501264729047,
"learning_rate": 1.9720438041548516e-06,
"loss": 0.9815,
"step": 573
},
{
"epoch": 0.23,
"grad_norm": 3.51629452848736,
"learning_rate": 1.971943081231826e-06,
"loss": 0.9762,
"step": 574
},
{
"epoch": 0.23,
"grad_norm": 3.280427731110891,
"learning_rate": 1.971842179769996e-06,
"loss": 0.9141,
"step": 575
},
{
"epoch": 0.24,
"grad_norm": 3.0876374613264312,
"learning_rate": 1.9717410997878963e-06,
"loss": 0.9228,
"step": 576
},
{
"epoch": 0.24,
"grad_norm": 5.346641534545134,
"learning_rate": 1.9716398413040945e-06,
"loss": 0.9261,
"step": 577
},
{
"epoch": 0.24,
"grad_norm": 3.5265975809377506,
"learning_rate": 1.9715384043371916e-06,
"loss": 0.9349,
"step": 578
},
{
"epoch": 0.24,
"grad_norm": 2.789319684482638,
"learning_rate": 1.9714367889058203e-06,
"loss": 0.9751,
"step": 579
},
{
"epoch": 0.24,
"grad_norm": 3.281067525880389,
"learning_rate": 1.971334995028647e-06,
"loss": 0.8536,
"step": 580
},
{
"epoch": 0.24,
"grad_norm": 3.7828035583022737,
"learning_rate": 1.9712330227243707e-06,
"loss": 1.0114,
"step": 581
},
{
"epoch": 0.24,
"grad_norm": 3.1019287026634985,
"learning_rate": 1.971130872011722e-06,
"loss": 0.9634,
"step": 582
},
{
"epoch": 0.24,
"grad_norm": 5.669687346685349,
"learning_rate": 1.971028542909466e-06,
"loss": 0.9875,
"step": 583
},
{
"epoch": 0.24,
"grad_norm": 3.7796922158693156,
"learning_rate": 1.9709260354363996e-06,
"loss": 0.9235,
"step": 584
},
{
"epoch": 0.24,
"grad_norm": 3.0041103040916415,
"learning_rate": 1.970823349611353e-06,
"loss": 0.9472,
"step": 585
},
{
"epoch": 0.24,
"grad_norm": 3.0712750389107333,
"learning_rate": 1.9707204854531877e-06,
"loss": 0.9282,
"step": 586
},
{
"epoch": 0.24,
"grad_norm": 3.5319196666971253,
"learning_rate": 1.970617442980801e-06,
"loss": 0.9622,
"step": 587
},
{
"epoch": 0.24,
"grad_norm": 3.2380140867747413,
"learning_rate": 1.970514222213119e-06,
"loss": 0.9468,
"step": 588
},
{
"epoch": 0.24,
"grad_norm": 5.0154888088448155,
"learning_rate": 1.970410823169104e-06,
"loss": 0.8961,
"step": 589
},
{
"epoch": 0.24,
"grad_norm": 3.199455548003289,
"learning_rate": 1.970307245867749e-06,
"loss": 0.899,
"step": 590
},
{
"epoch": 0.24,
"grad_norm": 4.028057114280807,
"learning_rate": 1.970203490328081e-06,
"loss": 0.8634,
"step": 591
},
{
"epoch": 0.24,
"grad_norm": 3.4790260480864657,
"learning_rate": 1.9700995565691584e-06,
"loss": 0.9246,
"step": 592
},
{
"epoch": 0.24,
"grad_norm": 3.8121145400620633,
"learning_rate": 1.969995444610073e-06,
"loss": 0.9616,
"step": 593
},
{
"epoch": 0.24,
"grad_norm": 3.112348736732859,
"learning_rate": 1.9698911544699507e-06,
"loss": 0.8717,
"step": 594
},
{
"epoch": 0.24,
"grad_norm": 2.7910768729648483,
"learning_rate": 1.969786686167947e-06,
"loss": 1.0142,
"step": 595
},
{
"epoch": 0.24,
"grad_norm": 3.471223810860898,
"learning_rate": 1.9696820397232536e-06,
"loss": 0.9767,
"step": 596
},
{
"epoch": 0.24,
"grad_norm": 2.9033382296970855,
"learning_rate": 1.9695772151550924e-06,
"loss": 0.9601,
"step": 597
},
{
"epoch": 0.24,
"grad_norm": 3.903016946356744,
"learning_rate": 1.969472212482719e-06,
"loss": 0.902,
"step": 598
},
{
"epoch": 0.24,
"grad_norm": 3.1095423105442688,
"learning_rate": 1.9693670317254216e-06,
"loss": 0.9449,
"step": 599
},
{
"epoch": 0.24,
"grad_norm": 2.946194175215952,
"learning_rate": 1.969261672902521e-06,
"loss": 0.9535,
"step": 600
},
{
"epoch": 0.24,
"eval_loss": 0.9296932816505432,
"eval_runtime": 464.7575,
"eval_samples_per_second": 74.99,
"eval_steps_per_second": 4.688,
"step": 600
},
{
"epoch": 0.25,
"grad_norm": 2.8544370960481196,
"learning_rate": 1.9691561360333712e-06,
"loss": 0.9674,
"step": 601
},
{
"epoch": 0.25,
"grad_norm": 2.968997762647846,
"learning_rate": 1.969050421137358e-06,
"loss": 0.8758,
"step": 602
},
{
"epoch": 0.25,
"grad_norm": 3.2446048961836484,
"learning_rate": 1.968944528233902e-06,
"loss": 0.9623,
"step": 603
},
{
"epoch": 0.25,
"grad_norm": 2.840937453984757,
"learning_rate": 1.968838457342453e-06,
"loss": 0.9547,
"step": 604
},
{
"epoch": 0.25,
"grad_norm": 4.784259955908488,
"learning_rate": 1.9687322084824965e-06,
"loss": 0.8821,
"step": 605
},
{
"epoch": 0.25,
"grad_norm": 4.262190971893724,
"learning_rate": 1.968625781673549e-06,
"loss": 0.9309,
"step": 606
},
{
"epoch": 0.25,
"grad_norm": 3.4592025074030435,
"learning_rate": 1.968519176935161e-06,
"loss": 0.8555,
"step": 607
},
{
"epoch": 0.25,
"grad_norm": 3.7361975814390798,
"learning_rate": 1.968412394286915e-06,
"loss": 0.9675,
"step": 608
},
{
"epoch": 0.25,
"grad_norm": 3.608329812957296,
"learning_rate": 1.9683054337484256e-06,
"loss": 0.8844,
"step": 609
},
{
"epoch": 0.25,
"grad_norm": 4.908161287085106,
"learning_rate": 1.9681982953393412e-06,
"loss": 0.9446,
"step": 610
},
{
"epoch": 0.25,
"grad_norm": 3.4328791316576845,
"learning_rate": 1.968090979079342e-06,
"loss": 0.9656,
"step": 611
},
{
"epoch": 0.25,
"grad_norm": 4.263908742780136,
"learning_rate": 1.967983484988141e-06,
"loss": 0.9011,
"step": 612
},
{
"epoch": 0.25,
"grad_norm": 4.655782135920792,
"learning_rate": 1.9678758130854843e-06,
"loss": 0.9129,
"step": 613
},
{
"epoch": 0.25,
"grad_norm": 3.0591126505616764,
"learning_rate": 1.967767963391151e-06,
"loss": 0.906,
"step": 614
},
{
"epoch": 0.25,
"grad_norm": 2.855116791674079,
"learning_rate": 1.9676599359249514e-06,
"loss": 0.9562,
"step": 615
},
{
"epoch": 0.25,
"grad_norm": 4.323496817698228,
"learning_rate": 1.96755173070673e-06,
"loss": 1.0127,
"step": 616
},
{
"epoch": 0.25,
"grad_norm": 2.878203458977944,
"learning_rate": 1.967443347756363e-06,
"loss": 0.9435,
"step": 617
},
{
"epoch": 0.25,
"grad_norm": 2.750859034924024,
"learning_rate": 1.96733478709376e-06,
"loss": 0.9774,
"step": 618
},
{
"epoch": 0.25,
"grad_norm": 3.6981150510076826,
"learning_rate": 1.967226048738862e-06,
"loss": 0.9304,
"step": 619
},
{
"epoch": 0.25,
"grad_norm": 3.351065815048634,
"learning_rate": 1.967117132711644e-06,
"loss": 0.8762,
"step": 620
},
{
"epoch": 0.25,
"grad_norm": 3.3642932470003775,
"learning_rate": 1.9670080390321127e-06,
"loss": 0.8915,
"step": 621
},
{
"epoch": 0.25,
"grad_norm": 3.2248070124347477,
"learning_rate": 1.966898767720309e-06,
"loss": 0.9212,
"step": 622
},
{
"epoch": 0.25,
"grad_norm": 4.0915281394243745,
"learning_rate": 1.9667893187963033e-06,
"loss": 0.9122,
"step": 623
},
{
"epoch": 0.25,
"grad_norm": 2.9017434367992774,
"learning_rate": 1.9666796922802016e-06,
"loss": 0.9187,
"step": 624
},
{
"epoch": 0.26,
"grad_norm": 3.0476445747686927,
"learning_rate": 1.9665698881921418e-06,
"loss": 0.9412,
"step": 625
},
{
"epoch": 0.26,
"grad_norm": 3.324412248710147,
"learning_rate": 1.9664599065522936e-06,
"loss": 0.949,
"step": 626
},
{
"epoch": 0.26,
"grad_norm": 3.5566676953726795,
"learning_rate": 1.9663497473808597e-06,
"loss": 0.8767,
"step": 627
},
{
"epoch": 0.26,
"grad_norm": 2.825090406946847,
"learning_rate": 1.966239410698076e-06,
"loss": 0.9363,
"step": 628
},
{
"epoch": 0.26,
"grad_norm": 3.7410431282953964,
"learning_rate": 1.966128896524211e-06,
"loss": 0.953,
"step": 629
},
{
"epoch": 0.26,
"grad_norm": 3.049191060082822,
"learning_rate": 1.9660182048795635e-06,
"loss": 0.8877,
"step": 630
},
{
"epoch": 0.26,
"grad_norm": 2.8249728989658687,
"learning_rate": 1.9659073357844683e-06,
"loss": 0.9238,
"step": 631
},
{
"epoch": 0.26,
"grad_norm": 2.8390523314205502,
"learning_rate": 1.965796289259291e-06,
"loss": 0.9546,
"step": 632
},
{
"epoch": 0.26,
"grad_norm": 3.581019905564975,
"learning_rate": 1.9656850653244297e-06,
"loss": 0.9377,
"step": 633
},
{
"epoch": 0.26,
"grad_norm": 2.7773844697697716,
"learning_rate": 1.965573664000316e-06,
"loss": 0.9671,
"step": 634
},
{
"epoch": 0.26,
"grad_norm": 2.8955031214791043,
"learning_rate": 1.965462085307413e-06,
"loss": 0.9162,
"step": 635
},
{
"epoch": 0.26,
"grad_norm": 2.9379324682864327,
"learning_rate": 1.965350329266217e-06,
"loss": 0.9384,
"step": 636
},
{
"epoch": 0.26,
"grad_norm": 4.377796053300959,
"learning_rate": 1.9652383958972564e-06,
"loss": 0.8977,
"step": 637
},
{
"epoch": 0.26,
"grad_norm": 2.84460597316765,
"learning_rate": 1.965126285221093e-06,
"loss": 0.931,
"step": 638
},
{
"epoch": 0.26,
"grad_norm": 4.830378318141518,
"learning_rate": 1.9650139972583206e-06,
"loss": 0.9111,
"step": 639
},
{
"epoch": 0.26,
"grad_norm": 2.8697211607422184,
"learning_rate": 1.9649015320295658e-06,
"loss": 0.9137,
"step": 640
},
{
"epoch": 0.26,
"grad_norm": 2.746739447831184,
"learning_rate": 1.9647888895554872e-06,
"loss": 0.8878,
"step": 641
},
{
"epoch": 0.26,
"grad_norm": 2.938680690054702,
"learning_rate": 1.964676069856777e-06,
"loss": 0.911,
"step": 642
},
{
"epoch": 0.26,
"grad_norm": 3.162352856662404,
"learning_rate": 1.9645630729541594e-06,
"loss": 0.9033,
"step": 643
},
{
"epoch": 0.26,
"grad_norm": 3.005459473016203,
"learning_rate": 1.9644498988683906e-06,
"loss": 0.9015,
"step": 644
},
{
"epoch": 0.26,
"grad_norm": 3.0487990192913257,
"learning_rate": 1.9643365476202595e-06,
"loss": 0.942,
"step": 645
},
{
"epoch": 0.26,
"grad_norm": 3.201109510449628,
"learning_rate": 1.9642230192305886e-06,
"loss": 0.9294,
"step": 646
},
{
"epoch": 0.26,
"grad_norm": 3.5453539824427938,
"learning_rate": 1.964109313720232e-06,
"loss": 0.9645,
"step": 647
},
{
"epoch": 0.26,
"grad_norm": 2.5814465472548416,
"learning_rate": 1.963995431110077e-06,
"loss": 0.9262,
"step": 648
},
{
"epoch": 0.26,
"grad_norm": 4.448002248254465,
"learning_rate": 1.963881371421042e-06,
"loss": 0.8457,
"step": 649
},
{
"epoch": 0.27,
"grad_norm": 3.2768523860371745,
"learning_rate": 1.96376713467408e-06,
"loss": 0.9545,
"step": 650
},
{
"epoch": 0.27,
"grad_norm": 2.9477530301855195,
"learning_rate": 1.963652720890175e-06,
"loss": 0.9588,
"step": 651
},
{
"epoch": 0.27,
"grad_norm": 2.69589301775634,
"learning_rate": 1.9635381300903432e-06,
"loss": 0.9813,
"step": 652
},
{
"epoch": 0.27,
"grad_norm": 2.928037428517663,
"learning_rate": 1.963423362295635e-06,
"loss": 0.9704,
"step": 653
},
{
"epoch": 0.27,
"grad_norm": 3.949855661724315,
"learning_rate": 1.963308417527133e-06,
"loss": 0.928,
"step": 654
},
{
"epoch": 0.27,
"grad_norm": 3.4503437589671466,
"learning_rate": 1.9631932958059506e-06,
"loss": 0.907,
"step": 655
},
{
"epoch": 0.27,
"grad_norm": 3.225944215545616,
"learning_rate": 1.963077997153235e-06,
"loss": 0.9221,
"step": 656
},
{
"epoch": 0.27,
"grad_norm": 3.3812282757604595,
"learning_rate": 1.962962521590166e-06,
"loss": 0.9505,
"step": 657
},
{
"epoch": 0.27,
"grad_norm": 3.6464110693826037,
"learning_rate": 1.9628468691379556e-06,
"loss": 0.8929,
"step": 658
},
{
"epoch": 0.27,
"grad_norm": 3.567484205914664,
"learning_rate": 1.962731039817848e-06,
"loss": 0.9099,
"step": 659
},
{
"epoch": 0.27,
"grad_norm": 3.7591910104703183,
"learning_rate": 1.96261503365112e-06,
"loss": 0.9238,
"step": 660
},
{
"epoch": 0.27,
"grad_norm": 2.9024379272152347,
"learning_rate": 1.962498850659082e-06,
"loss": 0.8869,
"step": 661
},
{
"epoch": 0.27,
"grad_norm": 2.793287578961247,
"learning_rate": 1.962382490863075e-06,
"loss": 1.0607,
"step": 662
},
{
"epoch": 0.27,
"grad_norm": 4.352063327575159,
"learning_rate": 1.9622659542844745e-06,
"loss": 0.9005,
"step": 663
},
{
"epoch": 0.27,
"grad_norm": 3.629022682583476,
"learning_rate": 1.9621492409446862e-06,
"loss": 0.9202,
"step": 664
},
{
"epoch": 0.27,
"grad_norm": 3.25386796826463,
"learning_rate": 1.9620323508651504e-06,
"loss": 0.8861,
"step": 665
},
{
"epoch": 0.27,
"grad_norm": 3.618202804683061,
"learning_rate": 1.9619152840673385e-06,
"loss": 0.9484,
"step": 666
},
{
"epoch": 0.27,
"grad_norm": 4.141916568039785,
"learning_rate": 1.961798040572755e-06,
"loss": 0.9419,
"step": 667
},
{
"epoch": 0.27,
"grad_norm": 2.7511722282689735,
"learning_rate": 1.9616806204029363e-06,
"loss": 1.025,
"step": 668
},
{
"epoch": 0.27,
"grad_norm": 3.0028885430127867,
"learning_rate": 1.961563023579452e-06,
"loss": 0.8952,
"step": 669
},
{
"epoch": 0.27,
"grad_norm": 2.8065377189776806,
"learning_rate": 1.961445250123904e-06,
"loss": 0.9603,
"step": 670
},
{
"epoch": 0.27,
"grad_norm": 3.9406301575323646,
"learning_rate": 1.961327300057926e-06,
"loss": 0.9534,
"step": 671
},
{
"epoch": 0.27,
"grad_norm": 2.8991885146521676,
"learning_rate": 1.9612091734031844e-06,
"loss": 0.9409,
"step": 672
},
{
"epoch": 0.27,
"grad_norm": 5.186359154120154,
"learning_rate": 1.961090870181379e-06,
"loss": 0.8957,
"step": 673
},
{
"epoch": 0.28,
"grad_norm": 2.9824064851768886,
"learning_rate": 1.96097239041424e-06,
"loss": 0.957,
"step": 674
},
{
"epoch": 0.28,
"grad_norm": 5.0694117945571575,
"learning_rate": 1.9608537341235324e-06,
"loss": 0.9048,
"step": 675
},
{
"epoch": 0.28,
"grad_norm": 3.20268628504309,
"learning_rate": 1.960734901331052e-06,
"loss": 0.8879,
"step": 676
},
{
"epoch": 0.28,
"grad_norm": 3.1542624508516406,
"learning_rate": 1.9606158920586273e-06,
"loss": 0.9587,
"step": 677
},
{
"epoch": 0.28,
"grad_norm": 3.057228571031918,
"learning_rate": 1.96049670632812e-06,
"loss": 0.9451,
"step": 678
},
{
"epoch": 0.28,
"grad_norm": 3.188452246096376,
"learning_rate": 1.960377344161423e-06,
"loss": 0.8829,
"step": 679
},
{
"epoch": 0.28,
"grad_norm": 3.122732218184677,
"learning_rate": 1.960257805580463e-06,
"loss": 0.9486,
"step": 680
},
{
"epoch": 0.28,
"grad_norm": 3.845565588136525,
"learning_rate": 1.9601380906071977e-06,
"loss": 0.8822,
"step": 681
},
{
"epoch": 0.28,
"grad_norm": 2.930779487365368,
"learning_rate": 1.9600181992636186e-06,
"loss": 0.9291,
"step": 682
},
{
"epoch": 0.28,
"grad_norm": 2.8057092825066197,
"learning_rate": 1.959898131571748e-06,
"loss": 0.9815,
"step": 683
},
{
"epoch": 0.28,
"grad_norm": 2.6866673472927887,
"learning_rate": 1.9597778875536413e-06,
"loss": 0.9191,
"step": 684
},
{
"epoch": 0.28,
"grad_norm": 2.835657221446559,
"learning_rate": 1.959657467231388e-06,
"loss": 0.9261,
"step": 685
},
{
"epoch": 0.28,
"grad_norm": 3.8847583712865856,
"learning_rate": 1.9595368706271065e-06,
"loss": 0.9532,
"step": 686
},
{
"epoch": 0.28,
"grad_norm": 2.871564407010368,
"learning_rate": 1.9594160977629506e-06,
"loss": 0.9185,
"step": 687
},
{
"epoch": 0.28,
"grad_norm": 2.843621578822057,
"learning_rate": 1.959295148661105e-06,
"loss": 0.848,
"step": 688
},
{
"epoch": 0.28,
"grad_norm": 3.4347511717212833,
"learning_rate": 1.959174023343788e-06,
"loss": 0.8474,
"step": 689
},
{
"epoch": 0.28,
"grad_norm": 3.015401255341506,
"learning_rate": 1.9590527218332483e-06,
"loss": 0.8677,
"step": 690
},
{
"epoch": 0.28,
"grad_norm": 4.4542742912852455,
"learning_rate": 1.9589312441517687e-06,
"loss": 0.9072,
"step": 691
},
{
"epoch": 0.28,
"grad_norm": 3.570117291692264,
"learning_rate": 1.9588095903216638e-06,
"loss": 0.8808,
"step": 692
},
{
"epoch": 0.28,
"grad_norm": 3.3676636637258692,
"learning_rate": 1.9586877603652805e-06,
"loss": 0.9311,
"step": 693
},
{
"epoch": 0.28,
"grad_norm": 4.4714878755260345,
"learning_rate": 1.9585657543049973e-06,
"loss": 0.9357,
"step": 694
},
{
"epoch": 0.28,
"grad_norm": 3.0738360710228623,
"learning_rate": 1.9584435721632265e-06,
"loss": 0.9216,
"step": 695
},
{
"epoch": 0.28,
"grad_norm": 3.742836508182823,
"learning_rate": 1.9583212139624125e-06,
"loss": 1.038,
"step": 696
},
{
"epoch": 0.28,
"grad_norm": 2.7736266856056533,
"learning_rate": 1.9581986797250307e-06,
"loss": 0.8783,
"step": 697
},
{
"epoch": 0.28,
"grad_norm": 3.85475355691834,
"learning_rate": 1.9580759694735904e-06,
"loss": 0.9225,
"step": 698
},
{
"epoch": 0.29,
"grad_norm": 3.0982007282599087,
"learning_rate": 1.957953083230632e-06,
"loss": 0.9252,
"step": 699
},
{
"epoch": 0.29,
"grad_norm": 2.6375878825298904,
"learning_rate": 1.9578300210187292e-06,
"loss": 0.9617,
"step": 700
},
{
"epoch": 0.29,
"eval_loss": 0.9240374565124512,
"eval_runtime": 465.3654,
"eval_samples_per_second": 74.892,
"eval_steps_per_second": 4.682,
"step": 700
},
{
"epoch": 0.29,
"grad_norm": 2.968986242724906,
"learning_rate": 1.957706782860488e-06,
"loss": 0.9434,
"step": 701
},
{
"epoch": 0.29,
"grad_norm": 3.009729149164253,
"learning_rate": 1.9575833687785456e-06,
"loss": 0.9259,
"step": 702
},
{
"epoch": 0.29,
"grad_norm": 3.0258938100391033,
"learning_rate": 1.957459778795572e-06,
"loss": 0.8815,
"step": 703
},
{
"epoch": 0.29,
"grad_norm": 3.1464555048319527,
"learning_rate": 1.957336012934271e-06,
"loss": 0.9375,
"step": 704
},
{
"epoch": 0.29,
"grad_norm": 3.12938181554599,
"learning_rate": 1.9572120712173765e-06,
"loss": 0.9477,
"step": 705
},
{
"epoch": 0.29,
"grad_norm": 4.840610189475752,
"learning_rate": 1.957087953667656e-06,
"loss": 0.9766,
"step": 706
},
{
"epoch": 0.29,
"grad_norm": 5.459688903071921,
"learning_rate": 1.956963660307909e-06,
"loss": 0.8955,
"step": 707
},
{
"epoch": 0.29,
"grad_norm": 2.920362197707016,
"learning_rate": 1.956839191160967e-06,
"loss": 0.947,
"step": 708
},
{
"epoch": 0.29,
"grad_norm": 3.3363125441118484,
"learning_rate": 1.9567145462496946e-06,
"loss": 0.9966,
"step": 709
},
{
"epoch": 0.29,
"grad_norm": 3.0586976864436806,
"learning_rate": 1.956589725596988e-06,
"loss": 0.927,
"step": 710
},
{
"epoch": 0.29,
"grad_norm": 2.7152565624916165,
"learning_rate": 1.9564647292257755e-06,
"loss": 0.9814,
"step": 711
},
{
"epoch": 0.29,
"grad_norm": 3.1556388127647588,
"learning_rate": 1.956339557159018e-06,
"loss": 0.8821,
"step": 712
},
{
"epoch": 0.29,
"grad_norm": 3.07074275178268,
"learning_rate": 1.9562142094197093e-06,
"loss": 0.9858,
"step": 713
},
{
"epoch": 0.29,
"grad_norm": 3.1270976862509627,
"learning_rate": 1.956088686030874e-06,
"loss": 0.9503,
"step": 714
},
{
"epoch": 0.29,
"grad_norm": 2.590900794602496,
"learning_rate": 1.9559629870155707e-06,
"loss": 0.9138,
"step": 715
},
{
"epoch": 0.29,
"grad_norm": 2.884115527466902,
"learning_rate": 1.955837112396889e-06,
"loss": 0.9704,
"step": 716
},
{
"epoch": 0.29,
"grad_norm": 3.530604568494635,
"learning_rate": 1.9557110621979506e-06,
"loss": 0.9607,
"step": 717
},
{
"epoch": 0.29,
"grad_norm": 3.032332916443812,
"learning_rate": 1.9555848364419107e-06,
"loss": 0.9469,
"step": 718
},
{
"epoch": 0.29,
"grad_norm": 2.648970804468958,
"learning_rate": 1.9554584351519563e-06,
"loss": 0.953,
"step": 719
},
{
"epoch": 0.29,
"grad_norm": 3.7817470337436174,
"learning_rate": 1.9553318583513055e-06,
"loss": 0.925,
"step": 720
},
{
"epoch": 0.29,
"grad_norm": 2.986459309250747,
"learning_rate": 1.95520510606321e-06,
"loss": 0.9452,
"step": 721
},
{
"epoch": 0.29,
"grad_norm": 3.41688705070573,
"learning_rate": 1.9550781783109534e-06,
"loss": 0.963,
"step": 722
},
{
"epoch": 0.3,
"grad_norm": 2.8453381999630003,
"learning_rate": 1.9549510751178507e-06,
"loss": 0.9856,
"step": 723
},
{
"epoch": 0.3,
"grad_norm": 3.6678728945870276,
"learning_rate": 1.954823796507251e-06,
"loss": 0.915,
"step": 724
},
{
"epoch": 0.3,
"grad_norm": 2.975696654173514,
"learning_rate": 1.9546963425025334e-06,
"loss": 0.8818,
"step": 725
},
{
"epoch": 0.3,
"grad_norm": 2.681137749056913,
"learning_rate": 1.954568713127111e-06,
"loss": 0.9054,
"step": 726
},
{
"epoch": 0.3,
"grad_norm": 3.332952849764895,
"learning_rate": 1.954440908404428e-06,
"loss": 0.9351,
"step": 727
},
{
"epoch": 0.3,
"grad_norm": 3.8838573542934935,
"learning_rate": 1.9543129283579607e-06,
"loss": 1.0216,
"step": 728
},
{
"epoch": 0.3,
"grad_norm": 2.873826524379971,
"learning_rate": 1.954184773011219e-06,
"loss": 0.904,
"step": 729
},
{
"epoch": 0.3,
"grad_norm": 3.3093206631379095,
"learning_rate": 1.954056442387744e-06,
"loss": 0.9941,
"step": 730
},
{
"epoch": 0.3,
"grad_norm": 2.6562819190542934,
"learning_rate": 1.9539279365111083e-06,
"loss": 0.9399,
"step": 731
},
{
"epoch": 0.3,
"grad_norm": 3.7709572601405923,
"learning_rate": 1.9537992554049184e-06,
"loss": 0.9122,
"step": 732
},
{
"epoch": 0.3,
"grad_norm": 3.37182594305233,
"learning_rate": 1.9536703990928114e-06,
"loss": 0.9517,
"step": 733
},
{
"epoch": 0.3,
"grad_norm": 3.3042690064415283,
"learning_rate": 1.953541367598458e-06,
"loss": 0.8812,
"step": 734
},
{
"epoch": 0.3,
"grad_norm": 3.3738609002525304,
"learning_rate": 1.9534121609455593e-06,
"loss": 0.8896,
"step": 735
},
{
"epoch": 0.3,
"grad_norm": 3.7023723864539404,
"learning_rate": 1.9532827791578504e-06,
"loss": 0.9469,
"step": 736
},
{
"epoch": 0.3,
"grad_norm": 3.1539445047611503,
"learning_rate": 1.9531532222590974e-06,
"loss": 0.9596,
"step": 737
},
{
"epoch": 0.3,
"grad_norm": 4.4744477558828954,
"learning_rate": 1.953023490273099e-06,
"loss": 0.9606,
"step": 738
},
{
"epoch": 0.3,
"grad_norm": 3.7441690526655806,
"learning_rate": 1.9528935832236867e-06,
"loss": 0.8857,
"step": 739
},
{
"epoch": 0.3,
"grad_norm": 4.187852317105576,
"learning_rate": 1.9527635011347225e-06,
"loss": 0.8978,
"step": 740
},
{
"epoch": 0.3,
"grad_norm": 3.366534647410163,
"learning_rate": 1.952633244030102e-06,
"loss": 0.8786,
"step": 741
},
{
"epoch": 0.3,
"grad_norm": 3.296473415836188,
"learning_rate": 1.9525028119337523e-06,
"loss": 0.9376,
"step": 742
},
{
"epoch": 0.3,
"grad_norm": 2.879758920096026,
"learning_rate": 1.9523722048696327e-06,
"loss": 0.9135,
"step": 743
},
{
"epoch": 0.3,
"grad_norm": 4.461593711330092,
"learning_rate": 1.9522414228617356e-06,
"loss": 0.8956,
"step": 744
},
{
"epoch": 0.3,
"grad_norm": 3.7961717810967235,
"learning_rate": 1.9521104659340834e-06,
"loss": 0.8922,
"step": 745
},
{
"epoch": 0.3,
"grad_norm": 3.2191784497103395,
"learning_rate": 1.951979334110733e-06,
"loss": 0.9617,
"step": 746
},
{
"epoch": 0.3,
"grad_norm": 3.262866313573699,
"learning_rate": 1.951848027415772e-06,
"loss": 0.8943,
"step": 747
},
{
"epoch": 0.31,
"grad_norm": 3.3235601095880942,
"learning_rate": 1.9517165458733203e-06,
"loss": 0.9194,
"step": 748
},
{
"epoch": 0.31,
"grad_norm": 3.2868846125042848,
"learning_rate": 1.95158488950753e-06,
"loss": 0.9331,
"step": 749
},
{
"epoch": 0.31,
"grad_norm": 5.168977016790821,
"learning_rate": 1.951453058342586e-06,
"loss": 0.933,
"step": 750
},
{
"epoch": 0.31,
"grad_norm": 2.9650423942591058,
"learning_rate": 1.951321052402704e-06,
"loss": 0.9172,
"step": 751
},
{
"epoch": 0.31,
"grad_norm": 3.0183592266169885,
"learning_rate": 1.951188871712133e-06,
"loss": 0.9381,
"step": 752
},
{
"epoch": 0.31,
"grad_norm": 3.3250907269528183,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.9801,
"step": 753
},
{
"epoch": 0.31,
"grad_norm": 2.933036821637405,
"learning_rate": 1.950923986176078e-06,
"loss": 0.9098,
"step": 754
},
{
"epoch": 0.31,
"grad_norm": 2.88502982273918,
"learning_rate": 1.950791281379252e-06,
"loss": 0.9634,
"step": 755
},
{
"epoch": 0.31,
"grad_norm": 4.00738533137771,
"learning_rate": 1.9506584019290516e-06,
"loss": 0.8459,
"step": 756
},
{
"epoch": 0.31,
"grad_norm": 2.5796433757037502,
"learning_rate": 1.950525347849886e-06,
"loss": 0.8936,
"step": 757
},
{
"epoch": 0.31,
"grad_norm": 3.0309073897400047,
"learning_rate": 1.9503921191661962e-06,
"loss": 0.9117,
"step": 758
},
{
"epoch": 0.31,
"grad_norm": 3.0021198502823547,
"learning_rate": 1.9502587159024556e-06,
"loss": 0.9163,
"step": 759
},
{
"epoch": 0.31,
"grad_norm": 2.8782192046348682,
"learning_rate": 1.9501251380831694e-06,
"loss": 0.9148,
"step": 760
},
{
"epoch": 0.31,
"grad_norm": 2.4782444459900344,
"learning_rate": 1.949991385732875e-06,
"loss": 0.9449,
"step": 761
},
{
"epoch": 0.31,
"grad_norm": 3.158988842037016,
"learning_rate": 1.9498574588761406e-06,
"loss": 0.945,
"step": 762
},
{
"epoch": 0.31,
"grad_norm": 3.28813394589638,
"learning_rate": 1.949723357537569e-06,
"loss": 0.8698,
"step": 763
},
{
"epoch": 0.31,
"grad_norm": 3.410665316213657,
"learning_rate": 1.9495890817417934e-06,
"loss": 1.0123,
"step": 764
},
{
"epoch": 0.31,
"grad_norm": 3.0468867701344555,
"learning_rate": 1.949454631513478e-06,
"loss": 0.8656,
"step": 765
},
{
"epoch": 0.31,
"grad_norm": 3.488436998318971,
"learning_rate": 1.949320006877322e-06,
"loss": 0.948,
"step": 766
},
{
"epoch": 0.31,
"grad_norm": 4.014737329450175,
"learning_rate": 1.9491852078580543e-06,
"loss": 0.8302,
"step": 767
},
{
"epoch": 0.31,
"grad_norm": 4.777679925110483,
"learning_rate": 1.949050234480436e-06,
"loss": 0.9077,
"step": 768
},
{
"epoch": 0.31,
"grad_norm": 4.478463454212438,
"learning_rate": 1.9489150867692613e-06,
"loss": 0.9669,
"step": 769
},
{
"epoch": 0.31,
"grad_norm": 4.125816325897733,
"learning_rate": 1.948779764749356e-06,
"loss": 0.8935,
"step": 770
},
{
"epoch": 0.31,
"grad_norm": 3.1838456907806876,
"learning_rate": 1.948644268445577e-06,
"loss": 0.9022,
"step": 771
},
{
"epoch": 0.32,
"grad_norm": 3.385530336195329,
"learning_rate": 1.9485085978828144e-06,
"loss": 0.9501,
"step": 772
},
{
"epoch": 0.32,
"grad_norm": 3.401502650969205,
"learning_rate": 1.94837275308599e-06,
"loss": 0.9342,
"step": 773
},
{
"epoch": 0.32,
"grad_norm": 3.1224307830478493,
"learning_rate": 1.948236734080057e-06,
"loss": 0.9201,
"step": 774
},
{
"epoch": 0.32,
"grad_norm": 3.2940169461424618,
"learning_rate": 1.9481005408900024e-06,
"loss": 0.9441,
"step": 775
},
{
"epoch": 0.32,
"grad_norm": 2.9829738733486213,
"learning_rate": 1.947964173540842e-06,
"loss": 0.8895,
"step": 776
},
{
"epoch": 0.32,
"grad_norm": 5.961581653376868,
"learning_rate": 1.9478276320576267e-06,
"loss": 0.8568,
"step": 777
},
{
"epoch": 0.32,
"grad_norm": 3.0254512007043655,
"learning_rate": 1.9476909164654383e-06,
"loss": 0.8885,
"step": 778
},
{
"epoch": 0.32,
"grad_norm": 2.960372612592654,
"learning_rate": 1.94755402678939e-06,
"loss": 0.9738,
"step": 779
},
{
"epoch": 0.32,
"grad_norm": 4.90596497357267,
"learning_rate": 1.9474169630546273e-06,
"loss": 0.8866,
"step": 780
},
{
"epoch": 0.32,
"grad_norm": 3.1261602906199704,
"learning_rate": 1.9472797252863276e-06,
"loss": 0.892,
"step": 781
},
{
"epoch": 0.32,
"grad_norm": 2.888913461010292,
"learning_rate": 1.9471423135097017e-06,
"loss": 0.9326,
"step": 782
},
{
"epoch": 0.32,
"grad_norm": 3.497996165893998,
"learning_rate": 1.9470047277499897e-06,
"loss": 0.8648,
"step": 783
},
{
"epoch": 0.32,
"grad_norm": 3.4235340691262466,
"learning_rate": 1.946866968032466e-06,
"loss": 0.8952,
"step": 784
},
{
"epoch": 0.32,
"grad_norm": 3.5496888740155326,
"learning_rate": 1.9467290343824354e-06,
"loss": 0.88,
"step": 785
},
{
"epoch": 0.32,
"grad_norm": 4.196719887678321,
"learning_rate": 1.946590926825236e-06,
"loss": 0.8919,
"step": 786
},
{
"epoch": 0.32,
"grad_norm": 2.644667271811777,
"learning_rate": 1.9464526453862364e-06,
"loss": 0.8608,
"step": 787
},
{
"epoch": 0.32,
"grad_norm": 3.1150070285348783,
"learning_rate": 1.9463141900908387e-06,
"loss": 0.9613,
"step": 788
},
{
"epoch": 0.32,
"grad_norm": 5.980287349960787,
"learning_rate": 1.9461755609644753e-06,
"loss": 0.9531,
"step": 789
},
{
"epoch": 0.32,
"grad_norm": 4.008070156945519,
"learning_rate": 1.946036758032612e-06,
"loss": 0.9352,
"step": 790
},
{
"epoch": 0.32,
"grad_norm": 2.8177055489320333,
"learning_rate": 1.9458977813207453e-06,
"loss": 0.8514,
"step": 791
},
{
"epoch": 0.32,
"grad_norm": 3.2887493053849584,
"learning_rate": 1.945758630854405e-06,
"loss": 0.9596,
"step": 792
},
{
"epoch": 0.32,
"grad_norm": 5.998456879210231,
"learning_rate": 1.945619306659151e-06,
"loss": 0.8963,
"step": 793
},
{
"epoch": 0.32,
"grad_norm": 2.888100187192589,
"learning_rate": 1.945479808760577e-06,
"loss": 0.9042,
"step": 794
},
{
"epoch": 0.32,
"grad_norm": 3.8389322755371436,
"learning_rate": 1.9453401371843072e-06,
"loss": 0.8951,
"step": 795
},
{
"epoch": 0.32,
"grad_norm": 2.912698226501802,
"learning_rate": 1.9452002919559986e-06,
"loss": 0.8971,
"step": 796
},
{
"epoch": 0.33,
"grad_norm": 3.4330450386426445,
"learning_rate": 1.94506027310134e-06,
"loss": 0.9946,
"step": 797
},
{
"epoch": 0.33,
"grad_norm": 3.5083233746822233,
"learning_rate": 1.9449200806460505e-06,
"loss": 0.9699,
"step": 798
},
{
"epoch": 0.33,
"grad_norm": 3.2990883224993457,
"learning_rate": 1.944779714615884e-06,
"loss": 0.8813,
"step": 799
},
{
"epoch": 0.33,
"grad_norm": 3.7735072104299987,
"learning_rate": 1.944639175036624e-06,
"loss": 0.933,
"step": 800
},
{
"epoch": 0.33,
"eval_loss": 0.9190006256103516,
"eval_runtime": 465.4719,
"eval_samples_per_second": 74.875,
"eval_steps_per_second": 4.681,
"step": 800
},
{
"epoch": 0.33,
"grad_norm": 4.90813324700834,
"learning_rate": 1.9444984619340863e-06,
"loss": 0.9422,
"step": 801
},
{
"epoch": 0.33,
"grad_norm": 3.2876757142263626,
"learning_rate": 1.9443575753341197e-06,
"loss": 0.9713,
"step": 802
},
{
"epoch": 0.33,
"grad_norm": 2.849460060691846,
"learning_rate": 1.944216515262603e-06,
"loss": 0.9491,
"step": 803
},
{
"epoch": 0.33,
"grad_norm": 2.713385089841564,
"learning_rate": 1.944075281745449e-06,
"loss": 0.9036,
"step": 804
},
{
"epoch": 0.33,
"grad_norm": 3.0515622748357645,
"learning_rate": 1.9439338748086e-06,
"loss": 0.8117,
"step": 805
},
{
"epoch": 0.33,
"grad_norm": 3.491099711578753,
"learning_rate": 1.943792294478033e-06,
"loss": 0.8765,
"step": 806
},
{
"epoch": 0.33,
"grad_norm": 3.738930715644742,
"learning_rate": 1.943650540779754e-06,
"loss": 0.8908,
"step": 807
},
{
"epoch": 0.33,
"grad_norm": 2.56362981508565,
"learning_rate": 1.943508613739802e-06,
"loss": 0.9111,
"step": 808
},
{
"epoch": 0.33,
"grad_norm": 3.454333663514252,
"learning_rate": 1.943366513384249e-06,
"loss": 0.9435,
"step": 809
},
{
"epoch": 0.33,
"grad_norm": 2.789865573446439,
"learning_rate": 1.943224239739197e-06,
"loss": 0.8207,
"step": 810
},
{
"epoch": 0.33,
"grad_norm": 2.7532202763687534,
"learning_rate": 1.943081792830781e-06,
"loss": 0.9444,
"step": 811
},
{
"epoch": 0.33,
"grad_norm": 3.797927062337903,
"learning_rate": 1.9429391726851674e-06,
"loss": 0.8896,
"step": 812
},
{
"epoch": 0.33,
"grad_norm": 2.973899016827078,
"learning_rate": 1.9427963793285543e-06,
"loss": 0.9012,
"step": 813
},
{
"epoch": 0.33,
"grad_norm": 3.5516368378211163,
"learning_rate": 1.942653412787172e-06,
"loss": 0.8502,
"step": 814
},
{
"epoch": 0.33,
"grad_norm": 3.4696054454170433,
"learning_rate": 1.942510273087282e-06,
"loss": 1.0133,
"step": 815
},
{
"epoch": 0.33,
"grad_norm": 3.262482865154964,
"learning_rate": 1.9423669602551787e-06,
"loss": 0.8308,
"step": 816
},
{
"epoch": 0.33,
"grad_norm": 2.976015026817646,
"learning_rate": 1.9422234743171868e-06,
"loss": 0.8775,
"step": 817
},
{
"epoch": 0.33,
"grad_norm": 3.109863675741082,
"learning_rate": 1.9420798152996643e-06,
"loss": 0.9423,
"step": 818
},
{
"epoch": 0.33,
"grad_norm": 3.1420806799283083,
"learning_rate": 1.9419359832289998e-06,
"loss": 0.9266,
"step": 819
},
{
"epoch": 0.33,
"grad_norm": 3.535208769925014,
"learning_rate": 1.9417919781316146e-06,
"loss": 0.9484,
"step": 820
},
{
"epoch": 0.34,
"grad_norm": 2.725012114255836,
"learning_rate": 1.941647800033961e-06,
"loss": 0.8517,
"step": 821
},
{
"epoch": 0.34,
"grad_norm": 2.6990111129485834,
"learning_rate": 1.941503448962524e-06,
"loss": 0.9109,
"step": 822
},
{
"epoch": 0.34,
"grad_norm": 2.834466691504805,
"learning_rate": 1.941358924943819e-06,
"loss": 0.8917,
"step": 823
},
{
"epoch": 0.34,
"grad_norm": 4.326597114566774,
"learning_rate": 1.941214228004395e-06,
"loss": 0.9448,
"step": 824
},
{
"epoch": 0.34,
"grad_norm": 3.572608706564346,
"learning_rate": 1.941069358170831e-06,
"loss": 0.9013,
"step": 825
},
{
"epoch": 0.34,
"grad_norm": 2.9578636673243675,
"learning_rate": 1.9409243154697388e-06,
"loss": 0.9114,
"step": 826
},
{
"epoch": 0.34,
"grad_norm": 2.7788743958565383,
"learning_rate": 1.9407790999277618e-06,
"loss": 0.9099,
"step": 827
},
{
"epoch": 0.34,
"grad_norm": 2.652519338421229,
"learning_rate": 1.9406337115715745e-06,
"loss": 0.8723,
"step": 828
},
{
"epoch": 0.34,
"grad_norm": 2.9837828256386225,
"learning_rate": 1.9404881504278845e-06,
"loss": 0.8666,
"step": 829
},
{
"epoch": 0.34,
"grad_norm": 2.733763060389568,
"learning_rate": 1.9403424165234295e-06,
"loss": 0.9341,
"step": 830
},
{
"epoch": 0.34,
"grad_norm": 3.347602366194662,
"learning_rate": 1.9401965098849805e-06,
"loss": 0.9593,
"step": 831
},
{
"epoch": 0.34,
"grad_norm": 3.9738701792798095,
"learning_rate": 1.9400504305393387e-06,
"loss": 0.916,
"step": 832
},
{
"epoch": 0.34,
"grad_norm": 2.954360561817526,
"learning_rate": 1.9399041785133384e-06,
"loss": 0.936,
"step": 833
},
{
"epoch": 0.34,
"grad_norm": 2.64402699652811,
"learning_rate": 1.9397577538338446e-06,
"loss": 0.9926,
"step": 834
},
{
"epoch": 0.34,
"grad_norm": 3.3423379160381557,
"learning_rate": 1.939611156527755e-06,
"loss": 0.9005,
"step": 835
},
{
"epoch": 0.34,
"grad_norm": 3.1155519330158916,
"learning_rate": 1.9394643866219983e-06,
"loss": 0.9097,
"step": 836
},
{
"epoch": 0.34,
"grad_norm": 3.103647028175201,
"learning_rate": 1.9393174441435344e-06,
"loss": 0.8577,
"step": 837
},
{
"epoch": 0.34,
"grad_norm": 3.2998439621605242,
"learning_rate": 1.9391703291193565e-06,
"loss": 0.9516,
"step": 838
},
{
"epoch": 0.34,
"grad_norm": 3.0475241963019135,
"learning_rate": 1.9390230415764877e-06,
"loss": 0.8598,
"step": 839
},
{
"epoch": 0.34,
"grad_norm": 3.5039675136601067,
"learning_rate": 1.938875581541984e-06,
"loss": 0.9243,
"step": 840
},
{
"epoch": 0.34,
"grad_norm": 3.0425392101680284,
"learning_rate": 1.9387279490429325e-06,
"loss": 0.9234,
"step": 841
},
{
"epoch": 0.34,
"grad_norm": 3.1909831909426574,
"learning_rate": 1.938580144106453e-06,
"loss": 0.9326,
"step": 842
},
{
"epoch": 0.34,
"grad_norm": 2.6866357632282525,
"learning_rate": 1.9384321667596953e-06,
"loss": 0.9398,
"step": 843
},
{
"epoch": 0.34,
"grad_norm": 3.080488314658747,
"learning_rate": 1.9382840170298423e-06,
"loss": 0.9437,
"step": 844
},
{
"epoch": 0.34,
"grad_norm": 2.926775356264796,
"learning_rate": 1.9381356949441074e-06,
"loss": 0.888,
"step": 845
},
{
"epoch": 0.35,
"grad_norm": 2.874215013824435,
"learning_rate": 1.937987200529737e-06,
"loss": 0.8858,
"step": 846
},
{
"epoch": 0.35,
"grad_norm": 3.2277566234677137,
"learning_rate": 1.9378385338140078e-06,
"loss": 1.0107,
"step": 847
},
{
"epoch": 0.35,
"grad_norm": 2.7359609389792263,
"learning_rate": 1.9376896948242293e-06,
"loss": 0.9412,
"step": 848
},
{
"epoch": 0.35,
"grad_norm": 2.6435786201609734,
"learning_rate": 1.9375406835877417e-06,
"loss": 0.9268,
"step": 849
},
{
"epoch": 0.35,
"grad_norm": 3.0019662506624343,
"learning_rate": 1.9373915001319177e-06,
"loss": 0.9081,
"step": 850
},
{
"epoch": 0.35,
"grad_norm": 3.0387442834053555,
"learning_rate": 1.9372421444841613e-06,
"loss": 0.8982,
"step": 851
},
{
"epoch": 0.35,
"grad_norm": 3.0357327712185933,
"learning_rate": 1.937092616671907e-06,
"loss": 0.9259,
"step": 852
},
{
"epoch": 0.35,
"grad_norm": 3.084590814637634,
"learning_rate": 1.936942916722623e-06,
"loss": 0.9191,
"step": 853
},
{
"epoch": 0.35,
"grad_norm": 3.000739841469742,
"learning_rate": 1.936793044663808e-06,
"loss": 0.9496,
"step": 854
},
{
"epoch": 0.35,
"grad_norm": 2.9533995247024563,
"learning_rate": 1.936643000522992e-06,
"loss": 0.9269,
"step": 855
},
{
"epoch": 0.35,
"grad_norm": 2.8976876831463003,
"learning_rate": 1.936492784327737e-06,
"loss": 0.9139,
"step": 856
},
{
"epoch": 0.35,
"grad_norm": 3.1557939585702988,
"learning_rate": 1.936342396105637e-06,
"loss": 0.9136,
"step": 857
},
{
"epoch": 0.35,
"grad_norm": 3.0480145143263058,
"learning_rate": 1.9361918358843167e-06,
"loss": 0.9238,
"step": 858
},
{
"epoch": 0.35,
"grad_norm": 3.7651227099805302,
"learning_rate": 1.9360411036914333e-06,
"loss": 0.9527,
"step": 859
},
{
"epoch": 0.35,
"grad_norm": 2.716138894721366,
"learning_rate": 1.9358901995546754e-06,
"loss": 0.9358,
"step": 860
},
{
"epoch": 0.35,
"grad_norm": 3.257797266296916,
"learning_rate": 1.9357391235017625e-06,
"loss": 1.0223,
"step": 861
},
{
"epoch": 0.35,
"grad_norm": 3.633534416529717,
"learning_rate": 1.935587875560446e-06,
"loss": 0.8808,
"step": 862
},
{
"epoch": 0.35,
"grad_norm": 3.069690808485766,
"learning_rate": 1.93543645575851e-06,
"loss": 0.8914,
"step": 863
},
{
"epoch": 0.35,
"grad_norm": 4.000100896385082,
"learning_rate": 1.935284864123768e-06,
"loss": 0.9068,
"step": 864
},
{
"epoch": 0.35,
"grad_norm": 2.8302235493980654,
"learning_rate": 1.9351331006840673e-06,
"loss": 0.8848,
"step": 865
},
{
"epoch": 0.35,
"grad_norm": 3.5005756661554415,
"learning_rate": 1.934981165467285e-06,
"loss": 0.8735,
"step": 866
},
{
"epoch": 0.35,
"grad_norm": 3.8919339447847854,
"learning_rate": 1.934829058501331e-06,
"loss": 0.9086,
"step": 867
},
{
"epoch": 0.35,
"grad_norm": 3.045322176328052,
"learning_rate": 1.9346767798141456e-06,
"loss": 0.9321,
"step": 868
},
{
"epoch": 0.35,
"grad_norm": 2.827071295596593,
"learning_rate": 1.934524329433702e-06,
"loss": 0.8532,
"step": 869
},
{
"epoch": 0.36,
"grad_norm": 3.069425346302677,
"learning_rate": 1.934371707388004e-06,
"loss": 0.9094,
"step": 870
},
{
"epoch": 0.36,
"grad_norm": 3.3701020520735168,
"learning_rate": 1.9342189137050863e-06,
"loss": 0.9639,
"step": 871
},
{
"epoch": 0.36,
"grad_norm": 3.5056543128083426,
"learning_rate": 1.9340659484130175e-06,
"loss": 0.9448,
"step": 872
},
{
"epoch": 0.36,
"grad_norm": 4.522964368511556,
"learning_rate": 1.9339128115398952e-06,
"loss": 0.9055,
"step": 873
},
{
"epoch": 0.36,
"grad_norm": 3.621722127879449,
"learning_rate": 1.9337595031138495e-06,
"loss": 0.9205,
"step": 874
},
{
"epoch": 0.36,
"grad_norm": 2.903397221731881,
"learning_rate": 1.9336060231630423e-06,
"loss": 0.8854,
"step": 875
},
{
"epoch": 0.36,
"grad_norm": 3.960809471416162,
"learning_rate": 1.9334523717156665e-06,
"loss": 0.9402,
"step": 876
},
{
"epoch": 0.36,
"grad_norm": 3.1392860547414236,
"learning_rate": 1.9332985487999472e-06,
"loss": 0.8578,
"step": 877
},
{
"epoch": 0.36,
"grad_norm": 3.341159197931347,
"learning_rate": 1.9331445544441407e-06,
"loss": 0.8866,
"step": 878
},
{
"epoch": 0.36,
"grad_norm": 3.139595724059706,
"learning_rate": 1.9329903886765335e-06,
"loss": 0.9124,
"step": 879
},
{
"epoch": 0.36,
"grad_norm": 3.7763560284218003,
"learning_rate": 1.932836051525446e-06,
"loss": 0.8611,
"step": 880
},
{
"epoch": 0.36,
"grad_norm": 2.568391745941199,
"learning_rate": 1.932681543019228e-06,
"loss": 0.8691,
"step": 881
},
{
"epoch": 0.36,
"grad_norm": 4.319495770406549,
"learning_rate": 1.9325268631862615e-06,
"loss": 0.8388,
"step": 882
},
{
"epoch": 0.36,
"grad_norm": 3.9655736899423406,
"learning_rate": 1.9323720120549606e-06,
"loss": 0.8588,
"step": 883
},
{
"epoch": 0.36,
"grad_norm": 3.3493437362668597,
"learning_rate": 1.9322169896537705e-06,
"loss": 0.9135,
"step": 884
},
{
"epoch": 0.36,
"grad_norm": 2.996088577337598,
"learning_rate": 1.9320617960111664e-06,
"loss": 0.8464,
"step": 885
},
{
"epoch": 0.36,
"grad_norm": 2.9241194385650986,
"learning_rate": 1.9319064311556578e-06,
"loss": 0.8839,
"step": 886
},
{
"epoch": 0.36,
"grad_norm": 3.5915726530358936,
"learning_rate": 1.9317508951157828e-06,
"loss": 0.8154,
"step": 887
},
{
"epoch": 0.36,
"grad_norm": 3.2585027438896703,
"learning_rate": 1.9315951879201135e-06,
"loss": 0.9165,
"step": 888
},
{
"epoch": 0.36,
"grad_norm": 3.083354949266171,
"learning_rate": 1.9314393095972506e-06,
"loss": 0.8856,
"step": 889
},
{
"epoch": 0.36,
"grad_norm": 3.69073263008199,
"learning_rate": 1.9312832601758295e-06,
"loss": 0.8802,
"step": 890
},
{
"epoch": 0.36,
"grad_norm": 2.7032350902702906,
"learning_rate": 1.931127039684514e-06,
"loss": 0.9709,
"step": 891
},
{
"epoch": 0.36,
"grad_norm": 3.172659927650331,
"learning_rate": 1.930970648152001e-06,
"loss": 0.8798,
"step": 892
},
{
"epoch": 0.36,
"grad_norm": 2.7771842193893006,
"learning_rate": 1.930814085607019e-06,
"loss": 0.9319,
"step": 893
},
{
"epoch": 0.36,
"grad_norm": 2.8807266884244793,
"learning_rate": 1.9306573520783267e-06,
"loss": 0.9597,
"step": 894
},
{
"epoch": 0.37,
"grad_norm": 2.8524375115393488,
"learning_rate": 1.930500447594716e-06,
"loss": 0.9838,
"step": 895
},
{
"epoch": 0.37,
"grad_norm": 3.527426024394915,
"learning_rate": 1.9303433721850074e-06,
"loss": 0.9627,
"step": 896
},
{
"epoch": 0.37,
"grad_norm": 2.962966796750454,
"learning_rate": 1.9301861258780557e-06,
"loss": 0.9612,
"step": 897
},
{
"epoch": 0.37,
"grad_norm": 2.659442402260841,
"learning_rate": 1.9300287087027457e-06,
"loss": 0.9515,
"step": 898
},
{
"epoch": 0.37,
"grad_norm": 3.327577755103706,
"learning_rate": 1.929871120687994e-06,
"loss": 0.9189,
"step": 899
},
{
"epoch": 0.37,
"grad_norm": 3.1064643006520556,
"learning_rate": 1.9297133618627475e-06,
"loss": 0.9625,
"step": 900
},
{
"epoch": 0.37,
"eval_loss": 0.9161908030509949,
"eval_runtime": 465.2107,
"eval_samples_per_second": 74.917,
"eval_steps_per_second": 4.684,
"step": 900
},
{
"epoch": 0.37,
"grad_norm": 3.009456200888498,
"learning_rate": 1.9295554322559863e-06,
"loss": 0.8962,
"step": 901
},
{
"epoch": 0.37,
"grad_norm": 2.9052423766382636,
"learning_rate": 1.92939733189672e-06,
"loss": 0.8704,
"step": 902
},
{
"epoch": 0.37,
"grad_norm": 2.7823520796536583,
"learning_rate": 1.9292390608139914e-06,
"loss": 0.9406,
"step": 903
},
{
"epoch": 0.37,
"grad_norm": 3.663354399349838,
"learning_rate": 1.929080619036873e-06,
"loss": 0.9128,
"step": 904
},
{
"epoch": 0.37,
"grad_norm": 3.277214354135901,
"learning_rate": 1.9289220065944696e-06,
"loss": 0.9501,
"step": 905
},
{
"epoch": 0.37,
"grad_norm": 2.6130358459029623,
"learning_rate": 1.9287632235159178e-06,
"loss": 0.9554,
"step": 906
},
{
"epoch": 0.37,
"grad_norm": 2.760415842980232,
"learning_rate": 1.928604269830384e-06,
"loss": 0.9505,
"step": 907
},
{
"epoch": 0.37,
"grad_norm": 3.4775238455215307,
"learning_rate": 1.928445145567067e-06,
"loss": 0.951,
"step": 908
},
{
"epoch": 0.37,
"grad_norm": 2.9419987031784705,
"learning_rate": 1.928285850755197e-06,
"loss": 0.9122,
"step": 909
},
{
"epoch": 0.37,
"grad_norm": 2.730948476263341,
"learning_rate": 1.9281263854240354e-06,
"loss": 0.9627,
"step": 910
},
{
"epoch": 0.37,
"grad_norm": 3.2006175322803783,
"learning_rate": 1.9279667496028744e-06,
"loss": 0.8297,
"step": 911
},
{
"epoch": 0.37,
"grad_norm": 3.0451602619620033,
"learning_rate": 1.927806943321038e-06,
"loss": 0.8603,
"step": 912
},
{
"epoch": 0.37,
"grad_norm": 2.920586922508814,
"learning_rate": 1.927646966607882e-06,
"loss": 0.9045,
"step": 913
},
{
"epoch": 0.37,
"grad_norm": 2.5364177550320925,
"learning_rate": 1.927486819492792e-06,
"loss": 1.0062,
"step": 914
},
{
"epoch": 0.37,
"grad_norm": 3.14114967263293,
"learning_rate": 1.9273265020051863e-06,
"loss": 0.9244,
"step": 915
},
{
"epoch": 0.37,
"grad_norm": 3.321088235451772,
"learning_rate": 1.9271660141745143e-06,
"loss": 0.9857,
"step": 916
},
{
"epoch": 0.37,
"grad_norm": 3.2357646656990102,
"learning_rate": 1.9270053560302567e-06,
"loss": 0.9502,
"step": 917
},
{
"epoch": 0.37,
"grad_norm": 6.855541490466651,
"learning_rate": 1.9268445276019242e-06,
"loss": 0.9679,
"step": 918
},
{
"epoch": 0.38,
"grad_norm": 3.6492067839484377,
"learning_rate": 1.9266835289190605e-06,
"loss": 0.915,
"step": 919
},
{
"epoch": 0.38,
"grad_norm": 6.612091011366016,
"learning_rate": 1.92652236001124e-06,
"loss": 0.912,
"step": 920
},
{
"epoch": 0.38,
"grad_norm": 3.248715624846823,
"learning_rate": 1.926361020908068e-06,
"loss": 0.9022,
"step": 921
},
{
"epoch": 0.38,
"grad_norm": 3.7385934321696204,
"learning_rate": 1.9261995116391807e-06,
"loss": 0.8866,
"step": 922
},
{
"epoch": 0.38,
"grad_norm": 3.517319371592901,
"learning_rate": 1.9260378322342473e-06,
"loss": 0.927,
"step": 923
},
{
"epoch": 0.38,
"grad_norm": 2.549495509282307,
"learning_rate": 1.9258759827229663e-06,
"loss": 0.9734,
"step": 924
},
{
"epoch": 0.38,
"grad_norm": 3.7912610046997886,
"learning_rate": 1.9257139631350687e-06,
"loss": 0.9292,
"step": 925
},
{
"epoch": 0.38,
"grad_norm": 2.9371459203537587,
"learning_rate": 1.9255517735003162e-06,
"loss": 0.8713,
"step": 926
},
{
"epoch": 0.38,
"grad_norm": 3.854597742789501,
"learning_rate": 1.9253894138485015e-06,
"loss": 0.9024,
"step": 927
},
{
"epoch": 0.38,
"grad_norm": 4.989685192445011,
"learning_rate": 1.9252268842094496e-06,
"loss": 0.9001,
"step": 928
},
{
"epoch": 0.38,
"grad_norm": 2.8234716458954416,
"learning_rate": 1.925064184613015e-06,
"loss": 0.9009,
"step": 929
},
{
"epoch": 0.38,
"grad_norm": 3.4166471393382047,
"learning_rate": 1.9249013150890856e-06,
"loss": 0.9484,
"step": 930
},
{
"epoch": 0.38,
"grad_norm": 3.6325200079957938,
"learning_rate": 1.9247382756675784e-06,
"loss": 0.8903,
"step": 931
},
{
"epoch": 0.38,
"grad_norm": 3.3192765390796235,
"learning_rate": 1.924575066378443e-06,
"loss": 0.894,
"step": 932
},
{
"epoch": 0.38,
"grad_norm": 2.9187997405886508,
"learning_rate": 1.9244116872516597e-06,
"loss": 0.8588,
"step": 933
},
{
"epoch": 0.38,
"grad_norm": 3.0885368470597587,
"learning_rate": 1.9242481383172397e-06,
"loss": 0.8615,
"step": 934
},
{
"epoch": 0.38,
"grad_norm": 3.3850973987834396,
"learning_rate": 1.924084419605226e-06,
"loss": 0.9607,
"step": 935
},
{
"epoch": 0.38,
"grad_norm": 2.7162998022704574,
"learning_rate": 1.9239205311456926e-06,
"loss": 0.9252,
"step": 936
},
{
"epoch": 0.38,
"grad_norm": 2.6776621730178274,
"learning_rate": 1.9237564729687446e-06,
"loss": 0.8775,
"step": 937
},
{
"epoch": 0.38,
"grad_norm": 3.630047333035831,
"learning_rate": 1.923592245104518e-06,
"loss": 0.9609,
"step": 938
},
{
"epoch": 0.38,
"grad_norm": 3.0043786965666306,
"learning_rate": 1.923427847583181e-06,
"loss": 1.0202,
"step": 939
},
{
"epoch": 0.38,
"grad_norm": 2.9518431652279555,
"learning_rate": 1.923263280434931e-06,
"loss": 0.8833,
"step": 940
},
{
"epoch": 0.38,
"grad_norm": 4.091474473608721,
"learning_rate": 1.923098543689999e-06,
"loss": 0.894,
"step": 941
},
{
"epoch": 0.38,
"grad_norm": 3.6897954427978155,
"learning_rate": 1.9229336373786455e-06,
"loss": 0.9746,
"step": 942
},
{
"epoch": 0.38,
"grad_norm": 3.3754204980643245,
"learning_rate": 1.922768561531162e-06,
"loss": 0.9452,
"step": 943
},
{
"epoch": 0.39,
"grad_norm": 2.622953699325627,
"learning_rate": 1.9226033161778725e-06,
"loss": 0.888,
"step": 944
},
{
"epoch": 0.39,
"grad_norm": 3.3173137869337355,
"learning_rate": 1.9224379013491306e-06,
"loss": 0.8964,
"step": 945
},
{
"epoch": 0.39,
"grad_norm": 2.7664488456233496,
"learning_rate": 1.922272317075323e-06,
"loss": 0.9136,
"step": 946
},
{
"epoch": 0.39,
"grad_norm": 3.4176895028963705,
"learning_rate": 1.922106563386865e-06,
"loss": 0.9136,
"step": 947
},
{
"epoch": 0.39,
"grad_norm": 2.6602210946725915,
"learning_rate": 1.921940640314205e-06,
"loss": 0.8663,
"step": 948
},
{
"epoch": 0.39,
"grad_norm": 2.9348410109114367,
"learning_rate": 1.9217745478878215e-06,
"loss": 0.9152,
"step": 949
},
{
"epoch": 0.39,
"grad_norm": 3.2338188070886575,
"learning_rate": 1.921608286138225e-06,
"loss": 0.9791,
"step": 950
},
{
"epoch": 0.39,
"grad_norm": 2.8201414603777395,
"learning_rate": 1.921441855095956e-06,
"loss": 0.9389,
"step": 951
},
{
"epoch": 0.39,
"grad_norm": 3.2045043858024598,
"learning_rate": 1.921275254791587e-06,
"loss": 0.9129,
"step": 952
},
{
"epoch": 0.39,
"grad_norm": 2.482738640747284,
"learning_rate": 1.9211084852557217e-06,
"loss": 0.9994,
"step": 953
},
{
"epoch": 0.39,
"grad_norm": 3.645650026317626,
"learning_rate": 1.920941546518993e-06,
"loss": 0.8989,
"step": 954
},
{
"epoch": 0.39,
"grad_norm": 3.505429562955243,
"learning_rate": 1.920774438612068e-06,
"loss": 0.9566,
"step": 955
},
{
"epoch": 0.39,
"grad_norm": 2.842628069223628,
"learning_rate": 1.920607161565642e-06,
"loss": 0.92,
"step": 956
},
{
"epoch": 0.39,
"grad_norm": 2.792132240926614,
"learning_rate": 1.920439715410443e-06,
"loss": 0.9624,
"step": 957
},
{
"epoch": 0.39,
"grad_norm": 3.618495229025687,
"learning_rate": 1.92027210017723e-06,
"loss": 0.8808,
"step": 958
},
{
"epoch": 0.39,
"grad_norm": 3.8518488623753737,
"learning_rate": 1.9201043158967916e-06,
"loss": 0.9256,
"step": 959
},
{
"epoch": 0.39,
"grad_norm": 2.8858068744791097,
"learning_rate": 1.9199363625999496e-06,
"loss": 0.8933,
"step": 960
},
{
"epoch": 0.39,
"grad_norm": 3.394723364001795,
"learning_rate": 1.919768240317556e-06,
"loss": 0.8903,
"step": 961
},
{
"epoch": 0.39,
"grad_norm": 4.983661609969551,
"learning_rate": 1.919599949080492e-06,
"loss": 0.9398,
"step": 962
},
{
"epoch": 0.39,
"grad_norm": 2.929143999494264,
"learning_rate": 1.9194314889196733e-06,
"loss": 0.8892,
"step": 963
},
{
"epoch": 0.39,
"grad_norm": 2.9816143836510522,
"learning_rate": 1.919262859866044e-06,
"loss": 0.9103,
"step": 964
},
{
"epoch": 0.39,
"grad_norm": 2.7607057665211294,
"learning_rate": 1.9190940619505797e-06,
"loss": 0.9145,
"step": 965
},
{
"epoch": 0.39,
"grad_norm": 5.450121545216756,
"learning_rate": 1.918925095204288e-06,
"loss": 0.9596,
"step": 966
},
{
"epoch": 0.39,
"grad_norm": 3.3677579672995637,
"learning_rate": 1.918755959658206e-06,
"loss": 0.9285,
"step": 967
},
{
"epoch": 0.4,
"grad_norm": 4.196260228262026,
"learning_rate": 1.9185866553434034e-06,
"loss": 0.9173,
"step": 968
},
{
"epoch": 0.4,
"grad_norm": 2.9749579190835673,
"learning_rate": 1.9184171822909804e-06,
"loss": 0.8911,
"step": 969
},
{
"epoch": 0.4,
"grad_norm": 4.239193718401052,
"learning_rate": 1.918247540532067e-06,
"loss": 0.9686,
"step": 970
},
{
"epoch": 0.4,
"grad_norm": 2.781506906120271,
"learning_rate": 1.9180777300978253e-06,
"loss": 0.843,
"step": 971
},
{
"epoch": 0.4,
"grad_norm": 2.5439441483396443,
"learning_rate": 1.9179077510194493e-06,
"loss": 0.8756,
"step": 972
},
{
"epoch": 0.4,
"grad_norm": 2.9528980464509016,
"learning_rate": 1.9177376033281617e-06,
"loss": 0.9418,
"step": 973
},
{
"epoch": 0.4,
"grad_norm": 3.3667270539808345,
"learning_rate": 1.917567287055218e-06,
"loss": 1.0181,
"step": 974
},
{
"epoch": 0.4,
"grad_norm": 3.025599183256814,
"learning_rate": 1.917396802231904e-06,
"loss": 0.8389,
"step": 975
},
{
"epoch": 0.4,
"grad_norm": 2.8341182537131275,
"learning_rate": 1.9172261488895367e-06,
"loss": 0.9807,
"step": 976
},
{
"epoch": 0.4,
"grad_norm": 2.867237428370912,
"learning_rate": 1.917055327059463e-06,
"loss": 0.9594,
"step": 977
},
{
"epoch": 0.4,
"grad_norm": 2.9376874063525715,
"learning_rate": 1.9168843367730626e-06,
"loss": 0.8714,
"step": 978
},
{
"epoch": 0.4,
"grad_norm": 4.3623314150746095,
"learning_rate": 1.9167131780617444e-06,
"loss": 0.9261,
"step": 979
},
{
"epoch": 0.4,
"grad_norm": 3.456965060488134,
"learning_rate": 1.9165418509569496e-06,
"loss": 0.979,
"step": 980
},
{
"epoch": 0.4,
"grad_norm": 3.757511668867802,
"learning_rate": 1.9163703554901493e-06,
"loss": 0.9242,
"step": 981
},
{
"epoch": 0.4,
"grad_norm": 3.406929988907451,
"learning_rate": 1.9161986916928463e-06,
"loss": 0.8939,
"step": 982
},
{
"epoch": 0.4,
"grad_norm": 2.713513159737217,
"learning_rate": 1.916026859596574e-06,
"loss": 0.8988,
"step": 983
},
{
"epoch": 0.4,
"grad_norm": 2.5712207147206443,
"learning_rate": 1.9158548592328964e-06,
"loss": 0.9096,
"step": 984
},
{
"epoch": 0.4,
"grad_norm": 2.953711729679448,
"learning_rate": 1.9156826906334085e-06,
"loss": 0.8955,
"step": 985
},
{
"epoch": 0.4,
"grad_norm": 3.224878691567116,
"learning_rate": 1.915510353829737e-06,
"loss": 0.9047,
"step": 986
},
{
"epoch": 0.4,
"grad_norm": 3.590251436063003,
"learning_rate": 1.9153378488535383e-06,
"loss": 0.9528,
"step": 987
},
{
"epoch": 0.4,
"grad_norm": 3.3719815050137343,
"learning_rate": 1.915165175736501e-06,
"loss": 0.8773,
"step": 988
},
{
"epoch": 0.4,
"grad_norm": 4.54614469031698,
"learning_rate": 1.9149923345103435e-06,
"loss": 0.9501,
"step": 989
},
{
"epoch": 0.4,
"grad_norm": 2.966903052500312,
"learning_rate": 1.9148193252068154e-06,
"loss": 0.9252,
"step": 990
},
{
"epoch": 0.4,
"grad_norm": 2.8675717360064343,
"learning_rate": 1.914646147857697e-06,
"loss": 0.8739,
"step": 991
},
{
"epoch": 0.4,
"grad_norm": 4.05539370162813,
"learning_rate": 1.9144728024948e-06,
"loss": 0.9635,
"step": 992
},
{
"epoch": 0.41,
"grad_norm": 3.2756152501823066,
"learning_rate": 1.914299289149967e-06,
"loss": 0.9078,
"step": 993
},
{
"epoch": 0.41,
"grad_norm": 3.039881024995141,
"learning_rate": 1.914125607855071e-06,
"loss": 0.9319,
"step": 994
},
{
"epoch": 0.41,
"grad_norm": 4.373964536728664,
"learning_rate": 1.9139517586420157e-06,
"loss": 0.9121,
"step": 995
},
{
"epoch": 0.41,
"grad_norm": 3.0252806311084797,
"learning_rate": 1.9137777415427358e-06,
"loss": 0.9727,
"step": 996
},
{
"epoch": 0.41,
"grad_norm": 2.7873332620342226,
"learning_rate": 1.9136035565891974e-06,
"loss": 0.8987,
"step": 997
},
{
"epoch": 0.41,
"grad_norm": 4.74181750054867,
"learning_rate": 1.9134292038133968e-06,
"loss": 0.912,
"step": 998
},
{
"epoch": 0.41,
"grad_norm": 3.3383467960632167,
"learning_rate": 1.913254683247361e-06,
"loss": 0.8799,
"step": 999
},
{
"epoch": 0.41,
"grad_norm": 7.839335164175711,
"learning_rate": 1.9130799949231486e-06,
"loss": 0.8402,
"step": 1000
},
{
"epoch": 0.41,
"eval_loss": 0.9139338731765747,
"eval_runtime": 466.1318,
"eval_samples_per_second": 74.769,
"eval_steps_per_second": 4.675,
"step": 1000
},
{
"epoch": 0.41,
"grad_norm": 4.707170811411979,
"learning_rate": 1.912905138872848e-06,
"loss": 0.8765,
"step": 1001
},
{
"epoch": 0.41,
"grad_norm": 3.3311625933622917,
"learning_rate": 1.91273011512858e-06,
"loss": 0.9264,
"step": 1002
},
{
"epoch": 0.41,
"grad_norm": 2.8917458071712416,
"learning_rate": 1.9125549237224943e-06,
"loss": 0.8952,
"step": 1003
},
{
"epoch": 0.41,
"grad_norm": 3.1681257041872786,
"learning_rate": 1.9123795646867727e-06,
"loss": 0.8843,
"step": 1004
},
{
"epoch": 0.41,
"grad_norm": 2.657750835716479,
"learning_rate": 1.912204038053627e-06,
"loss": 0.9023,
"step": 1005
},
{
"epoch": 0.41,
"grad_norm": 2.7176886982845514,
"learning_rate": 1.9120283438553005e-06,
"loss": 0.8614,
"step": 1006
},
{
"epoch": 0.41,
"grad_norm": 5.675321116301775,
"learning_rate": 1.9118524821240667e-06,
"loss": 0.8992,
"step": 1007
},
{
"epoch": 0.41,
"grad_norm": 2.8451326640513273,
"learning_rate": 1.91167645289223e-06,
"loss": 0.9287,
"step": 1008
},
{
"epoch": 0.41,
"grad_norm": 4.653562274465996,
"learning_rate": 1.911500256192126e-06,
"loss": 0.9779,
"step": 1009
},
{
"epoch": 0.41,
"grad_norm": 3.0675291782145573,
"learning_rate": 1.9113238920561207e-06,
"loss": 0.8649,
"step": 1010
},
{
"epoch": 0.41,
"grad_norm": 3.1106367768273477,
"learning_rate": 1.9111473605166105e-06,
"loss": 0.9061,
"step": 1011
},
{
"epoch": 0.41,
"grad_norm": 2.7919571447987908,
"learning_rate": 1.9109706616060235e-06,
"loss": 0.8898,
"step": 1012
},
{
"epoch": 0.41,
"grad_norm": 3.247779397597812,
"learning_rate": 1.9107937953568177e-06,
"loss": 0.9163,
"step": 1013
},
{
"epoch": 0.41,
"grad_norm": 5.293176683036312,
"learning_rate": 1.910616761801482e-06,
"loss": 0.8612,
"step": 1014
},
{
"epoch": 0.41,
"grad_norm": 4.889068349917519,
"learning_rate": 1.910439560972537e-06,
"loss": 0.8996,
"step": 1015
},
{
"epoch": 0.41,
"grad_norm": 4.012528596270623,
"learning_rate": 1.910262192902532e-06,
"loss": 0.8821,
"step": 1016
},
{
"epoch": 0.42,
"grad_norm": 3.2588975836276273,
"learning_rate": 1.910084657624049e-06,
"loss": 0.8688,
"step": 1017
},
{
"epoch": 0.42,
"grad_norm": 4.584604500421807,
"learning_rate": 1.9099069551696993e-06,
"loss": 0.895,
"step": 1018
},
{
"epoch": 0.42,
"grad_norm": 3.6713293039050234,
"learning_rate": 1.909729085572126e-06,
"loss": 0.8905,
"step": 1019
},
{
"epoch": 0.42,
"grad_norm": 2.9675043111954893,
"learning_rate": 1.909551048864002e-06,
"loss": 0.9713,
"step": 1020
},
{
"epoch": 0.42,
"grad_norm": 3.213038033679988,
"learning_rate": 1.9093728450780324e-06,
"loss": 0.9662,
"step": 1021
},
{
"epoch": 0.42,
"grad_norm": 3.122072555177349,
"learning_rate": 1.9091944742469507e-06,
"loss": 0.8947,
"step": 1022
},
{
"epoch": 0.42,
"grad_norm": 2.8204430647286474,
"learning_rate": 1.909015936403523e-06,
"loss": 0.8855,
"step": 1023
},
{
"epoch": 0.42,
"grad_norm": 3.0290815933832436,
"learning_rate": 1.9088372315805453e-06,
"loss": 0.9015,
"step": 1024
},
{
"epoch": 0.42,
"grad_norm": 3.197497176606551,
"learning_rate": 1.908658359810844e-06,
"loss": 0.963,
"step": 1025
},
{
"epoch": 0.42,
"grad_norm": 3.0323028442321536,
"learning_rate": 1.908479321127277e-06,
"loss": 0.9818,
"step": 1026
},
{
"epoch": 0.42,
"grad_norm": 3.484120743961505,
"learning_rate": 1.9083001155627324e-06,
"loss": 0.9521,
"step": 1027
},
{
"epoch": 0.42,
"grad_norm": 2.7165197981936213,
"learning_rate": 1.908120743150128e-06,
"loss": 0.9287,
"step": 1028
},
{
"epoch": 0.42,
"grad_norm": 3.4712237824852368,
"learning_rate": 1.9079412039224147e-06,
"loss": 0.9315,
"step": 1029
},
{
"epoch": 0.42,
"grad_norm": 2.8698457791330614,
"learning_rate": 1.9077614979125714e-06,
"loss": 0.9157,
"step": 1030
},
{
"epoch": 0.42,
"grad_norm": 2.9065549167181506,
"learning_rate": 1.9075816251536093e-06,
"loss": 0.9447,
"step": 1031
},
{
"epoch": 0.42,
"grad_norm": 3.7539378286144283,
"learning_rate": 1.9074015856785694e-06,
"loss": 0.9018,
"step": 1032
},
{
"epoch": 0.42,
"grad_norm": 3.2026294486723326,
"learning_rate": 1.9072213795205238e-06,
"loss": 0.9152,
"step": 1033
},
{
"epoch": 0.42,
"grad_norm": 3.023446299157657,
"learning_rate": 1.9070410067125754e-06,
"loss": 0.9554,
"step": 1034
},
{
"epoch": 0.42,
"grad_norm": 2.9560507160552643,
"learning_rate": 1.9068604672878563e-06,
"loss": 0.9406,
"step": 1035
},
{
"epoch": 0.42,
"grad_norm": 2.6132397921327835,
"learning_rate": 1.9066797612795313e-06,
"loss": 0.9544,
"step": 1036
},
{
"epoch": 0.42,
"grad_norm": 2.7456526655613582,
"learning_rate": 1.9064988887207942e-06,
"loss": 0.9991,
"step": 1037
},
{
"epoch": 0.42,
"grad_norm": 3.788792649482802,
"learning_rate": 1.9063178496448704e-06,
"loss": 0.8638,
"step": 1038
},
{
"epoch": 0.42,
"grad_norm": 2.912097038137617,
"learning_rate": 1.9061366440850152e-06,
"loss": 0.9289,
"step": 1039
},
{
"epoch": 0.42,
"grad_norm": 2.5729584972700428,
"learning_rate": 1.905955272074515e-06,
"loss": 0.9323,
"step": 1040
},
{
"epoch": 0.42,
"grad_norm": 2.9946835326986747,
"learning_rate": 1.9057737336466859e-06,
"loss": 0.8759,
"step": 1041
},
{
"epoch": 0.43,
"grad_norm": 2.877719441354195,
"learning_rate": 1.9055920288348754e-06,
"loss": 0.9217,
"step": 1042
},
{
"epoch": 0.43,
"grad_norm": 3.2547603544509207,
"learning_rate": 1.9054101576724617e-06,
"loss": 0.8849,
"step": 1043
},
{
"epoch": 0.43,
"grad_norm": 2.7937355766605387,
"learning_rate": 1.905228120192853e-06,
"loss": 0.9208,
"step": 1044
},
{
"epoch": 0.43,
"grad_norm": 3.144262796380569,
"learning_rate": 1.9050459164294881e-06,
"loss": 0.8688,
"step": 1045
},
{
"epoch": 0.43,
"grad_norm": 3.3519230845207066,
"learning_rate": 1.9048635464158367e-06,
"loss": 0.9273,
"step": 1046
},
{
"epoch": 0.43,
"grad_norm": 2.971770018674094,
"learning_rate": 1.9046810101853987e-06,
"loss": 0.9291,
"step": 1047
},
{
"epoch": 0.43,
"grad_norm": 3.0319572896048252,
"learning_rate": 1.9044983077717047e-06,
"loss": 0.9496,
"step": 1048
},
{
"epoch": 0.43,
"grad_norm": 3.6337634461758017,
"learning_rate": 1.9043154392083158e-06,
"loss": 0.8508,
"step": 1049
},
{
"epoch": 0.43,
"grad_norm": 3.3635627437320825,
"learning_rate": 1.9041324045288236e-06,
"loss": 0.8558,
"step": 1050
},
{
"epoch": 0.43,
"grad_norm": 4.660947515911968,
"learning_rate": 1.9039492037668502e-06,
"loss": 0.9492,
"step": 1051
},
{
"epoch": 0.43,
"grad_norm": 3.2286213565825936,
"learning_rate": 1.9037658369560482e-06,
"loss": 0.883,
"step": 1052
},
{
"epoch": 0.43,
"grad_norm": 2.895876319363955,
"learning_rate": 1.9035823041301011e-06,
"loss": 0.8551,
"step": 1053
},
{
"epoch": 0.43,
"grad_norm": 4.445415033886705,
"learning_rate": 1.903398605322722e-06,
"loss": 0.9724,
"step": 1054
},
{
"epoch": 0.43,
"grad_norm": 3.248570828538869,
"learning_rate": 1.9032147405676554e-06,
"loss": 0.9024,
"step": 1055
},
{
"epoch": 0.43,
"grad_norm": 2.8774839631010747,
"learning_rate": 1.9030307098986756e-06,
"loss": 0.874,
"step": 1056
},
{
"epoch": 0.43,
"grad_norm": 3.8385251877196933,
"learning_rate": 1.902846513349588e-06,
"loss": 0.8507,
"step": 1057
},
{
"epoch": 0.43,
"grad_norm": 3.075701230929701,
"learning_rate": 1.902662150954228e-06,
"loss": 0.9415,
"step": 1058
},
{
"epoch": 0.43,
"grad_norm": 3.204567519998842,
"learning_rate": 1.9024776227464615e-06,
"loss": 1.0034,
"step": 1059
},
{
"epoch": 0.43,
"grad_norm": 4.027437175804035,
"learning_rate": 1.902292928760185e-06,
"loss": 0.8843,
"step": 1060
},
{
"epoch": 0.43,
"grad_norm": 2.816256530063764,
"learning_rate": 1.9021080690293257e-06,
"loss": 0.8806,
"step": 1061
},
{
"epoch": 0.43,
"grad_norm": 3.201122023952863,
"learning_rate": 1.9019230435878409e-06,
"loss": 0.9297,
"step": 1062
},
{
"epoch": 0.43,
"grad_norm": 3.7621970933229885,
"learning_rate": 1.9017378524697181e-06,
"loss": 0.9875,
"step": 1063
},
{
"epoch": 0.43,
"grad_norm": 3.5040508932823906,
"learning_rate": 1.9015524957089758e-06,
"loss": 0.9041,
"step": 1064
},
{
"epoch": 0.43,
"grad_norm": 2.6717157094628377,
"learning_rate": 1.9013669733396624e-06,
"loss": 0.983,
"step": 1065
},
{
"epoch": 0.44,
"grad_norm": 3.197129385314839,
"learning_rate": 1.9011812853958575e-06,
"loss": 0.8594,
"step": 1066
},
{
"epoch": 0.44,
"grad_norm": 3.651245733334084,
"learning_rate": 1.9009954319116704e-06,
"loss": 0.9036,
"step": 1067
},
{
"epoch": 0.44,
"grad_norm": 2.8017548588980743,
"learning_rate": 1.9008094129212408e-06,
"loss": 0.9647,
"step": 1068
},
{
"epoch": 0.44,
"grad_norm": 2.4536788134574476,
"learning_rate": 1.9006232284587393e-06,
"loss": 0.9155,
"step": 1069
},
{
"epoch": 0.44,
"grad_norm": 2.856621276088278,
"learning_rate": 1.9004368785583664e-06,
"loss": 0.9135,
"step": 1070
},
{
"epoch": 0.44,
"grad_norm": 2.53808061693718,
"learning_rate": 1.9002503632543535e-06,
"loss": 0.8654,
"step": 1071
},
{
"epoch": 0.44,
"grad_norm": 2.76977064516295,
"learning_rate": 1.9000636825809617e-06,
"loss": 0.8794,
"step": 1072
},
{
"epoch": 0.44,
"grad_norm": 2.686037367779306,
"learning_rate": 1.8998768365724832e-06,
"loss": 0.9261,
"step": 1073
},
{
"epoch": 0.44,
"grad_norm": 4.732014653845707,
"learning_rate": 1.8996898252632399e-06,
"loss": 0.8743,
"step": 1074
},
{
"epoch": 0.44,
"grad_norm": 2.7114756492727765,
"learning_rate": 1.899502648687585e-06,
"loss": 0.9701,
"step": 1075
},
{
"epoch": 0.44,
"grad_norm": 2.5210500028391096,
"learning_rate": 1.8993153068799006e-06,
"loss": 0.9343,
"step": 1076
},
{
"epoch": 0.44,
"grad_norm": 2.7531494382103827,
"learning_rate": 1.8991277998746008e-06,
"loss": 0.9777,
"step": 1077
},
{
"epoch": 0.44,
"grad_norm": 3.111197852122283,
"learning_rate": 1.8989401277061287e-06,
"loss": 0.9384,
"step": 1078
},
{
"epoch": 0.44,
"grad_norm": 3.206598113988706,
"learning_rate": 1.8987522904089588e-06,
"loss": 0.8751,
"step": 1079
},
{
"epoch": 0.44,
"grad_norm": 3.0027679677924497,
"learning_rate": 1.8985642880175952e-06,
"loss": 0.9812,
"step": 1080
},
{
"epoch": 0.44,
"grad_norm": 3.589783667407417,
"learning_rate": 1.8983761205665724e-06,
"loss": 0.9102,
"step": 1081
},
{
"epoch": 0.44,
"grad_norm": 2.873221176427272,
"learning_rate": 1.8981877880904556e-06,
"loss": 0.918,
"step": 1082
},
{
"epoch": 0.44,
"grad_norm": 2.7526933708393577,
"learning_rate": 1.8979992906238398e-06,
"loss": 0.9102,
"step": 1083
},
{
"epoch": 0.44,
"grad_norm": 3.3181844638790468,
"learning_rate": 1.897810628201351e-06,
"loss": 0.8904,
"step": 1084
},
{
"epoch": 0.44,
"grad_norm": 4.043963388739977,
"learning_rate": 1.8976218008576447e-06,
"loss": 0.9315,
"step": 1085
},
{
"epoch": 0.44,
"grad_norm": 3.3598369194689486,
"learning_rate": 1.8974328086274073e-06,
"loss": 0.9902,
"step": 1086
},
{
"epoch": 0.44,
"grad_norm": 3.413796680882389,
"learning_rate": 1.8972436515453559e-06,
"loss": 0.832,
"step": 1087
},
{
"epoch": 0.44,
"grad_norm": 3.3552055089114012,
"learning_rate": 1.8970543296462358e-06,
"loss": 0.8665,
"step": 1088
},
{
"epoch": 0.44,
"grad_norm": 2.799479440652192,
"learning_rate": 1.8968648429648253e-06,
"loss": 0.9524,
"step": 1089
},
{
"epoch": 0.44,
"grad_norm": 2.726128048741973,
"learning_rate": 1.8966751915359316e-06,
"loss": 0.8824,
"step": 1090
},
{
"epoch": 0.45,
"grad_norm": 3.2408728784425826,
"learning_rate": 1.8964853753943916e-06,
"loss": 0.8571,
"step": 1091
},
{
"epoch": 0.45,
"grad_norm": 2.8504620163744283,
"learning_rate": 1.8962953945750735e-06,
"loss": 0.9837,
"step": 1092
},
{
"epoch": 0.45,
"grad_norm": 2.887921934322234,
"learning_rate": 1.8961052491128754e-06,
"loss": 0.8957,
"step": 1093
},
{
"epoch": 0.45,
"grad_norm": 2.648124250869718,
"learning_rate": 1.8959149390427258e-06,
"loss": 0.8848,
"step": 1094
},
{
"epoch": 0.45,
"grad_norm": 2.848347459981718,
"learning_rate": 1.895724464399583e-06,
"loss": 0.903,
"step": 1095
},
{
"epoch": 0.45,
"grad_norm": 3.937606905474955,
"learning_rate": 1.895533825218436e-06,
"loss": 0.8718,
"step": 1096
},
{
"epoch": 0.45,
"grad_norm": 2.9957802954943964,
"learning_rate": 1.8953430215343033e-06,
"loss": 0.9543,
"step": 1097
},
{
"epoch": 0.45,
"grad_norm": 3.607196661053565,
"learning_rate": 1.895152053382235e-06,
"loss": 0.9578,
"step": 1098
},
{
"epoch": 0.45,
"grad_norm": 2.8031296050511987,
"learning_rate": 1.8949609207973095e-06,
"loss": 0.9054,
"step": 1099
},
{
"epoch": 0.45,
"grad_norm": 2.62550505818335,
"learning_rate": 1.8947696238146376e-06,
"loss": 0.8852,
"step": 1100
},
{
"epoch": 0.45,
"eval_loss": 0.9109283685684204,
"eval_runtime": 465.5176,
"eval_samples_per_second": 74.867,
"eval_steps_per_second": 4.681,
"step": 1100
},
{
"epoch": 0.45,
"grad_norm": 2.5360567195929242,
"learning_rate": 1.8945781624693583e-06,
"loss": 0.8568,
"step": 1101
},
{
"epoch": 0.45,
"grad_norm": 3.123040109057781,
"learning_rate": 1.894386536796642e-06,
"loss": 0.9148,
"step": 1102
},
{
"epoch": 0.45,
"grad_norm": 3.4573551133304457,
"learning_rate": 1.8941947468316887e-06,
"loss": 0.8934,
"step": 1103
},
{
"epoch": 0.45,
"grad_norm": 3.782965150094567,
"learning_rate": 1.8940027926097292e-06,
"loss": 0.906,
"step": 1104
},
{
"epoch": 0.45,
"grad_norm": 3.915733566091718,
"learning_rate": 1.8938106741660235e-06,
"loss": 0.9246,
"step": 1105
},
{
"epoch": 0.45,
"grad_norm": 2.5478528547314303,
"learning_rate": 1.893618391535863e-06,
"loss": 0.9171,
"step": 1106
},
{
"epoch": 0.45,
"grad_norm": 3.228345883359015,
"learning_rate": 1.8934259447545681e-06,
"loss": 0.9427,
"step": 1107
},
{
"epoch": 0.45,
"grad_norm": 2.733941655531938,
"learning_rate": 1.8932333338574902e-06,
"loss": 0.9176,
"step": 1108
},
{
"epoch": 0.45,
"grad_norm": 3.8360539534664624,
"learning_rate": 1.8930405588800101e-06,
"loss": 0.9376,
"step": 1109
},
{
"epoch": 0.45,
"grad_norm": 3.444288083011698,
"learning_rate": 1.89284761985754e-06,
"loss": 0.9269,
"step": 1110
},
{
"epoch": 0.45,
"grad_norm": 3.698928359724756,
"learning_rate": 1.89265451682552e-06,
"loss": 0.828,
"step": 1111
},
{
"epoch": 0.45,
"grad_norm": 4.258147229028701,
"learning_rate": 1.892461249819423e-06,
"loss": 0.887,
"step": 1112
},
{
"epoch": 0.45,
"grad_norm": 4.419488289759872,
"learning_rate": 1.89226781887475e-06,
"loss": 0.8416,
"step": 1113
},
{
"epoch": 0.45,
"grad_norm": 3.4376897931845534,
"learning_rate": 1.8920742240270332e-06,
"loss": 0.8519,
"step": 1114
},
{
"epoch": 0.46,
"grad_norm": 2.836643117779229,
"learning_rate": 1.8918804653118343e-06,
"loss": 0.9544,
"step": 1115
},
{
"epoch": 0.46,
"grad_norm": 3.0165295130871486,
"learning_rate": 1.8916865427647455e-06,
"loss": 0.8864,
"step": 1116
},
{
"epoch": 0.46,
"grad_norm": 3.680924891867032,
"learning_rate": 1.891492456421389e-06,
"loss": 0.8522,
"step": 1117
},
{
"epoch": 0.46,
"grad_norm": 3.536978809825786,
"learning_rate": 1.8912982063174168e-06,
"loss": 0.9606,
"step": 1118
},
{
"epoch": 0.46,
"grad_norm": 2.694748935204901,
"learning_rate": 1.8911037924885114e-06,
"loss": 0.8511,
"step": 1119
},
{
"epoch": 0.46,
"grad_norm": 2.8094950774296343,
"learning_rate": 1.8909092149703852e-06,
"loss": 0.9499,
"step": 1120
},
{
"epoch": 0.46,
"grad_norm": 2.8939722325346846,
"learning_rate": 1.8907144737987805e-06,
"loss": 0.9262,
"step": 1121
},
{
"epoch": 0.46,
"grad_norm": 3.8881300205346356,
"learning_rate": 1.8905195690094703e-06,
"loss": 0.8937,
"step": 1122
},
{
"epoch": 0.46,
"grad_norm": 2.8699911404076923,
"learning_rate": 1.8903245006382568e-06,
"loss": 0.8938,
"step": 1123
},
{
"epoch": 0.46,
"grad_norm": 2.523131032152541,
"learning_rate": 1.8901292687209722e-06,
"loss": 0.9074,
"step": 1124
},
{
"epoch": 0.46,
"grad_norm": 3.102693326622357,
"learning_rate": 1.8899338732934798e-06,
"loss": 0.9051,
"step": 1125
},
{
"epoch": 0.46,
"grad_norm": 3.349779363691837,
"learning_rate": 1.8897383143916722e-06,
"loss": 0.937,
"step": 1126
},
{
"epoch": 0.46,
"grad_norm": 4.932588959205853,
"learning_rate": 1.8895425920514723e-06,
"loss": 1.0292,
"step": 1127
},
{
"epoch": 0.46,
"grad_norm": 2.7030640012052136,
"learning_rate": 1.8893467063088321e-06,
"loss": 0.9005,
"step": 1128
},
{
"epoch": 0.46,
"grad_norm": 3.003613958718408,
"learning_rate": 1.8891506571997353e-06,
"loss": 0.9401,
"step": 1129
},
{
"epoch": 0.46,
"grad_norm": 2.6024068871485486,
"learning_rate": 1.888954444760194e-06,
"loss": 0.9728,
"step": 1130
},
{
"epoch": 0.46,
"grad_norm": 2.7445025878874336,
"learning_rate": 1.8887580690262515e-06,
"loss": 0.9186,
"step": 1131
},
{
"epoch": 0.46,
"grad_norm": 3.4038891560022884,
"learning_rate": 1.88856153003398e-06,
"loss": 0.9234,
"step": 1132
},
{
"epoch": 0.46,
"grad_norm": 2.536865175167684,
"learning_rate": 1.8883648278194828e-06,
"loss": 0.9216,
"step": 1133
},
{
"epoch": 0.46,
"grad_norm": 3.0151595675434644,
"learning_rate": 1.8881679624188928e-06,
"loss": 0.8872,
"step": 1134
},
{
"epoch": 0.46,
"grad_norm": 2.770394023298059,
"learning_rate": 1.8879709338683715e-06,
"loss": 0.9369,
"step": 1135
},
{
"epoch": 0.46,
"grad_norm": 2.500165908973051,
"learning_rate": 1.8877737422041133e-06,
"loss": 0.9323,
"step": 1136
},
{
"epoch": 0.46,
"grad_norm": 2.646372675346756,
"learning_rate": 1.8875763874623395e-06,
"loss": 0.9549,
"step": 1137
},
{
"epoch": 0.46,
"grad_norm": 2.798794878248987,
"learning_rate": 1.8873788696793035e-06,
"loss": 0.9152,
"step": 1138
},
{
"epoch": 0.46,
"grad_norm": 3.4229050399653556,
"learning_rate": 1.887181188891288e-06,
"loss": 0.9632,
"step": 1139
},
{
"epoch": 0.47,
"grad_norm": 2.8525144814026926,
"learning_rate": 1.8869833451346045e-06,
"loss": 0.9309,
"step": 1140
},
{
"epoch": 0.47,
"grad_norm": 2.5723755930708947,
"learning_rate": 1.8867853384455963e-06,
"loss": 0.9339,
"step": 1141
},
{
"epoch": 0.47,
"grad_norm": 3.0748062982544777,
"learning_rate": 1.8865871688606356e-06,
"loss": 0.8698,
"step": 1142
},
{
"epoch": 0.47,
"grad_norm": 2.765931480166499,
"learning_rate": 1.8863888364161244e-06,
"loss": 0.9257,
"step": 1143
},
{
"epoch": 0.47,
"grad_norm": 3.207326450703322,
"learning_rate": 1.8861903411484957e-06,
"loss": 0.8602,
"step": 1144
},
{
"epoch": 0.47,
"grad_norm": 3.2651859346561336,
"learning_rate": 1.8859916830942106e-06,
"loss": 0.8563,
"step": 1145
},
{
"epoch": 0.47,
"grad_norm": 2.8575602978021615,
"learning_rate": 1.8857928622897617e-06,
"loss": 0.9073,
"step": 1146
},
{
"epoch": 0.47,
"grad_norm": 3.0326756182110812,
"learning_rate": 1.8855938787716708e-06,
"loss": 0.9033,
"step": 1147
},
{
"epoch": 0.47,
"grad_norm": 2.9003552533263837,
"learning_rate": 1.88539473257649e-06,
"loss": 0.8726,
"step": 1148
},
{
"epoch": 0.47,
"grad_norm": 2.6468262349694607,
"learning_rate": 1.8851954237408004e-06,
"loss": 0.9451,
"step": 1149
},
{
"epoch": 0.47,
"grad_norm": 2.6029092124259154,
"learning_rate": 1.8849959523012137e-06,
"loss": 0.8832,
"step": 1150
},
{
"epoch": 0.47,
"grad_norm": 3.097295229608598,
"learning_rate": 1.884796318294372e-06,
"loss": 0.9083,
"step": 1151
},
{
"epoch": 0.47,
"grad_norm": 2.9124394715661523,
"learning_rate": 1.8845965217569462e-06,
"loss": 0.8576,
"step": 1152
},
{
"epoch": 0.47,
"grad_norm": 3.072563267420756,
"learning_rate": 1.8843965627256372e-06,
"loss": 0.9297,
"step": 1153
},
{
"epoch": 0.47,
"grad_norm": 2.7807351571611054,
"learning_rate": 1.884196441237176e-06,
"loss": 0.9676,
"step": 1154
},
{
"epoch": 0.47,
"grad_norm": 3.9728016742671532,
"learning_rate": 1.8839961573283238e-06,
"loss": 0.9413,
"step": 1155
},
{
"epoch": 0.47,
"grad_norm": 3.5882325825966697,
"learning_rate": 1.883795711035871e-06,
"loss": 0.9173,
"step": 1156
},
{
"epoch": 0.47,
"grad_norm": 3.409178088202683,
"learning_rate": 1.8835951023966387e-06,
"loss": 0.8855,
"step": 1157
},
{
"epoch": 0.47,
"grad_norm": 3.0929896307787406,
"learning_rate": 1.8833943314474764e-06,
"loss": 0.9541,
"step": 1158
},
{
"epoch": 0.47,
"grad_norm": 2.6483438724411412,
"learning_rate": 1.8831933982252646e-06,
"loss": 0.8599,
"step": 1159
},
{
"epoch": 0.47,
"grad_norm": 2.707346492533441,
"learning_rate": 1.8829923027669134e-06,
"loss": 0.8906,
"step": 1160
},
{
"epoch": 0.47,
"grad_norm": 3.412293842541458,
"learning_rate": 1.8827910451093626e-06,
"loss": 0.8582,
"step": 1161
},
{
"epoch": 0.47,
"grad_norm": 3.314410692473783,
"learning_rate": 1.8825896252895812e-06,
"loss": 0.9094,
"step": 1162
},
{
"epoch": 0.47,
"grad_norm": 2.955491452390502,
"learning_rate": 1.882388043344569e-06,
"loss": 0.8531,
"step": 1163
},
{
"epoch": 0.48,
"grad_norm": 2.8883714568811345,
"learning_rate": 1.8821862993113555e-06,
"loss": 0.9288,
"step": 1164
},
{
"epoch": 0.48,
"grad_norm": 2.9227052033777556,
"learning_rate": 1.8819843932269987e-06,
"loss": 0.9491,
"step": 1165
},
{
"epoch": 0.48,
"grad_norm": 3.0985704044559634,
"learning_rate": 1.8817823251285877e-06,
"loss": 0.8897,
"step": 1166
},
{
"epoch": 0.48,
"grad_norm": 3.5807983714515417,
"learning_rate": 1.8815800950532411e-06,
"loss": 0.9029,
"step": 1167
},
{
"epoch": 0.48,
"grad_norm": 3.249113746850553,
"learning_rate": 1.881377703038107e-06,
"loss": 0.9291,
"step": 1168
},
{
"epoch": 0.48,
"grad_norm": 3.1162567581957092,
"learning_rate": 1.8811751491203634e-06,
"loss": 0.8292,
"step": 1169
},
{
"epoch": 0.48,
"grad_norm": 3.6099381925157794,
"learning_rate": 1.8809724333372175e-06,
"loss": 0.8859,
"step": 1170
},
{
"epoch": 0.48,
"grad_norm": 3.5583805211541435,
"learning_rate": 1.880769555725907e-06,
"loss": 0.883,
"step": 1171
},
{
"epoch": 0.48,
"grad_norm": 2.855637237233463,
"learning_rate": 1.8805665163236993e-06,
"loss": 0.9056,
"step": 1172
},
{
"epoch": 0.48,
"grad_norm": 3.2112032827977752,
"learning_rate": 1.8803633151678909e-06,
"loss": 0.8811,
"step": 1173
},
{
"epoch": 0.48,
"grad_norm": 2.614147175459903,
"learning_rate": 1.8801599522958083e-06,
"loss": 0.8743,
"step": 1174
},
{
"epoch": 0.48,
"grad_norm": 3.3806519465527907,
"learning_rate": 1.879956427744808e-06,
"loss": 0.8938,
"step": 1175
},
{
"epoch": 0.48,
"grad_norm": 3.528343355156741,
"learning_rate": 1.8797527415522758e-06,
"loss": 0.9325,
"step": 1176
},
{
"epoch": 0.48,
"grad_norm": 3.8117658574089472,
"learning_rate": 1.8795488937556275e-06,
"loss": 0.9084,
"step": 1177
},
{
"epoch": 0.48,
"grad_norm": 3.1040029200520936,
"learning_rate": 1.8793448843923083e-06,
"loss": 0.8949,
"step": 1178
},
{
"epoch": 0.48,
"grad_norm": 2.703900537967034,
"learning_rate": 1.8791407134997934e-06,
"loss": 0.9571,
"step": 1179
},
{
"epoch": 0.48,
"grad_norm": 2.8106007580539103,
"learning_rate": 1.8789363811155875e-06,
"loss": 0.8908,
"step": 1180
},
{
"epoch": 0.48,
"grad_norm": 2.8053428249858223,
"learning_rate": 1.8787318872772246e-06,
"loss": 0.9979,
"step": 1181
},
{
"epoch": 0.48,
"grad_norm": 3.1173394926471536,
"learning_rate": 1.878527232022269e-06,
"loss": 0.8884,
"step": 1182
},
{
"epoch": 0.48,
"grad_norm": 3.884174957978604,
"learning_rate": 1.8783224153883146e-06,
"loss": 0.8772,
"step": 1183
},
{
"epoch": 0.48,
"grad_norm": 3.1927142746085844,
"learning_rate": 1.8781174374129842e-06,
"loss": 0.9864,
"step": 1184
},
{
"epoch": 0.48,
"grad_norm": 3.402985213055965,
"learning_rate": 1.8779122981339311e-06,
"loss": 0.8919,
"step": 1185
},
{
"epoch": 0.48,
"grad_norm": 3.3631082588606276,
"learning_rate": 1.8777069975888375e-06,
"loss": 0.8951,
"step": 1186
},
{
"epoch": 0.48,
"grad_norm": 3.0858478225011687,
"learning_rate": 1.8775015358154164e-06,
"loss": 0.8626,
"step": 1187
},
{
"epoch": 0.48,
"grad_norm": 3.5049914791513843,
"learning_rate": 1.8772959128514087e-06,
"loss": 0.8479,
"step": 1188
},
{
"epoch": 0.49,
"grad_norm": 3.1070756877201453,
"learning_rate": 1.8770901287345863e-06,
"loss": 0.9097,
"step": 1189
},
{
"epoch": 0.49,
"grad_norm": 2.969341224459655,
"learning_rate": 1.8768841835027501e-06,
"loss": 0.9177,
"step": 1190
},
{
"epoch": 0.49,
"grad_norm": 3.8132701216471423,
"learning_rate": 1.876678077193731e-06,
"loss": 0.8908,
"step": 1191
},
{
"epoch": 0.49,
"grad_norm": 2.7858856971886765,
"learning_rate": 1.876471809845389e-06,
"loss": 0.9485,
"step": 1192
},
{
"epoch": 0.49,
"grad_norm": 2.693970911333755,
"learning_rate": 1.8762653814956136e-06,
"loss": 0.8608,
"step": 1193
},
{
"epoch": 0.49,
"grad_norm": 2.7879631014582733,
"learning_rate": 1.8760587921823246e-06,
"loss": 0.9825,
"step": 1194
},
{
"epoch": 0.49,
"grad_norm": 2.5309021866488175,
"learning_rate": 1.8758520419434707e-06,
"loss": 0.8706,
"step": 1195
},
{
"epoch": 0.49,
"grad_norm": 2.831991106808424,
"learning_rate": 1.8756451308170305e-06,
"loss": 0.9084,
"step": 1196
},
{
"epoch": 0.49,
"grad_norm": 3.200285600916395,
"learning_rate": 1.8754380588410123e-06,
"loss": 0.8685,
"step": 1197
},
{
"epoch": 0.49,
"grad_norm": 2.737498031174991,
"learning_rate": 1.8752308260534534e-06,
"loss": 0.9582,
"step": 1198
},
{
"epoch": 0.49,
"grad_norm": 3.1347645496108414,
"learning_rate": 1.8750234324924208e-06,
"loss": 0.8626,
"step": 1199
},
{
"epoch": 0.49,
"grad_norm": 2.923046691979865,
"learning_rate": 1.8748158781960113e-06,
"loss": 0.8951,
"step": 1200
},
{
"epoch": 0.49,
"eval_loss": 0.908894419670105,
"eval_runtime": 464.6572,
"eval_samples_per_second": 75.006,
"eval_steps_per_second": 4.689,
"step": 1200
},
{
"epoch": 0.49,
"grad_norm": 2.8583890829722125,
"learning_rate": 1.8746081632023514e-06,
"loss": 0.9213,
"step": 1201
},
{
"epoch": 0.49,
"grad_norm": 3.254055346461189,
"learning_rate": 1.8744002875495966e-06,
"loss": 0.8523,
"step": 1202
},
{
"epoch": 0.49,
"grad_norm": 2.9583550828296783,
"learning_rate": 1.8741922512759321e-06,
"loss": 0.8911,
"step": 1203
},
{
"epoch": 0.49,
"grad_norm": 3.1390416107520287,
"learning_rate": 1.8739840544195724e-06,
"loss": 0.9018,
"step": 1204
},
{
"epoch": 0.49,
"grad_norm": 2.951875597800887,
"learning_rate": 1.8737756970187624e-06,
"loss": 0.9334,
"step": 1205
},
{
"epoch": 0.49,
"grad_norm": 2.837738722944825,
"learning_rate": 1.873567179111775e-06,
"loss": 0.8722,
"step": 1206
},
{
"epoch": 0.49,
"grad_norm": 3.3108850532005367,
"learning_rate": 1.8733585007369142e-06,
"loss": 0.9359,
"step": 1207
},
{
"epoch": 0.49,
"grad_norm": 2.6847031025120036,
"learning_rate": 1.873149661932512e-06,
"loss": 0.964,
"step": 1208
},
{
"epoch": 0.49,
"grad_norm": 2.7990633197457147,
"learning_rate": 1.872940662736931e-06,
"loss": 0.8921,
"step": 1209
},
{
"epoch": 0.49,
"grad_norm": 4.224966708131868,
"learning_rate": 1.8727315031885626e-06,
"loss": 0.8823,
"step": 1210
},
{
"epoch": 0.49,
"grad_norm": 3.5160494379445284,
"learning_rate": 1.872522183325828e-06,
"loss": 0.9152,
"step": 1211
},
{
"epoch": 0.49,
"grad_norm": 2.7310286680106355,
"learning_rate": 1.8723127031871776e-06,
"loss": 0.9486,
"step": 1212
},
{
"epoch": 0.5,
"grad_norm": 2.8175105043592614,
"learning_rate": 1.8721030628110916e-06,
"loss": 0.885,
"step": 1213
},
{
"epoch": 0.5,
"grad_norm": 3.397877988229638,
"learning_rate": 1.871893262236079e-06,
"loss": 0.9116,
"step": 1214
},
{
"epoch": 0.5,
"grad_norm": 3.6473187473291513,
"learning_rate": 1.871683301500679e-06,
"loss": 0.8791,
"step": 1215
},
{
"epoch": 0.5,
"grad_norm": 3.219334281331208,
"learning_rate": 1.8714731806434597e-06,
"loss": 0.8757,
"step": 1216
},
{
"epoch": 0.5,
"grad_norm": 3.0457865474907178,
"learning_rate": 1.8712628997030189e-06,
"loss": 0.8829,
"step": 1217
},
{
"epoch": 0.5,
"grad_norm": 3.102110830253005,
"learning_rate": 1.8710524587179833e-06,
"loss": 1.017,
"step": 1218
},
{
"epoch": 0.5,
"grad_norm": 2.8938057337824867,
"learning_rate": 1.8708418577270095e-06,
"loss": 0.9215,
"step": 1219
},
{
"epoch": 0.5,
"grad_norm": 2.9583845621898814,
"learning_rate": 1.8706310967687835e-06,
"loss": 0.8909,
"step": 1220
},
{
"epoch": 0.5,
"grad_norm": 2.6963650488231767,
"learning_rate": 1.8704201758820206e-06,
"loss": 0.961,
"step": 1221
},
{
"epoch": 0.5,
"grad_norm": 2.848036843612148,
"learning_rate": 1.8702090951054653e-06,
"loss": 0.9665,
"step": 1222
},
{
"epoch": 0.5,
"grad_norm": 2.7540353509775737,
"learning_rate": 1.8699978544778917e-06,
"loss": 0.8725,
"step": 1223
},
{
"epoch": 0.5,
"grad_norm": 3.193442710163957,
"learning_rate": 1.8697864540381026e-06,
"loss": 0.9571,
"step": 1224
},
{
"epoch": 0.5,
"grad_norm": 2.8269694169410844,
"learning_rate": 1.8695748938249315e-06,
"loss": 0.9308,
"step": 1225
},
{
"epoch": 0.5,
"grad_norm": 3.077251800501496,
"learning_rate": 1.86936317387724e-06,
"loss": 0.8955,
"step": 1226
},
{
"epoch": 0.5,
"grad_norm": 2.8261093719077044,
"learning_rate": 1.8691512942339193e-06,
"loss": 0.8392,
"step": 1227
},
{
"epoch": 0.5,
"grad_norm": 3.45195650853066,
"learning_rate": 1.868939254933891e-06,
"loss": 0.8892,
"step": 1228
},
{
"epoch": 0.5,
"grad_norm": 2.9222269102152016,
"learning_rate": 1.8687270560161043e-06,
"loss": 0.9305,
"step": 1229
},
{
"epoch": 0.5,
"grad_norm": 2.7559376236530655,
"learning_rate": 1.868514697519539e-06,
"loss": 0.9293,
"step": 1230
},
{
"epoch": 0.5,
"grad_norm": 2.5422995575051943,
"learning_rate": 1.8683021794832034e-06,
"loss": 0.9389,
"step": 1231
},
{
"epoch": 0.5,
"grad_norm": 3.06521236005551,
"learning_rate": 1.868089501946136e-06,
"loss": 0.9242,
"step": 1232
},
{
"epoch": 0.5,
"grad_norm": 3.0085687081787715,
"learning_rate": 1.8678766649474038e-06,
"loss": 0.9173,
"step": 1233
},
{
"epoch": 0.5,
"grad_norm": 3.0274386388370123,
"learning_rate": 1.8676636685261037e-06,
"loss": 0.9667,
"step": 1234
},
{
"epoch": 0.5,
"grad_norm": 3.108979955904097,
"learning_rate": 1.867450512721361e-06,
"loss": 0.9295,
"step": 1235
},
{
"epoch": 0.5,
"grad_norm": 2.7596626057266938,
"learning_rate": 1.867237197572331e-06,
"loss": 0.9004,
"step": 1236
},
{
"epoch": 0.5,
"grad_norm": 2.82017340729043,
"learning_rate": 1.8670237231181988e-06,
"loss": 0.9413,
"step": 1237
},
{
"epoch": 0.51,
"grad_norm": 2.4955406864936185,
"learning_rate": 1.8668100893981778e-06,
"loss": 0.9151,
"step": 1238
},
{
"epoch": 0.51,
"grad_norm": 3.0173734953959666,
"learning_rate": 1.8665962964515102e-06,
"loss": 0.9833,
"step": 1239
},
{
"epoch": 0.51,
"grad_norm": 3.2196068941254645,
"learning_rate": 1.866382344317469e-06,
"loss": 0.8699,
"step": 1240
},
{
"epoch": 0.51,
"grad_norm": 2.5654194769741947,
"learning_rate": 1.8661682330353555e-06,
"loss": 0.8875,
"step": 1241
},
{
"epoch": 0.51,
"grad_norm": 4.579434933107168,
"learning_rate": 1.8659539626445e-06,
"loss": 0.8241,
"step": 1242
},
{
"epoch": 0.51,
"grad_norm": 3.2411454701215425,
"learning_rate": 1.8657395331842628e-06,
"loss": 0.8918,
"step": 1243
},
{
"epoch": 0.51,
"grad_norm": 2.6730529777203134,
"learning_rate": 1.8655249446940328e-06,
"loss": 0.8354,
"step": 1244
},
{
"epoch": 0.51,
"grad_norm": 2.8692960761132604,
"learning_rate": 1.8653101972132289e-06,
"loss": 0.9014,
"step": 1245
},
{
"epoch": 0.51,
"grad_norm": 2.6092648152285225,
"learning_rate": 1.8650952907812973e-06,
"loss": 0.8697,
"step": 1246
},
{
"epoch": 0.51,
"grad_norm": 2.703759778794585,
"learning_rate": 1.8648802254377162e-06,
"loss": 0.9558,
"step": 1247
},
{
"epoch": 0.51,
"grad_norm": 2.894111804419412,
"learning_rate": 1.864665001221991e-06,
"loss": 0.9631,
"step": 1248
},
{
"epoch": 0.51,
"grad_norm": 3.462204574619209,
"learning_rate": 1.8644496181736562e-06,
"loss": 0.9438,
"step": 1249
},
{
"epoch": 0.51,
"grad_norm": 2.5079727232793685,
"learning_rate": 1.8642340763322767e-06,
"loss": 0.9454,
"step": 1250
},
{
"epoch": 0.51,
"grad_norm": 2.875844047885332,
"learning_rate": 1.8640183757374459e-06,
"loss": 0.9864,
"step": 1251
},
{
"epoch": 0.51,
"grad_norm": 2.9072282332510992,
"learning_rate": 1.8638025164287866e-06,
"loss": 0.9246,
"step": 1252
},
{
"epoch": 0.51,
"grad_norm": 2.786702573437684,
"learning_rate": 1.86358649844595e-06,
"loss": 0.9349,
"step": 1253
},
{
"epoch": 0.51,
"grad_norm": 3.0017053261101214,
"learning_rate": 1.8633703218286172e-06,
"loss": 0.8815,
"step": 1254
},
{
"epoch": 0.51,
"grad_norm": 2.9669874739623943,
"learning_rate": 1.8631539866164987e-06,
"loss": 0.8788,
"step": 1255
},
{
"epoch": 0.51,
"grad_norm": 2.9968452991921617,
"learning_rate": 1.8629374928493333e-06,
"loss": 0.9386,
"step": 1256
},
{
"epoch": 0.51,
"grad_norm": 3.3834322479255463,
"learning_rate": 1.8627208405668897e-06,
"loss": 0.9,
"step": 1257
},
{
"epoch": 0.51,
"grad_norm": 2.919857649679711,
"learning_rate": 1.8625040298089644e-06,
"loss": 0.9337,
"step": 1258
},
{
"epoch": 0.51,
"grad_norm": 3.163679464547735,
"learning_rate": 1.8622870606153853e-06,
"loss": 0.8907,
"step": 1259
},
{
"epoch": 0.51,
"grad_norm": 2.6952660777833852,
"learning_rate": 1.862069933026007e-06,
"loss": 0.932,
"step": 1260
},
{
"epoch": 0.51,
"grad_norm": 3.009478363684415,
"learning_rate": 1.8618526470807146e-06,
"loss": 0.8614,
"step": 1261
},
{
"epoch": 0.51,
"grad_norm": 2.7334039134509918,
"learning_rate": 1.8616352028194217e-06,
"loss": 0.9668,
"step": 1262
},
{
"epoch": 0.52,
"grad_norm": 2.594412644650864,
"learning_rate": 1.8614176002820715e-06,
"loss": 0.9473,
"step": 1263
},
{
"epoch": 0.52,
"grad_norm": 2.6110229324317094,
"learning_rate": 1.8611998395086359e-06,
"loss": 0.9592,
"step": 1264
},
{
"epoch": 0.52,
"grad_norm": 3.5049688584155545,
"learning_rate": 1.8609819205391163e-06,
"loss": 0.9011,
"step": 1265
},
{
"epoch": 0.52,
"grad_norm": 2.895582737452778,
"learning_rate": 1.8607638434135419e-06,
"loss": 0.9094,
"step": 1266
},
{
"epoch": 0.52,
"grad_norm": 3.1620155727604597,
"learning_rate": 1.8605456081719728e-06,
"loss": 0.9189,
"step": 1267
},
{
"epoch": 0.52,
"grad_norm": 3.5828466815166045,
"learning_rate": 1.8603272148544966e-06,
"loss": 0.8521,
"step": 1268
},
{
"epoch": 0.52,
"grad_norm": 3.464068902186186,
"learning_rate": 1.860108663501231e-06,
"loss": 0.9034,
"step": 1269
},
{
"epoch": 0.52,
"grad_norm": 3.141535920184389,
"learning_rate": 1.859889954152322e-06,
"loss": 0.9614,
"step": 1270
},
{
"epoch": 0.52,
"grad_norm": 3.0270668851505675,
"learning_rate": 1.8596710868479453e-06,
"loss": 0.9341,
"step": 1271
},
{
"epoch": 0.52,
"grad_norm": 2.853022853596971,
"learning_rate": 1.8594520616283043e-06,
"loss": 0.8554,
"step": 1272
},
{
"epoch": 0.52,
"grad_norm": 2.6838707638896695,
"learning_rate": 1.8592328785336333e-06,
"loss": 0.881,
"step": 1273
},
{
"epoch": 0.52,
"grad_norm": 3.819369038985289,
"learning_rate": 1.8590135376041943e-06,
"loss": 0.8752,
"step": 1274
},
{
"epoch": 0.52,
"grad_norm": 2.9328800612802337,
"learning_rate": 1.8587940388802782e-06,
"loss": 0.9555,
"step": 1275
},
{
"epoch": 0.52,
"grad_norm": 2.9886934281498787,
"learning_rate": 1.8585743824022057e-06,
"loss": 0.9139,
"step": 1276
},
{
"epoch": 0.52,
"grad_norm": 3.6018031211603874,
"learning_rate": 1.8583545682103266e-06,
"loss": 0.9135,
"step": 1277
},
{
"epoch": 0.52,
"grad_norm": 2.910376499641723,
"learning_rate": 1.8581345963450179e-06,
"loss": 0.8833,
"step": 1278
},
{
"epoch": 0.52,
"grad_norm": 2.8639374619429563,
"learning_rate": 1.857914466846688e-06,
"loss": 0.8855,
"step": 1279
},
{
"epoch": 0.52,
"grad_norm": 2.744841953572245,
"learning_rate": 1.8576941797557723e-06,
"loss": 0.927,
"step": 1280
},
{
"epoch": 0.52,
"grad_norm": 3.018130650812206,
"learning_rate": 1.8574737351127362e-06,
"loss": 0.94,
"step": 1281
},
{
"epoch": 0.52,
"grad_norm": 2.4646620688105316,
"learning_rate": 1.857253132958074e-06,
"loss": 0.928,
"step": 1282
},
{
"epoch": 0.52,
"grad_norm": 2.5092573148733166,
"learning_rate": 1.8570323733323084e-06,
"loss": 0.9178,
"step": 1283
},
{
"epoch": 0.52,
"grad_norm": 2.5017527709526957,
"learning_rate": 1.856811456275991e-06,
"loss": 0.8846,
"step": 1284
},
{
"epoch": 0.52,
"grad_norm": 2.616363067070405,
"learning_rate": 1.8565903818297036e-06,
"loss": 0.8935,
"step": 1285
},
{
"epoch": 0.52,
"grad_norm": 2.8142445469208535,
"learning_rate": 1.8563691500340548e-06,
"loss": 0.8768,
"step": 1286
},
{
"epoch": 0.53,
"grad_norm": 3.039638167794474,
"learning_rate": 1.856147760929684e-06,
"loss": 0.9129,
"step": 1287
},
{
"epoch": 0.53,
"grad_norm": 4.808217825391662,
"learning_rate": 1.8559262145572586e-06,
"loss": 0.8932,
"step": 1288
},
{
"epoch": 0.53,
"grad_norm": 2.806152718088999,
"learning_rate": 1.8557045109574754e-06,
"loss": 0.8519,
"step": 1289
},
{
"epoch": 0.53,
"grad_norm": 2.9378733959795085,
"learning_rate": 1.855482650171059e-06,
"loss": 0.9472,
"step": 1290
},
{
"epoch": 0.53,
"grad_norm": 2.55986695054741,
"learning_rate": 1.8552606322387637e-06,
"loss": 0.9512,
"step": 1291
},
{
"epoch": 0.53,
"grad_norm": 2.941635812620943,
"learning_rate": 1.8550384572013735e-06,
"loss": 0.9121,
"step": 1292
},
{
"epoch": 0.53,
"grad_norm": 3.0154769864857798,
"learning_rate": 1.854816125099699e-06,
"loss": 0.9226,
"step": 1293
},
{
"epoch": 0.53,
"grad_norm": 3.5762700676876262,
"learning_rate": 1.8545936359745818e-06,
"loss": 0.9044,
"step": 1294
},
{
"epoch": 0.53,
"grad_norm": 2.6959451737369093,
"learning_rate": 1.8543709898668913e-06,
"loss": 0.9284,
"step": 1295
},
{
"epoch": 0.53,
"grad_norm": 3.005210303505453,
"learning_rate": 1.8541481868175262e-06,
"loss": 0.9518,
"step": 1296
},
{
"epoch": 0.53,
"grad_norm": 2.8045455523176486,
"learning_rate": 1.8539252268674132e-06,
"loss": 0.8787,
"step": 1297
},
{
"epoch": 0.53,
"grad_norm": 2.9153210787448933,
"learning_rate": 1.853702110057509e-06,
"loss": 0.9354,
"step": 1298
},
{
"epoch": 0.53,
"grad_norm": 3.4106635856134373,
"learning_rate": 1.8534788364287982e-06,
"loss": 0.868,
"step": 1299
},
{
"epoch": 0.53,
"grad_norm": 2.9802766819195594,
"learning_rate": 1.8532554060222947e-06,
"loss": 0.918,
"step": 1300
},
{
"epoch": 0.53,
"eval_loss": 0.9075450897216797,
"eval_runtime": 465.906,
"eval_samples_per_second": 74.805,
"eval_steps_per_second": 4.677,
"step": 1300
},
{
"epoch": 0.53,
"grad_norm": 2.9132555862846674,
"learning_rate": 1.8530318188790405e-06,
"loss": 0.9429,
"step": 1301
},
{
"epoch": 0.53,
"grad_norm": 3.5938030091223583,
"learning_rate": 1.852808075040108e-06,
"loss": 0.9146,
"step": 1302
},
{
"epoch": 0.53,
"grad_norm": 2.7532771490492554,
"learning_rate": 1.8525841745465961e-06,
"loss": 0.9079,
"step": 1303
},
{
"epoch": 0.53,
"grad_norm": 2.534033895622318,
"learning_rate": 1.8523601174396343e-06,
"loss": 0.9292,
"step": 1304
},
{
"epoch": 0.53,
"grad_norm": 2.6437737860024506,
"learning_rate": 1.8521359037603806e-06,
"loss": 0.9371,
"step": 1305
},
{
"epoch": 0.53,
"grad_norm": 2.685481552735365,
"learning_rate": 1.8519115335500207e-06,
"loss": 0.8646,
"step": 1306
},
{
"epoch": 0.53,
"grad_norm": 2.664246117662947,
"learning_rate": 1.85168700684977e-06,
"loss": 0.9069,
"step": 1307
},
{
"epoch": 0.53,
"grad_norm": 3.4479087387758796,
"learning_rate": 1.8514623237008723e-06,
"loss": 0.8889,
"step": 1308
},
{
"epoch": 0.53,
"grad_norm": 2.6041237014358054,
"learning_rate": 1.8512374841446006e-06,
"loss": 0.9327,
"step": 1309
},
{
"epoch": 0.53,
"grad_norm": 3.5428073872545895,
"learning_rate": 1.851012488222256e-06,
"loss": 0.9299,
"step": 1310
},
{
"epoch": 0.53,
"grad_norm": 3.327380552462028,
"learning_rate": 1.8507873359751685e-06,
"loss": 0.8588,
"step": 1311
},
{
"epoch": 0.54,
"grad_norm": 4.330997761779057,
"learning_rate": 1.850562027444697e-06,
"loss": 0.9131,
"step": 1312
},
{
"epoch": 0.54,
"grad_norm": 3.421140756674206,
"learning_rate": 1.8503365626722297e-06,
"loss": 0.973,
"step": 1313
},
{
"epoch": 0.54,
"grad_norm": 3.316652919919943,
"learning_rate": 1.8501109416991815e-06,
"loss": 0.8969,
"step": 1314
},
{
"epoch": 0.54,
"grad_norm": 3.946187935606095,
"learning_rate": 1.8498851645669986e-06,
"loss": 0.9359,
"step": 1315
},
{
"epoch": 0.54,
"grad_norm": 3.7449005983233112,
"learning_rate": 1.8496592313171538e-06,
"loss": 0.9223,
"step": 1316
},
{
"epoch": 0.54,
"grad_norm": 3.2349650116163144,
"learning_rate": 1.8494331419911493e-06,
"loss": 0.7779,
"step": 1317
},
{
"epoch": 0.54,
"grad_norm": 3.431179463537583,
"learning_rate": 1.8492068966305168e-06,
"loss": 0.9068,
"step": 1318
},
{
"epoch": 0.54,
"grad_norm": 2.5768264130412506,
"learning_rate": 1.848980495276815e-06,
"loss": 0.9764,
"step": 1319
},
{
"epoch": 0.54,
"grad_norm": 3.517060875141672,
"learning_rate": 1.848753937971633e-06,
"loss": 0.8626,
"step": 1320
},
{
"epoch": 0.54,
"grad_norm": 3.5251108845917583,
"learning_rate": 1.8485272247565872e-06,
"loss": 0.9557,
"step": 1321
},
{
"epoch": 0.54,
"grad_norm": 2.9772632297822765,
"learning_rate": 1.8483003556733237e-06,
"loss": 0.9251,
"step": 1322
},
{
"epoch": 0.54,
"grad_norm": 4.46015933933627,
"learning_rate": 1.8480733307635159e-06,
"loss": 0.9092,
"step": 1323
},
{
"epoch": 0.54,
"grad_norm": 2.7172489368693875,
"learning_rate": 1.847846150068867e-06,
"loss": 0.9256,
"step": 1324
},
{
"epoch": 0.54,
"grad_norm": 3.047895464148809,
"learning_rate": 1.8476188136311084e-06,
"loss": 0.869,
"step": 1325
},
{
"epoch": 0.54,
"grad_norm": 2.942758131615926,
"learning_rate": 1.847391321492e-06,
"loss": 0.9067,
"step": 1326
},
{
"epoch": 0.54,
"grad_norm": 3.570921868630955,
"learning_rate": 1.8471636736933309e-06,
"loss": 0.9562,
"step": 1327
},
{
"epoch": 0.54,
"grad_norm": 3.2808585000904795,
"learning_rate": 1.8469358702769177e-06,
"loss": 0.8779,
"step": 1328
},
{
"epoch": 0.54,
"grad_norm": 2.7552926482956046,
"learning_rate": 1.8467079112846067e-06,
"loss": 0.9128,
"step": 1329
},
{
"epoch": 0.54,
"grad_norm": 3.511422751083706,
"learning_rate": 1.8464797967582722e-06,
"loss": 0.9135,
"step": 1330
},
{
"epoch": 0.54,
"grad_norm": 3.2601633517342665,
"learning_rate": 1.8462515267398171e-06,
"loss": 0.9057,
"step": 1331
},
{
"epoch": 0.54,
"grad_norm": 3.835731498200863,
"learning_rate": 1.8460231012711728e-06,
"loss": 0.9294,
"step": 1332
},
{
"epoch": 0.54,
"grad_norm": 3.0095327684737856,
"learning_rate": 1.8457945203942993e-06,
"loss": 0.8981,
"step": 1333
},
{
"epoch": 0.54,
"grad_norm": 2.815776675822424,
"learning_rate": 1.8455657841511858e-06,
"loss": 0.8848,
"step": 1334
},
{
"epoch": 0.54,
"grad_norm": 2.8666135994254085,
"learning_rate": 1.8453368925838485e-06,
"loss": 0.876,
"step": 1335
},
{
"epoch": 0.55,
"grad_norm": 3.0054355521642058,
"learning_rate": 1.8451078457343343e-06,
"loss": 0.9434,
"step": 1336
},
{
"epoch": 0.55,
"grad_norm": 3.209282614229144,
"learning_rate": 1.8448786436447166e-06,
"loss": 0.8975,
"step": 1337
},
{
"epoch": 0.55,
"grad_norm": 3.6906821098285474,
"learning_rate": 1.8446492863570982e-06,
"loss": 0.8548,
"step": 1338
},
{
"epoch": 0.55,
"grad_norm": 3.3137852724034937,
"learning_rate": 1.8444197739136107e-06,
"loss": 0.8959,
"step": 1339
},
{
"epoch": 0.55,
"grad_norm": 2.327935824086638,
"learning_rate": 1.8441901063564136e-06,
"loss": 0.8755,
"step": 1340
},
{
"epoch": 0.55,
"grad_norm": 3.15754328262364,
"learning_rate": 1.843960283727695e-06,
"loss": 0.9132,
"step": 1341
},
{
"epoch": 0.55,
"grad_norm": 3.0244296968059046,
"learning_rate": 1.8437303060696721e-06,
"loss": 0.8741,
"step": 1342
},
{
"epoch": 0.55,
"grad_norm": 3.179153119146885,
"learning_rate": 1.8435001734245893e-06,
"loss": 0.9401,
"step": 1343
},
{
"epoch": 0.55,
"grad_norm": 2.995878257942581,
"learning_rate": 1.8432698858347212e-06,
"loss": 0.8563,
"step": 1344
},
{
"epoch": 0.55,
"grad_norm": 2.718404433980867,
"learning_rate": 1.8430394433423695e-06,
"loss": 0.9063,
"step": 1345
},
{
"epoch": 0.55,
"grad_norm": 2.9818857354345725,
"learning_rate": 1.842808845989865e-06,
"loss": 0.9276,
"step": 1346
},
{
"epoch": 0.55,
"grad_norm": 2.821920668771963,
"learning_rate": 1.8425780938195662e-06,
"loss": 0.8937,
"step": 1347
},
{
"epoch": 0.55,
"grad_norm": 2.67827333377803,
"learning_rate": 1.8423471868738614e-06,
"loss": 0.8338,
"step": 1348
},
{
"epoch": 0.55,
"grad_norm": 5.518272611114038,
"learning_rate": 1.8421161251951656e-06,
"loss": 0.8601,
"step": 1349
},
{
"epoch": 0.55,
"grad_norm": 2.651798455792766,
"learning_rate": 1.8418849088259238e-06,
"loss": 0.9253,
"step": 1350
},
{
"epoch": 0.55,
"grad_norm": 2.8675287148573783,
"learning_rate": 1.8416535378086086e-06,
"loss": 1.0072,
"step": 1351
},
{
"epoch": 0.55,
"grad_norm": 2.6485097502904904,
"learning_rate": 1.841422012185721e-06,
"loss": 0.9047,
"step": 1352
},
{
"epoch": 0.55,
"grad_norm": 3.179296323883959,
"learning_rate": 1.8411903319997907e-06,
"loss": 0.825,
"step": 1353
},
{
"epoch": 0.55,
"grad_norm": 2.6332125336649477,
"learning_rate": 1.8409584972933755e-06,
"loss": 0.8751,
"step": 1354
},
{
"epoch": 0.55,
"grad_norm": 2.765960714533217,
"learning_rate": 1.8407265081090619e-06,
"loss": 0.9527,
"step": 1355
},
{
"epoch": 0.55,
"grad_norm": 3.8352628740686554,
"learning_rate": 1.8404943644894646e-06,
"loss": 0.9181,
"step": 1356
},
{
"epoch": 0.55,
"grad_norm": 3.7317860496550237,
"learning_rate": 1.8402620664772266e-06,
"loss": 0.9219,
"step": 1357
},
{
"epoch": 0.55,
"grad_norm": 3.738498263680275,
"learning_rate": 1.8400296141150193e-06,
"loss": 0.9596,
"step": 1358
},
{
"epoch": 0.55,
"grad_norm": 3.0381065794311706,
"learning_rate": 1.8397970074455427e-06,
"loss": 0.9174,
"step": 1359
},
{
"epoch": 0.55,
"grad_norm": 3.319303432169093,
"learning_rate": 1.8395642465115247e-06,
"loss": 0.891,
"step": 1360
},
{
"epoch": 0.56,
"grad_norm": 3.5555349773802942,
"learning_rate": 1.8393313313557214e-06,
"loss": 0.9954,
"step": 1361
},
{
"epoch": 0.56,
"grad_norm": 2.555717853333345,
"learning_rate": 1.8390982620209187e-06,
"loss": 0.9086,
"step": 1362
},
{
"epoch": 0.56,
"grad_norm": 2.6927857721291013,
"learning_rate": 1.8388650385499288e-06,
"loss": 0.9019,
"step": 1363
},
{
"epoch": 0.56,
"grad_norm": 3.1771402352196914,
"learning_rate": 1.8386316609855937e-06,
"loss": 0.931,
"step": 1364
},
{
"epoch": 0.56,
"grad_norm": 2.583053370287236,
"learning_rate": 1.8383981293707827e-06,
"loss": 0.9856,
"step": 1365
},
{
"epoch": 0.56,
"grad_norm": 2.3592484251381416,
"learning_rate": 1.8381644437483938e-06,
"loss": 0.9055,
"step": 1366
},
{
"epoch": 0.56,
"grad_norm": 2.5711923871171303,
"learning_rate": 1.837930604161354e-06,
"loss": 0.8804,
"step": 1367
},
{
"epoch": 0.56,
"grad_norm": 2.9751620130260887,
"learning_rate": 1.837696610652617e-06,
"loss": 0.9049,
"step": 1368
},
{
"epoch": 0.56,
"grad_norm": 2.61661624743771,
"learning_rate": 1.8374624632651663e-06,
"loss": 0.9119,
"step": 1369
},
{
"epoch": 0.56,
"grad_norm": 3.314519725301463,
"learning_rate": 1.8372281620420129e-06,
"loss": 0.797,
"step": 1370
},
{
"epoch": 0.56,
"grad_norm": 3.4331855727063956,
"learning_rate": 1.8369937070261963e-06,
"loss": 0.8789,
"step": 1371
},
{
"epoch": 0.56,
"grad_norm": 2.924871018223905,
"learning_rate": 1.836759098260784e-06,
"loss": 0.8569,
"step": 1372
},
{
"epoch": 0.56,
"grad_norm": 3.0988364057606277,
"learning_rate": 1.836524335788872e-06,
"loss": 0.9406,
"step": 1373
},
{
"epoch": 0.56,
"grad_norm": 2.7704696791898202,
"learning_rate": 1.8362894196535845e-06,
"loss": 0.9666,
"step": 1374
},
{
"epoch": 0.56,
"grad_norm": 3.3899908841329722,
"learning_rate": 1.8360543498980737e-06,
"loss": 0.911,
"step": 1375
},
{
"epoch": 0.56,
"grad_norm": 2.563253080727456,
"learning_rate": 1.8358191265655203e-06,
"loss": 0.9498,
"step": 1376
},
{
"epoch": 0.56,
"grad_norm": 3.153589614861708,
"learning_rate": 1.835583749699133e-06,
"loss": 0.8111,
"step": 1377
},
{
"epoch": 0.56,
"grad_norm": 2.746673213436209,
"learning_rate": 1.835348219342149e-06,
"loss": 0.8826,
"step": 1378
},
{
"epoch": 0.56,
"grad_norm": 3.03726790872311,
"learning_rate": 1.8351125355378334e-06,
"loss": 0.8727,
"step": 1379
},
{
"epoch": 0.56,
"grad_norm": 2.6250991084420527,
"learning_rate": 1.8348766983294797e-06,
"loss": 0.9007,
"step": 1380
},
{
"epoch": 0.56,
"grad_norm": 3.7768307242119796,
"learning_rate": 1.8346407077604092e-06,
"loss": 0.9059,
"step": 1381
},
{
"epoch": 0.56,
"grad_norm": 2.3408898703828074,
"learning_rate": 1.8344045638739717e-06,
"loss": 0.9327,
"step": 1382
},
{
"epoch": 0.56,
"grad_norm": 2.807391432737967,
"learning_rate": 1.834168266713545e-06,
"loss": 0.9255,
"step": 1383
},
{
"epoch": 0.56,
"grad_norm": 3.4429477332805463,
"learning_rate": 1.8339318163225355e-06,
"loss": 0.8589,
"step": 1384
},
{
"epoch": 0.57,
"grad_norm": 2.908464132242198,
"learning_rate": 1.8336952127443771e-06,
"loss": 0.938,
"step": 1385
},
{
"epoch": 0.57,
"grad_norm": 3.5041622267832553,
"learning_rate": 1.8334584560225323e-06,
"loss": 0.8435,
"step": 1386
},
{
"epoch": 0.57,
"grad_norm": 2.848730907990916,
"learning_rate": 1.833221546200492e-06,
"loss": 0.8123,
"step": 1387
},
{
"epoch": 0.57,
"grad_norm": 2.692379032325378,
"learning_rate": 1.8329844833217736e-06,
"loss": 0.8768,
"step": 1388
},
{
"epoch": 0.57,
"grad_norm": 2.632713946049691,
"learning_rate": 1.832747267429925e-06,
"loss": 0.8866,
"step": 1389
},
{
"epoch": 0.57,
"grad_norm": 2.834387353987096,
"learning_rate": 1.8325098985685204e-06,
"loss": 0.8855,
"step": 1390
},
{
"epoch": 0.57,
"grad_norm": 3.4844770852063087,
"learning_rate": 1.8322723767811628e-06,
"loss": 0.913,
"step": 1391
},
{
"epoch": 0.57,
"grad_norm": 2.4073133818418966,
"learning_rate": 1.832034702111483e-06,
"loss": 0.9008,
"step": 1392
},
{
"epoch": 0.57,
"grad_norm": 2.605281605875384,
"learning_rate": 1.8317968746031407e-06,
"loss": 0.8945,
"step": 1393
},
{
"epoch": 0.57,
"grad_norm": 3.14318071835555,
"learning_rate": 1.8315588942998227e-06,
"loss": 0.8922,
"step": 1394
},
{
"epoch": 0.57,
"grad_norm": 3.7850786963613694,
"learning_rate": 1.8313207612452442e-06,
"loss": 0.8418,
"step": 1395
},
{
"epoch": 0.57,
"grad_norm": 2.7624038934153563,
"learning_rate": 1.8310824754831486e-06,
"loss": 0.8807,
"step": 1396
},
{
"epoch": 0.57,
"grad_norm": 2.9766316486397666,
"learning_rate": 1.8308440370573072e-06,
"loss": 0.8677,
"step": 1397
},
{
"epoch": 0.57,
"grad_norm": 2.4641508662040663,
"learning_rate": 1.830605446011519e-06,
"loss": 0.9478,
"step": 1398
},
{
"epoch": 0.57,
"grad_norm": 2.921133197515091,
"learning_rate": 1.8303667023896123e-06,
"loss": 0.9099,
"step": 1399
},
{
"epoch": 0.57,
"grad_norm": 2.9087330758848724,
"learning_rate": 1.830127806235442e-06,
"loss": 0.8809,
"step": 1400
},
{
"epoch": 0.57,
"eval_loss": 0.9042284488677979,
"eval_runtime": 464.6362,
"eval_samples_per_second": 75.009,
"eval_steps_per_second": 4.69,
"step": 1400
},
{
"epoch": 0.57,
"grad_norm": 3.0529192663440843,
"learning_rate": 1.8298887575928913e-06,
"loss": 0.9463,
"step": 1401
},
{
"epoch": 0.57,
"grad_norm": 2.4868788190273228,
"learning_rate": 1.8296495565058722e-06,
"loss": 0.9526,
"step": 1402
},
{
"epoch": 0.57,
"grad_norm": 2.836775726483153,
"learning_rate": 1.8294102030183241e-06,
"loss": 0.877,
"step": 1403
},
{
"epoch": 0.57,
"grad_norm": 2.9870899553975176,
"learning_rate": 1.8291706971742142e-06,
"loss": 0.9617,
"step": 1404
},
{
"epoch": 0.57,
"grad_norm": 2.8904273958293634,
"learning_rate": 1.828931039017538e-06,
"loss": 0.9175,
"step": 1405
},
{
"epoch": 0.57,
"grad_norm": 2.707365501935251,
"learning_rate": 1.8286912285923192e-06,
"loss": 0.9836,
"step": 1406
},
{
"epoch": 0.57,
"grad_norm": 2.536144691911259,
"learning_rate": 1.8284512659426095e-06,
"loss": 0.9246,
"step": 1407
},
{
"epoch": 0.57,
"grad_norm": 3.398580545509084,
"learning_rate": 1.8282111511124873e-06,
"loss": 0.8687,
"step": 1408
},
{
"epoch": 0.57,
"grad_norm": 3.782305519249483,
"learning_rate": 1.8279708841460606e-06,
"loss": 0.8549,
"step": 1409
},
{
"epoch": 0.58,
"grad_norm": 2.80421855827184,
"learning_rate": 1.8277304650874648e-06,
"loss": 0.8963,
"step": 1410
},
{
"epoch": 0.58,
"grad_norm": 3.796549934623593,
"learning_rate": 1.8274898939808624e-06,
"loss": 0.8773,
"step": 1411
},
{
"epoch": 0.58,
"grad_norm": 3.3880227903317865,
"learning_rate": 1.8272491708704455e-06,
"loss": 0.7991,
"step": 1412
},
{
"epoch": 0.58,
"grad_norm": 2.4085508982427957,
"learning_rate": 1.8270082958004323e-06,
"loss": 0.9772,
"step": 1413
},
{
"epoch": 0.58,
"grad_norm": 3.106730531352387,
"learning_rate": 1.8267672688150705e-06,
"loss": 0.8838,
"step": 1414
},
{
"epoch": 0.58,
"grad_norm": 2.8658802770452825,
"learning_rate": 1.8265260899586345e-06,
"loss": 0.9183,
"step": 1415
},
{
"epoch": 0.58,
"grad_norm": 2.624369643787591,
"learning_rate": 1.8262847592754275e-06,
"loss": 0.8948,
"step": 1416
},
{
"epoch": 0.58,
"grad_norm": 2.633528817717931,
"learning_rate": 1.8260432768097796e-06,
"loss": 0.9037,
"step": 1417
},
{
"epoch": 0.58,
"grad_norm": 3.214133048876547,
"learning_rate": 1.82580164260605e-06,
"loss": 0.8554,
"step": 1418
},
{
"epoch": 0.58,
"grad_norm": 3.234087459455778,
"learning_rate": 1.8255598567086242e-06,
"loss": 0.952,
"step": 1419
},
{
"epoch": 0.58,
"grad_norm": 3.269241678992246,
"learning_rate": 1.8253179191619178e-06,
"loss": 0.8981,
"step": 1420
},
{
"epoch": 0.58,
"grad_norm": 2.8281077263938523,
"learning_rate": 1.8250758300103716e-06,
"loss": 0.8679,
"step": 1421
},
{
"epoch": 0.58,
"grad_norm": 2.5174016414119125,
"learning_rate": 1.8248335892984566e-06,
"loss": 0.935,
"step": 1422
},
{
"epoch": 0.58,
"grad_norm": 3.340024960932295,
"learning_rate": 1.8245911970706703e-06,
"loss": 0.9355,
"step": 1423
},
{
"epoch": 0.58,
"grad_norm": 2.5886318565444397,
"learning_rate": 1.8243486533715381e-06,
"loss": 0.8702,
"step": 1424
},
{
"epoch": 0.58,
"grad_norm": 2.8161770803718906,
"learning_rate": 1.8241059582456142e-06,
"loss": 0.9524,
"step": 1425
},
{
"epoch": 0.58,
"grad_norm": 2.572690300203282,
"learning_rate": 1.823863111737479e-06,
"loss": 0.8683,
"step": 1426
},
{
"epoch": 0.58,
"grad_norm": 2.928572934853922,
"learning_rate": 1.823620113891742e-06,
"loss": 0.9058,
"step": 1427
},
{
"epoch": 0.58,
"grad_norm": 3.4076922135462606,
"learning_rate": 1.8233769647530407e-06,
"loss": 0.9133,
"step": 1428
},
{
"epoch": 0.58,
"grad_norm": 2.329437888201068,
"learning_rate": 1.8231336643660393e-06,
"loss": 0.9067,
"step": 1429
},
{
"epoch": 0.58,
"grad_norm": 3.2959332939621278,
"learning_rate": 1.8228902127754299e-06,
"loss": 0.858,
"step": 1430
},
{
"epoch": 0.58,
"grad_norm": 2.5242249958853273,
"learning_rate": 1.8226466100259334e-06,
"loss": 0.9194,
"step": 1431
},
{
"epoch": 0.58,
"grad_norm": 2.9478013678790034,
"learning_rate": 1.8224028561622978e-06,
"loss": 0.9385,
"step": 1432
},
{
"epoch": 0.58,
"grad_norm": 3.014532016529034,
"learning_rate": 1.8221589512292983e-06,
"loss": 0.8762,
"step": 1433
},
{
"epoch": 0.59,
"grad_norm": 3.1486366901837775,
"learning_rate": 1.821914895271739e-06,
"loss": 0.9149,
"step": 1434
},
{
"epoch": 0.59,
"grad_norm": 3.0812705093909027,
"learning_rate": 1.821670688334451e-06,
"loss": 0.9035,
"step": 1435
},
{
"epoch": 0.59,
"grad_norm": 3.228181852688186,
"learning_rate": 1.8214263304622935e-06,
"loss": 0.8798,
"step": 1436
},
{
"epoch": 0.59,
"grad_norm": 3.2201277641728883,
"learning_rate": 1.8211818217001528e-06,
"loss": 0.8836,
"step": 1437
},
{
"epoch": 0.59,
"grad_norm": 3.1870611937867364,
"learning_rate": 1.8209371620929442e-06,
"loss": 0.9251,
"step": 1438
},
{
"epoch": 0.59,
"grad_norm": 2.508423148093909,
"learning_rate": 1.8206923516856087e-06,
"loss": 0.8845,
"step": 1439
},
{
"epoch": 0.59,
"grad_norm": 2.7836558341639877,
"learning_rate": 1.8204473905231175e-06,
"loss": 0.8776,
"step": 1440
},
{
"epoch": 0.59,
"grad_norm": 2.4493826700468087,
"learning_rate": 1.8202022786504668e-06,
"loss": 0.867,
"step": 1441
},
{
"epoch": 0.59,
"grad_norm": 2.6148620449686537,
"learning_rate": 1.819957016112683e-06,
"loss": 0.8908,
"step": 1442
},
{
"epoch": 0.59,
"grad_norm": 4.009159720331481,
"learning_rate": 1.8197116029548184e-06,
"loss": 0.8573,
"step": 1443
},
{
"epoch": 0.59,
"grad_norm": 2.4711143148033585,
"learning_rate": 1.8194660392219536e-06,
"loss": 0.8716,
"step": 1444
},
{
"epoch": 0.59,
"grad_norm": 2.7355684788936854,
"learning_rate": 1.8192203249591972e-06,
"loss": 0.898,
"step": 1445
},
{
"epoch": 0.59,
"grad_norm": 3.5966808690759713,
"learning_rate": 1.8189744602116847e-06,
"loss": 0.9131,
"step": 1446
},
{
"epoch": 0.59,
"grad_norm": 2.984695469807654,
"learning_rate": 1.81872844502458e-06,
"loss": 0.9495,
"step": 1447
},
{
"epoch": 0.59,
"grad_norm": 2.6767416432520372,
"learning_rate": 1.8184822794430743e-06,
"loss": 0.8558,
"step": 1448
},
{
"epoch": 0.59,
"grad_norm": 2.405494123488941,
"learning_rate": 1.8182359635123858e-06,
"loss": 0.8618,
"step": 1449
},
{
"epoch": 0.59,
"grad_norm": 3.001468309206258,
"learning_rate": 1.817989497277762e-06,
"loss": 0.9143,
"step": 1450
},
{
"epoch": 0.59,
"grad_norm": 2.9322100228959473,
"learning_rate": 1.8177428807844758e-06,
"loss": 0.9228,
"step": 1451
},
{
"epoch": 0.59,
"grad_norm": 2.7390871287187997,
"learning_rate": 1.8174961140778295e-06,
"loss": 0.9006,
"step": 1452
},
{
"epoch": 0.59,
"grad_norm": 2.4903265777682417,
"learning_rate": 1.8172491972031524e-06,
"loss": 0.9006,
"step": 1453
},
{
"epoch": 0.59,
"grad_norm": 2.939680178315496,
"learning_rate": 1.8170021302058007e-06,
"loss": 0.8968,
"step": 1454
},
{
"epoch": 0.59,
"grad_norm": 2.2450106145504605,
"learning_rate": 1.8167549131311594e-06,
"loss": 0.8875,
"step": 1455
},
{
"epoch": 0.59,
"grad_norm": 2.4566027845854435,
"learning_rate": 1.8165075460246401e-06,
"loss": 0.8544,
"step": 1456
},
{
"epoch": 0.59,
"grad_norm": 3.3131449756563423,
"learning_rate": 1.8162600289316826e-06,
"loss": 0.9163,
"step": 1457
},
{
"epoch": 0.59,
"grad_norm": 2.495267411857596,
"learning_rate": 1.8160123618977538e-06,
"loss": 0.884,
"step": 1458
},
{
"epoch": 0.6,
"grad_norm": 2.510031572924042,
"learning_rate": 1.8157645449683486e-06,
"loss": 0.8669,
"step": 1459
},
{
"epoch": 0.6,
"grad_norm": 3.4486107658990846,
"learning_rate": 1.8155165781889887e-06,
"loss": 0.8671,
"step": 1460
},
{
"epoch": 0.6,
"grad_norm": 2.555872974908906,
"learning_rate": 1.815268461605224e-06,
"loss": 0.8829,
"step": 1461
},
{
"epoch": 0.6,
"grad_norm": 3.478266417624919,
"learning_rate": 1.8150201952626313e-06,
"loss": 0.8854,
"step": 1462
},
{
"epoch": 0.6,
"grad_norm": 3.1611560779437804,
"learning_rate": 1.8147717792068162e-06,
"loss": 0.9958,
"step": 1463
},
{
"epoch": 0.6,
"grad_norm": 2.783297368976785,
"learning_rate": 1.8145232134834104e-06,
"loss": 0.9152,
"step": 1464
},
{
"epoch": 0.6,
"grad_norm": 3.7993730598304287,
"learning_rate": 1.8142744981380731e-06,
"loss": 0.8333,
"step": 1465
},
{
"epoch": 0.6,
"grad_norm": 3.3517541042066656,
"learning_rate": 1.8140256332164926e-06,
"loss": 0.9168,
"step": 1466
},
{
"epoch": 0.6,
"grad_norm": 2.683106831863197,
"learning_rate": 1.8137766187643824e-06,
"loss": 0.8893,
"step": 1467
},
{
"epoch": 0.6,
"grad_norm": 4.183411391510586,
"learning_rate": 1.8135274548274853e-06,
"loss": 0.9066,
"step": 1468
},
{
"epoch": 0.6,
"grad_norm": 2.8710754701214585,
"learning_rate": 1.8132781414515707e-06,
"loss": 0.8629,
"step": 1469
},
{
"epoch": 0.6,
"grad_norm": 3.9689487989355268,
"learning_rate": 1.8130286786824355e-06,
"loss": 1.0118,
"step": 1470
},
{
"epoch": 0.6,
"grad_norm": 2.8069408599645143,
"learning_rate": 1.812779066565905e-06,
"loss": 0.9614,
"step": 1471
},
{
"epoch": 0.6,
"grad_norm": 3.017530636327195,
"learning_rate": 1.8125293051478298e-06,
"loss": 0.8946,
"step": 1472
},
{
"epoch": 0.6,
"grad_norm": 3.44179552355399,
"learning_rate": 1.8122793944740903e-06,
"loss": 0.9055,
"step": 1473
},
{
"epoch": 0.6,
"grad_norm": 3.390583643849616,
"learning_rate": 1.8120293345905925e-06,
"loss": 0.8936,
"step": 1474
},
{
"epoch": 0.6,
"grad_norm": 2.768118705887388,
"learning_rate": 1.8117791255432712e-06,
"loss": 0.9031,
"step": 1475
},
{
"epoch": 0.6,
"grad_norm": 2.716818393076782,
"learning_rate": 1.8115287673780876e-06,
"loss": 0.9161,
"step": 1476
},
{
"epoch": 0.6,
"grad_norm": 2.732930789053605,
"learning_rate": 1.8112782601410306e-06,
"loss": 0.881,
"step": 1477
},
{
"epoch": 0.6,
"grad_norm": 2.932878811245221,
"learning_rate": 1.811027603878117e-06,
"loss": 0.8948,
"step": 1478
},
{
"epoch": 0.6,
"grad_norm": 3.0400177124309717,
"learning_rate": 1.81077679863539e-06,
"loss": 0.8961,
"step": 1479
},
{
"epoch": 0.6,
"grad_norm": 3.360057806645818,
"learning_rate": 1.8105258444589206e-06,
"loss": 0.9015,
"step": 1480
},
{
"epoch": 0.6,
"grad_norm": 3.3267952701578847,
"learning_rate": 1.8102747413948078e-06,
"loss": 0.9142,
"step": 1481
},
{
"epoch": 0.6,
"grad_norm": 3.8090914155397293,
"learning_rate": 1.810023489489177e-06,
"loss": 0.8315,
"step": 1482
},
{
"epoch": 0.61,
"grad_norm": 2.8886177752181497,
"learning_rate": 1.8097720887881813e-06,
"loss": 0.8704,
"step": 1483
},
{
"epoch": 0.61,
"grad_norm": 3.0001158457887054,
"learning_rate": 1.8095205393380016e-06,
"loss": 0.8928,
"step": 1484
},
{
"epoch": 0.61,
"grad_norm": 3.088979723192518,
"learning_rate": 1.8092688411848452e-06,
"loss": 0.9637,
"step": 1485
},
{
"epoch": 0.61,
"grad_norm": 2.741687377302349,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.8793,
"step": 1486
},
{
"epoch": 0.61,
"grad_norm": 3.230479240861982,
"learning_rate": 1.8087649989545706e-06,
"loss": 0.9626,
"step": 1487
},
{
"epoch": 0.61,
"grad_norm": 2.581761921148654,
"learning_rate": 1.8085128549700043e-06,
"loss": 0.9505,
"step": 1488
},
{
"epoch": 0.61,
"grad_norm": 2.935505920444266,
"learning_rate": 1.808260562467566e-06,
"loss": 0.9394,
"step": 1489
},
{
"epoch": 0.61,
"grad_norm": 2.6534696373020172,
"learning_rate": 1.8080081214935996e-06,
"loss": 0.8595,
"step": 1490
},
{
"epoch": 0.61,
"grad_norm": 2.733602994106435,
"learning_rate": 1.8077555320944767e-06,
"loss": 0.8948,
"step": 1491
},
{
"epoch": 0.61,
"grad_norm": 3.0423113706931773,
"learning_rate": 1.807502794316596e-06,
"loss": 0.925,
"step": 1492
},
{
"epoch": 0.61,
"grad_norm": 2.4450592550819876,
"learning_rate": 1.8072499082063836e-06,
"loss": 0.8905,
"step": 1493
},
{
"epoch": 0.61,
"grad_norm": 3.284302242530637,
"learning_rate": 1.8069968738102931e-06,
"loss": 0.8967,
"step": 1494
},
{
"epoch": 0.61,
"grad_norm": 2.2785596730406312,
"learning_rate": 1.8067436911748051e-06,
"loss": 0.9588,
"step": 1495
},
{
"epoch": 0.61,
"grad_norm": 2.7686761491744534,
"learning_rate": 1.806490360346427e-06,
"loss": 0.889,
"step": 1496
},
{
"epoch": 0.61,
"grad_norm": 2.7715803473477543,
"learning_rate": 1.806236881371694e-06,
"loss": 0.9402,
"step": 1497
},
{
"epoch": 0.61,
"grad_norm": 3.511361971935122,
"learning_rate": 1.8059832542971681e-06,
"loss": 0.9072,
"step": 1498
},
{
"epoch": 0.61,
"grad_norm": 3.961383046090191,
"learning_rate": 1.8057294791694392e-06,
"loss": 0.9813,
"step": 1499
},
{
"epoch": 0.61,
"grad_norm": 2.6132938993135317,
"learning_rate": 1.8054755560351239e-06,
"loss": 0.9174,
"step": 1500
},
{
"epoch": 0.61,
"eval_loss": 0.903252899646759,
"eval_runtime": 465.7861,
"eval_samples_per_second": 74.824,
"eval_steps_per_second": 4.678,
"step": 1500
},
{
"epoch": 0.61,
"grad_norm": 2.4341894615894013,
"learning_rate": 1.8052214849408652e-06,
"loss": 0.9472,
"step": 1501
},
{
"epoch": 0.61,
"grad_norm": 2.6783649564213454,
"learning_rate": 1.8049672659333353e-06,
"loss": 0.8596,
"step": 1502
},
{
"epoch": 0.61,
"grad_norm": 2.6867685587156678,
"learning_rate": 1.8047128990592313e-06,
"loss": 0.9097,
"step": 1503
},
{
"epoch": 0.61,
"grad_norm": 2.761663288097778,
"learning_rate": 1.8044583843652794e-06,
"loss": 0.8911,
"step": 1504
},
{
"epoch": 0.61,
"grad_norm": 2.974021052851676,
"learning_rate": 1.804203721898231e-06,
"loss": 0.9279,
"step": 1505
},
{
"epoch": 0.61,
"grad_norm": 2.967996642184887,
"learning_rate": 1.8039489117048672e-06,
"loss": 0.9321,
"step": 1506
},
{
"epoch": 0.61,
"grad_norm": 3.0379418533746807,
"learning_rate": 1.8036939538319932e-06,
"loss": 0.9399,
"step": 1507
},
{
"epoch": 0.62,
"grad_norm": 2.696886901044943,
"learning_rate": 1.8034388483264442e-06,
"loss": 0.9356,
"step": 1508
},
{
"epoch": 0.62,
"grad_norm": 3.2033023318906046,
"learning_rate": 1.8031835952350805e-06,
"loss": 0.9227,
"step": 1509
},
{
"epoch": 0.62,
"grad_norm": 2.8343126640061254,
"learning_rate": 1.8029281946047901e-06,
"loss": 0.8441,
"step": 1510
},
{
"epoch": 0.62,
"grad_norm": 3.23568368258336,
"learning_rate": 1.8026726464824886e-06,
"loss": 0.8317,
"step": 1511
},
{
"epoch": 0.62,
"grad_norm": 2.9335814647557417,
"learning_rate": 1.802416950915118e-06,
"loss": 0.9003,
"step": 1512
},
{
"epoch": 0.62,
"grad_norm": 2.7230830586608126,
"learning_rate": 1.8021611079496484e-06,
"loss": 0.8936,
"step": 1513
},
{
"epoch": 0.62,
"grad_norm": 2.7338471528426496,
"learning_rate": 1.8019051176330753e-06,
"loss": 0.9099,
"step": 1514
},
{
"epoch": 0.62,
"grad_norm": 3.039434701174705,
"learning_rate": 1.8016489800124228e-06,
"loss": 0.8442,
"step": 1515
},
{
"epoch": 0.62,
"grad_norm": 3.1777867761467453,
"learning_rate": 1.8013926951347417e-06,
"loss": 0.8519,
"step": 1516
},
{
"epoch": 0.62,
"grad_norm": 3.009418233104468,
"learning_rate": 1.801136263047109e-06,
"loss": 0.8773,
"step": 1517
},
{
"epoch": 0.62,
"grad_norm": 2.845452588441992,
"learning_rate": 1.8008796837966297e-06,
"loss": 0.8979,
"step": 1518
},
{
"epoch": 0.62,
"grad_norm": 2.7504664047805485,
"learning_rate": 1.800622957430436e-06,
"loss": 0.9115,
"step": 1519
},
{
"epoch": 0.62,
"grad_norm": 3.0941610908742967,
"learning_rate": 1.8003660839956858e-06,
"loss": 0.8261,
"step": 1520
},
{
"epoch": 0.62,
"grad_norm": 2.7186266367949643,
"learning_rate": 1.8001090635395656e-06,
"loss": 0.917,
"step": 1521
},
{
"epoch": 0.62,
"grad_norm": 8.35051197262363,
"learning_rate": 1.7998518961092875e-06,
"loss": 0.8752,
"step": 1522
},
{
"epoch": 0.62,
"grad_norm": 3.595362596854687,
"learning_rate": 1.7995945817520916e-06,
"loss": 0.9051,
"step": 1523
},
{
"epoch": 0.62,
"grad_norm": 2.6097484733497245,
"learning_rate": 1.7993371205152447e-06,
"loss": 0.9378,
"step": 1524
},
{
"epoch": 0.62,
"grad_norm": 3.2266531708552875,
"learning_rate": 1.7990795124460406e-06,
"loss": 0.9104,
"step": 1525
},
{
"epoch": 0.62,
"grad_norm": 3.0455020732141116,
"learning_rate": 1.7988217575918002e-06,
"loss": 0.8995,
"step": 1526
},
{
"epoch": 0.62,
"grad_norm": 2.5856868998207294,
"learning_rate": 1.7985638559998706e-06,
"loss": 0.8767,
"step": 1527
},
{
"epoch": 0.62,
"grad_norm": 2.643955912023456,
"learning_rate": 1.7983058077176264e-06,
"loss": 0.8432,
"step": 1528
},
{
"epoch": 0.62,
"grad_norm": 4.40701632289689,
"learning_rate": 1.79804761279247e-06,
"loss": 0.9136,
"step": 1529
},
{
"epoch": 0.62,
"grad_norm": 2.578140805613738,
"learning_rate": 1.7977892712718293e-06,
"loss": 0.946,
"step": 1530
},
{
"epoch": 0.62,
"grad_norm": 2.997828395668107,
"learning_rate": 1.7975307832031597e-06,
"loss": 0.8705,
"step": 1531
},
{
"epoch": 0.63,
"grad_norm": 3.0229452192315267,
"learning_rate": 1.7972721486339437e-06,
"loss": 0.9122,
"step": 1532
},
{
"epoch": 0.63,
"grad_norm": 2.71001446855671,
"learning_rate": 1.7970133676116908e-06,
"loss": 0.9196,
"step": 1533
},
{
"epoch": 0.63,
"grad_norm": 3.5437571376403283,
"learning_rate": 1.7967544401839368e-06,
"loss": 0.9107,
"step": 1534
},
{
"epoch": 0.63,
"grad_norm": 3.2586370703830645,
"learning_rate": 1.7964953663982448e-06,
"loss": 0.8696,
"step": 1535
},
{
"epoch": 0.63,
"grad_norm": 3.18574904092518,
"learning_rate": 1.796236146302205e-06,
"loss": 0.886,
"step": 1536
},
{
"epoch": 0.63,
"grad_norm": 2.967598119626474,
"learning_rate": 1.795976779943434e-06,
"loss": 0.9393,
"step": 1537
},
{
"epoch": 0.63,
"grad_norm": 3.029178554989078,
"learning_rate": 1.7957172673695756e-06,
"loss": 0.8685,
"step": 1538
},
{
"epoch": 0.63,
"grad_norm": 3.332907240389432,
"learning_rate": 1.7954576086283004e-06,
"loss": 0.9695,
"step": 1539
},
{
"epoch": 0.63,
"grad_norm": 3.1484240359203146,
"learning_rate": 1.795197803767306e-06,
"loss": 0.898,
"step": 1540
},
{
"epoch": 0.63,
"grad_norm": 3.312579541717166,
"learning_rate": 1.794937852834316e-06,
"loss": 0.8428,
"step": 1541
},
{
"epoch": 0.63,
"grad_norm": 3.137791759151072,
"learning_rate": 1.794677755877082e-06,
"loss": 0.9395,
"step": 1542
},
{
"epoch": 0.63,
"grad_norm": 2.9880946884973567,
"learning_rate": 1.7944175129433816e-06,
"loss": 0.897,
"step": 1543
},
{
"epoch": 0.63,
"grad_norm": 2.607184345698004,
"learning_rate": 1.7941571240810198e-06,
"loss": 0.8845,
"step": 1544
},
{
"epoch": 0.63,
"grad_norm": 3.153756571401806,
"learning_rate": 1.793896589337828e-06,
"loss": 0.9133,
"step": 1545
},
{
"epoch": 0.63,
"grad_norm": 2.614153771541694,
"learning_rate": 1.7936359087616646e-06,
"loss": 0.9282,
"step": 1546
},
{
"epoch": 0.63,
"grad_norm": 3.22512452158539,
"learning_rate": 1.7933750824004144e-06,
"loss": 0.804,
"step": 1547
},
{
"epoch": 0.63,
"grad_norm": 2.6184619995388227,
"learning_rate": 1.7931141103019899e-06,
"loss": 0.8567,
"step": 1548
},
{
"epoch": 0.63,
"grad_norm": 3.033328039915391,
"learning_rate": 1.7928529925143287e-06,
"loss": 0.8076,
"step": 1549
},
{
"epoch": 0.63,
"grad_norm": 2.793284640528153,
"learning_rate": 1.7925917290853976e-06,
"loss": 0.9532,
"step": 1550
},
{
"epoch": 0.63,
"grad_norm": 2.525841191495341,
"learning_rate": 1.7923303200631877e-06,
"loss": 0.9382,
"step": 1551
},
{
"epoch": 0.63,
"grad_norm": 3.87781365536885,
"learning_rate": 1.7920687654957182e-06,
"loss": 0.9579,
"step": 1552
},
{
"epoch": 0.63,
"grad_norm": 3.514896782697138,
"learning_rate": 1.791807065431035e-06,
"loss": 0.8536,
"step": 1553
},
{
"epoch": 0.63,
"grad_norm": 3.540731750951074,
"learning_rate": 1.7915452199172102e-06,
"loss": 0.8824,
"step": 1554
},
{
"epoch": 0.63,
"grad_norm": 2.6767934914372282,
"learning_rate": 1.7912832290023433e-06,
"loss": 0.9147,
"step": 1555
},
{
"epoch": 0.63,
"grad_norm": 2.555542478853647,
"learning_rate": 1.7910210927345596e-06,
"loss": 0.9185,
"step": 1556
},
{
"epoch": 0.64,
"grad_norm": 2.927227970224104,
"learning_rate": 1.790758811162012e-06,
"loss": 0.8646,
"step": 1557
},
{
"epoch": 0.64,
"grad_norm": 2.8854073386423265,
"learning_rate": 1.7904963843328794e-06,
"loss": 0.9038,
"step": 1558
},
{
"epoch": 0.64,
"grad_norm": 3.669290589547803,
"learning_rate": 1.7902338122953675e-06,
"loss": 0.8811,
"step": 1559
},
{
"epoch": 0.64,
"grad_norm": 3.054469843214573,
"learning_rate": 1.7899710950977096e-06,
"loss": 0.9125,
"step": 1560
},
{
"epoch": 0.64,
"grad_norm": 2.4187170646943854,
"learning_rate": 1.7897082327881644e-06,
"loss": 0.9391,
"step": 1561
},
{
"epoch": 0.64,
"grad_norm": 3.2460994864913397,
"learning_rate": 1.7894452254150174e-06,
"loss": 0.8417,
"step": 1562
},
{
"epoch": 0.64,
"grad_norm": 2.9286072603520616,
"learning_rate": 1.789182073026582e-06,
"loss": 0.9117,
"step": 1563
},
{
"epoch": 0.64,
"grad_norm": 2.6601897276013755,
"learning_rate": 1.7889187756711967e-06,
"loss": 0.9117,
"step": 1564
},
{
"epoch": 0.64,
"grad_norm": 3.2546350225751857,
"learning_rate": 1.788655333397228e-06,
"loss": 0.8702,
"step": 1565
},
{
"epoch": 0.64,
"grad_norm": 3.3591927279580274,
"learning_rate": 1.7883917462530676e-06,
"loss": 0.8942,
"step": 1566
},
{
"epoch": 0.64,
"grad_norm": 3.221424760994017,
"learning_rate": 1.7881280142871345e-06,
"loss": 0.8671,
"step": 1567
},
{
"epoch": 0.64,
"grad_norm": 2.8885295033370753,
"learning_rate": 1.7878641375478748e-06,
"loss": 0.8528,
"step": 1568
},
{
"epoch": 0.64,
"grad_norm": 2.8049610862883516,
"learning_rate": 1.7876001160837604e-06,
"loss": 0.9083,
"step": 1569
},
{
"epoch": 0.64,
"grad_norm": 3.2728331107806503,
"learning_rate": 1.7873359499432903e-06,
"loss": 0.9044,
"step": 1570
},
{
"epoch": 0.64,
"grad_norm": 3.0881108180116135,
"learning_rate": 1.7870716391749895e-06,
"loss": 0.9672,
"step": 1571
},
{
"epoch": 0.64,
"grad_norm": 2.5264178302964484,
"learning_rate": 1.7868071838274107e-06,
"loss": 0.9187,
"step": 1572
},
{
"epoch": 0.64,
"grad_norm": 3.076788070980852,
"learning_rate": 1.7865425839491315e-06,
"loss": 0.8734,
"step": 1573
},
{
"epoch": 0.64,
"grad_norm": 2.9859220615574595,
"learning_rate": 1.7862778395887577e-06,
"loss": 0.9084,
"step": 1574
},
{
"epoch": 0.64,
"grad_norm": 2.64693132053535,
"learning_rate": 1.7860129507949204e-06,
"loss": 0.8922,
"step": 1575
},
{
"epoch": 0.64,
"grad_norm": 3.15500371793696,
"learning_rate": 1.7857479176162776e-06,
"loss": 0.8713,
"step": 1576
},
{
"epoch": 0.64,
"grad_norm": 3.4285624013140046,
"learning_rate": 1.785482740101515e-06,
"loss": 0.8897,
"step": 1577
},
{
"epoch": 0.64,
"grad_norm": 3.63177412531693,
"learning_rate": 1.7852174182993424e-06,
"loss": 0.9243,
"step": 1578
},
{
"epoch": 0.64,
"grad_norm": 2.636716074110495,
"learning_rate": 1.7849519522584986e-06,
"loss": 0.9632,
"step": 1579
},
{
"epoch": 0.64,
"grad_norm": 2.313564460066587,
"learning_rate": 1.7846863420277467e-06,
"loss": 0.8451,
"step": 1580
},
{
"epoch": 0.65,
"grad_norm": 3.3274802142897713,
"learning_rate": 1.7844205876558784e-06,
"loss": 0.8291,
"step": 1581
},
{
"epoch": 0.65,
"grad_norm": 2.702070073547803,
"learning_rate": 1.7841546891917099e-06,
"loss": 0.9288,
"step": 1582
},
{
"epoch": 0.65,
"grad_norm": 2.6822670202527537,
"learning_rate": 1.7838886466840855e-06,
"loss": 0.945,
"step": 1583
},
{
"epoch": 0.65,
"grad_norm": 2.8503934601755834,
"learning_rate": 1.7836224601818748e-06,
"loss": 0.8789,
"step": 1584
},
{
"epoch": 0.65,
"grad_norm": 3.999473497761987,
"learning_rate": 1.7833561297339745e-06,
"loss": 0.8971,
"step": 1585
},
{
"epoch": 0.65,
"grad_norm": 2.9097012569997487,
"learning_rate": 1.7830896553893075e-06,
"loss": 0.8267,
"step": 1586
},
{
"epoch": 0.65,
"grad_norm": 2.4027931174306927,
"learning_rate": 1.7828230371968233e-06,
"loss": 0.8875,
"step": 1587
},
{
"epoch": 0.65,
"grad_norm": 2.4651921547641744,
"learning_rate": 1.7825562752054974e-06,
"loss": 0.9008,
"step": 1588
},
{
"epoch": 0.65,
"grad_norm": 2.8766079280743098,
"learning_rate": 1.7822893694643324e-06,
"loss": 0.9171,
"step": 1589
},
{
"epoch": 0.65,
"grad_norm": 3.262045719083602,
"learning_rate": 1.7820223200223567e-06,
"loss": 0.884,
"step": 1590
},
{
"epoch": 0.65,
"grad_norm": 3.815245673388042,
"learning_rate": 1.7817551269286252e-06,
"loss": 0.8994,
"step": 1591
},
{
"epoch": 0.65,
"grad_norm": 3.224404180580554,
"learning_rate": 1.7814877902322198e-06,
"loss": 0.9835,
"step": 1592
},
{
"epoch": 0.65,
"grad_norm": 3.097229159685121,
"learning_rate": 1.781220309982248e-06,
"loss": 0.9554,
"step": 1593
},
{
"epoch": 0.65,
"grad_norm": 2.6353313503594458,
"learning_rate": 1.7809526862278437e-06,
"loss": 0.9077,
"step": 1594
},
{
"epoch": 0.65,
"grad_norm": 3.046255183526184,
"learning_rate": 1.7806849190181676e-06,
"loss": 0.8877,
"step": 1595
},
{
"epoch": 0.65,
"grad_norm": 2.6264447863840625,
"learning_rate": 1.7804170084024065e-06,
"loss": 0.9215,
"step": 1596
},
{
"epoch": 0.65,
"grad_norm": 3.319144214549714,
"learning_rate": 1.7801489544297744e-06,
"loss": 0.891,
"step": 1597
},
{
"epoch": 0.65,
"grad_norm": 2.689831787655044,
"learning_rate": 1.7798807571495094e-06,
"loss": 0.9084,
"step": 1598
},
{
"epoch": 0.65,
"grad_norm": 2.907085427751333,
"learning_rate": 1.7796124166108783e-06,
"loss": 0.8836,
"step": 1599
},
{
"epoch": 0.65,
"grad_norm": 2.810484360518369,
"learning_rate": 1.7793439328631731e-06,
"loss": 0.8984,
"step": 1600
},
{
"epoch": 0.65,
"eval_loss": 0.9006547927856445,
"eval_runtime": 465.5099,
"eval_samples_per_second": 74.868,
"eval_steps_per_second": 4.681,
"step": 1600
},
{
"epoch": 0.65,
"grad_norm": 3.151115069785489,
"learning_rate": 1.7790753059557127e-06,
"loss": 0.8908,
"step": 1601
},
{
"epoch": 0.65,
"grad_norm": 2.9044014545605927,
"learning_rate": 1.778806535937841e-06,
"loss": 0.9091,
"step": 1602
},
{
"epoch": 0.65,
"grad_norm": 2.742465318724604,
"learning_rate": 1.77853762285893e-06,
"loss": 0.8845,
"step": 1603
},
{
"epoch": 0.65,
"grad_norm": 3.0682642189079856,
"learning_rate": 1.7782685667683763e-06,
"loss": 0.875,
"step": 1604
},
{
"epoch": 0.65,
"grad_norm": 2.609650875545538,
"learning_rate": 1.777999367715604e-06,
"loss": 0.908,
"step": 1605
},
{
"epoch": 0.66,
"grad_norm": 4.45152856999714,
"learning_rate": 1.7777300257500628e-06,
"loss": 0.9253,
"step": 1606
},
{
"epoch": 0.66,
"grad_norm": 4.939572987965063,
"learning_rate": 1.7774605409212292e-06,
"loss": 0.8836,
"step": 1607
},
{
"epoch": 0.66,
"grad_norm": 2.3277594447636933,
"learning_rate": 1.7771909132786049e-06,
"loss": 0.9166,
"step": 1608
},
{
"epoch": 0.66,
"grad_norm": 3.733596850195229,
"learning_rate": 1.776921142871719e-06,
"loss": 0.8213,
"step": 1609
},
{
"epoch": 0.66,
"grad_norm": 3.2994299013016772,
"learning_rate": 1.7766512297501263e-06,
"loss": 0.9004,
"step": 1610
},
{
"epoch": 0.66,
"grad_norm": 3.1394904680739404,
"learning_rate": 1.776381173963408e-06,
"loss": 0.8793,
"step": 1611
},
{
"epoch": 0.66,
"grad_norm": 3.9793994333020475,
"learning_rate": 1.7761109755611705e-06,
"loss": 0.9137,
"step": 1612
},
{
"epoch": 0.66,
"grad_norm": 3.3998710233834966,
"learning_rate": 1.7758406345930482e-06,
"loss": 0.9057,
"step": 1613
},
{
"epoch": 0.66,
"grad_norm": 2.9883461059975183,
"learning_rate": 1.7755701511087008e-06,
"loss": 0.9853,
"step": 1614
},
{
"epoch": 0.66,
"grad_norm": 2.335029918377393,
"learning_rate": 1.7752995251578137e-06,
"loss": 0.8736,
"step": 1615
},
{
"epoch": 0.66,
"grad_norm": 2.8776915220170185,
"learning_rate": 1.7750287567900989e-06,
"loss": 0.9296,
"step": 1616
},
{
"epoch": 0.66,
"grad_norm": 3.098451395038211,
"learning_rate": 1.7747578460552946e-06,
"loss": 0.9823,
"step": 1617
},
{
"epoch": 0.66,
"grad_norm": 3.034721616634397,
"learning_rate": 1.7744867930031653e-06,
"loss": 0.8951,
"step": 1618
},
{
"epoch": 0.66,
"grad_norm": 3.2105954418081146,
"learning_rate": 1.7742155976835012e-06,
"loss": 0.9032,
"step": 1619
},
{
"epoch": 0.66,
"grad_norm": 2.737982767981556,
"learning_rate": 1.773944260146119e-06,
"loss": 0.9295,
"step": 1620
},
{
"epoch": 0.66,
"grad_norm": 2.8161337108705946,
"learning_rate": 1.7736727804408616e-06,
"loss": 0.8588,
"step": 1621
},
{
"epoch": 0.66,
"grad_norm": 3.0206025796564786,
"learning_rate": 1.7734011586175973e-06,
"loss": 0.9445,
"step": 1622
},
{
"epoch": 0.66,
"grad_norm": 2.603574779992507,
"learning_rate": 1.7731293947262218e-06,
"loss": 0.948,
"step": 1623
},
{
"epoch": 0.66,
"grad_norm": 2.432754107296118,
"learning_rate": 1.7728574888166555e-06,
"loss": 0.8899,
"step": 1624
},
{
"epoch": 0.66,
"grad_norm": 2.8168868738039317,
"learning_rate": 1.772585440938846e-06,
"loss": 0.9413,
"step": 1625
},
{
"epoch": 0.66,
"grad_norm": 3.081309356939286,
"learning_rate": 1.7723132511427658e-06,
"loss": 0.8604,
"step": 1626
},
{
"epoch": 0.66,
"grad_norm": 4.305630349149536,
"learning_rate": 1.772040919478415e-06,
"loss": 0.9238,
"step": 1627
},
{
"epoch": 0.66,
"grad_norm": 2.9678787464932235,
"learning_rate": 1.7717684459958182e-06,
"loss": 0.9835,
"step": 1628
},
{
"epoch": 0.66,
"grad_norm": 2.6647685896892557,
"learning_rate": 1.7714958307450271e-06,
"loss": 0.9281,
"step": 1629
},
{
"epoch": 0.67,
"grad_norm": 3.0010546874970547,
"learning_rate": 1.7712230737761194e-06,
"loss": 0.9145,
"step": 1630
},
{
"epoch": 0.67,
"grad_norm": 2.8336428583806526,
"learning_rate": 1.7709501751391983e-06,
"loss": 0.9168,
"step": 1631
},
{
"epoch": 0.67,
"grad_norm": 2.7556334077247286,
"learning_rate": 1.770677134884393e-06,
"loss": 0.8541,
"step": 1632
},
{
"epoch": 0.67,
"grad_norm": 2.999521755648439,
"learning_rate": 1.7704039530618593e-06,
"loss": 0.8922,
"step": 1633
},
{
"epoch": 0.67,
"grad_norm": 3.1892267210392764,
"learning_rate": 1.7701306297217786e-06,
"loss": 0.9659,
"step": 1634
},
{
"epoch": 0.67,
"grad_norm": 2.68668062381237,
"learning_rate": 1.7698571649143585e-06,
"loss": 0.8759,
"step": 1635
},
{
"epoch": 0.67,
"grad_norm": 3.1199750052750845,
"learning_rate": 1.7695835586898324e-06,
"loss": 0.8607,
"step": 1636
},
{
"epoch": 0.67,
"grad_norm": 2.9537574031923635,
"learning_rate": 1.7693098110984596e-06,
"loss": 0.8927,
"step": 1637
},
{
"epoch": 0.67,
"grad_norm": 2.688433592161539,
"learning_rate": 1.7690359221905259e-06,
"loss": 0.9168,
"step": 1638
},
{
"epoch": 0.67,
"grad_norm": 2.868306996906755,
"learning_rate": 1.7687618920163424e-06,
"loss": 0.899,
"step": 1639
},
{
"epoch": 0.67,
"grad_norm": 2.574271250327665,
"learning_rate": 1.7684877206262462e-06,
"loss": 0.9297,
"step": 1640
},
{
"epoch": 0.67,
"grad_norm": 2.9522888131867875,
"learning_rate": 1.7682134080706013e-06,
"loss": 0.8831,
"step": 1641
},
{
"epoch": 0.67,
"grad_norm": 3.8299395479651164,
"learning_rate": 1.7679389543997963e-06,
"loss": 0.895,
"step": 1642
},
{
"epoch": 0.67,
"grad_norm": 3.1366530842172073,
"learning_rate": 1.7676643596642462e-06,
"loss": 0.9568,
"step": 1643
},
{
"epoch": 0.67,
"grad_norm": 2.9519683272467927,
"learning_rate": 1.7673896239143926e-06,
"loss": 0.9066,
"step": 1644
},
{
"epoch": 0.67,
"grad_norm": 2.6211598437542634,
"learning_rate": 1.767114747200702e-06,
"loss": 0.9225,
"step": 1645
},
{
"epoch": 0.67,
"grad_norm": 2.8974441531716155,
"learning_rate": 1.7668397295736677e-06,
"loss": 0.8211,
"step": 1646
},
{
"epoch": 0.67,
"grad_norm": 2.6511583275352892,
"learning_rate": 1.7665645710838074e-06,
"loss": 0.888,
"step": 1647
},
{
"epoch": 0.67,
"grad_norm": 4.014269716800578,
"learning_rate": 1.766289271781667e-06,
"loss": 0.8079,
"step": 1648
},
{
"epoch": 0.67,
"grad_norm": 3.1007847293488386,
"learning_rate": 1.7660138317178164e-06,
"loss": 0.9624,
"step": 1649
},
{
"epoch": 0.67,
"grad_norm": 3.982999022350478,
"learning_rate": 1.7657382509428516e-06,
"loss": 0.8596,
"step": 1650
},
{
"epoch": 0.67,
"grad_norm": 3.262774554581949,
"learning_rate": 1.765462529507395e-06,
"loss": 0.9252,
"step": 1651
},
{
"epoch": 0.67,
"grad_norm": 3.1944155886748273,
"learning_rate": 1.765186667462095e-06,
"loss": 0.8014,
"step": 1652
},
{
"epoch": 0.67,
"grad_norm": 2.8578913033988904,
"learning_rate": 1.7649106648576248e-06,
"loss": 0.8793,
"step": 1653
},
{
"epoch": 0.67,
"grad_norm": 2.566521412634323,
"learning_rate": 1.7646345217446842e-06,
"loss": 0.9064,
"step": 1654
},
{
"epoch": 0.68,
"grad_norm": 2.920742754102452,
"learning_rate": 1.7643582381739991e-06,
"loss": 0.9404,
"step": 1655
},
{
"epoch": 0.68,
"grad_norm": 4.670951551425196,
"learning_rate": 1.7640818141963203e-06,
"loss": 0.8845,
"step": 1656
},
{
"epoch": 0.68,
"grad_norm": 2.9792596820547645,
"learning_rate": 1.763805249862425e-06,
"loss": 0.9486,
"step": 1657
},
{
"epoch": 0.68,
"grad_norm": 2.5203814941958336,
"learning_rate": 1.763528545223116e-06,
"loss": 1.0399,
"step": 1658
},
{
"epoch": 0.68,
"grad_norm": 2.7890129721740813,
"learning_rate": 1.763251700329222e-06,
"loss": 0.8265,
"step": 1659
},
{
"epoch": 0.68,
"grad_norm": 2.6882430228118213,
"learning_rate": 1.7629747152315972e-06,
"loss": 0.9377,
"step": 1660
},
{
"epoch": 0.68,
"grad_norm": 2.6666327467471174,
"learning_rate": 1.7626975899811218e-06,
"loss": 0.9151,
"step": 1661
},
{
"epoch": 0.68,
"grad_norm": 2.439520733739567,
"learning_rate": 1.7624203246287022e-06,
"loss": 0.8969,
"step": 1662
},
{
"epoch": 0.68,
"grad_norm": 2.7224041281736273,
"learning_rate": 1.762142919225269e-06,
"loss": 0.9045,
"step": 1663
},
{
"epoch": 0.68,
"grad_norm": 2.850943974026024,
"learning_rate": 1.7618653738217804e-06,
"loss": 1.0064,
"step": 1664
},
{
"epoch": 0.68,
"grad_norm": 2.2166083117436077,
"learning_rate": 1.761587688469219e-06,
"loss": 0.9078,
"step": 1665
},
{
"epoch": 0.68,
"grad_norm": 3.1063329935728987,
"learning_rate": 1.7613098632185934e-06,
"loss": 0.9163,
"step": 1666
},
{
"epoch": 0.68,
"grad_norm": 2.7162430141303133,
"learning_rate": 1.7610318981209386e-06,
"loss": 0.9365,
"step": 1667
},
{
"epoch": 0.68,
"grad_norm": 2.578324317533197,
"learning_rate": 1.7607537932273143e-06,
"loss": 0.9085,
"step": 1668
},
{
"epoch": 0.68,
"grad_norm": 3.2059796301699954,
"learning_rate": 1.7604755485888068e-06,
"loss": 0.9078,
"step": 1669
},
{
"epoch": 0.68,
"grad_norm": 2.468039481925014,
"learning_rate": 1.7601971642565273e-06,
"loss": 0.8941,
"step": 1670
},
{
"epoch": 0.68,
"grad_norm": 2.8917829923650817,
"learning_rate": 1.7599186402816126e-06,
"loss": 0.8915,
"step": 1671
},
{
"epoch": 0.68,
"grad_norm": 2.557998422086717,
"learning_rate": 1.7596399767152262e-06,
"loss": 0.8948,
"step": 1672
},
{
"epoch": 0.68,
"grad_norm": 2.567240894188107,
"learning_rate": 1.759361173608556e-06,
"loss": 0.9353,
"step": 1673
},
{
"epoch": 0.68,
"grad_norm": 2.7785611558935326,
"learning_rate": 1.7590822310128163e-06,
"loss": 0.8185,
"step": 1674
},
{
"epoch": 0.68,
"grad_norm": 2.472209158421066,
"learning_rate": 1.758803148979247e-06,
"loss": 0.9646,
"step": 1675
},
{
"epoch": 0.68,
"grad_norm": 2.9005164167606985,
"learning_rate": 1.758523927559113e-06,
"loss": 0.9492,
"step": 1676
},
{
"epoch": 0.68,
"grad_norm": 2.7675805379248084,
"learning_rate": 1.7582445668037056e-06,
"loss": 0.8723,
"step": 1677
},
{
"epoch": 0.68,
"grad_norm": 2.8242312122357713,
"learning_rate": 1.757965066764341e-06,
"loss": 0.8816,
"step": 1678
},
{
"epoch": 0.69,
"grad_norm": 2.783274610678779,
"learning_rate": 1.7576854274923617e-06,
"loss": 0.9115,
"step": 1679
},
{
"epoch": 0.69,
"grad_norm": 3.4063388816067866,
"learning_rate": 1.7574056490391352e-06,
"loss": 0.8492,
"step": 1680
},
{
"epoch": 0.69,
"grad_norm": 2.793766995970068,
"learning_rate": 1.7571257314560545e-06,
"loss": 0.9898,
"step": 1681
},
{
"epoch": 0.69,
"grad_norm": 2.95801564632073,
"learning_rate": 1.7568456747945387e-06,
"loss": 0.8176,
"step": 1682
},
{
"epoch": 0.69,
"grad_norm": 2.7693187490312487,
"learning_rate": 1.7565654791060319e-06,
"loss": 0.9372,
"step": 1683
},
{
"epoch": 0.69,
"grad_norm": 3.4690783059990493,
"learning_rate": 1.7562851444420042e-06,
"loss": 0.9648,
"step": 1684
},
{
"epoch": 0.69,
"grad_norm": 2.5085051558283147,
"learning_rate": 1.7560046708539509e-06,
"loss": 0.9332,
"step": 1685
},
{
"epoch": 0.69,
"grad_norm": 2.511422240692237,
"learning_rate": 1.7557240583933932e-06,
"loss": 0.8623,
"step": 1686
},
{
"epoch": 0.69,
"grad_norm": 2.93956810110292,
"learning_rate": 1.7554433071118772e-06,
"loss": 0.8335,
"step": 1687
},
{
"epoch": 0.69,
"grad_norm": 3.037089034226794,
"learning_rate": 1.755162417060975e-06,
"loss": 0.915,
"step": 1688
},
{
"epoch": 0.69,
"grad_norm": 3.1530116144916254,
"learning_rate": 1.7548813882922841e-06,
"loss": 0.9077,
"step": 1689
},
{
"epoch": 0.69,
"grad_norm": 3.1880417179092553,
"learning_rate": 1.7546002208574274e-06,
"loss": 0.9027,
"step": 1690
},
{
"epoch": 0.69,
"grad_norm": 2.847038561749161,
"learning_rate": 1.754318914808053e-06,
"loss": 0.8976,
"step": 1691
},
{
"epoch": 0.69,
"grad_norm": 2.9733721992761803,
"learning_rate": 1.7540374701958353e-06,
"loss": 0.8353,
"step": 1692
},
{
"epoch": 0.69,
"grad_norm": 2.9815730576359925,
"learning_rate": 1.7537558870724732e-06,
"loss": 0.8966,
"step": 1693
},
{
"epoch": 0.69,
"grad_norm": 2.7298313132563243,
"learning_rate": 1.7534741654896914e-06,
"loss": 0.9965,
"step": 1694
},
{
"epoch": 0.69,
"grad_norm": 2.888021642832356,
"learning_rate": 1.7531923054992403e-06,
"loss": 0.8942,
"step": 1695
},
{
"epoch": 0.69,
"grad_norm": 2.8629552088467407,
"learning_rate": 1.7529103071528956e-06,
"loss": 0.8576,
"step": 1696
},
{
"epoch": 0.69,
"grad_norm": 2.745128134718651,
"learning_rate": 1.7526281705024583e-06,
"loss": 0.9454,
"step": 1697
},
{
"epoch": 0.69,
"grad_norm": 3.2191476444465916,
"learning_rate": 1.7523458955997545e-06,
"loss": 1.0383,
"step": 1698
},
{
"epoch": 0.69,
"grad_norm": 2.6103486062255046,
"learning_rate": 1.7520634824966362e-06,
"loss": 0.8722,
"step": 1699
},
{
"epoch": 0.69,
"grad_norm": 2.4614275212416565,
"learning_rate": 1.7517809312449806e-06,
"loss": 0.8823,
"step": 1700
},
{
"epoch": 0.69,
"eval_loss": 0.8986980319023132,
"eval_runtime": 464.6727,
"eval_samples_per_second": 75.003,
"eval_steps_per_second": 4.689,
"step": 1700
},
{
"epoch": 0.69,
"grad_norm": 3.283518419484581,
"learning_rate": 1.7514982418966906e-06,
"loss": 0.8754,
"step": 1701
},
{
"epoch": 0.69,
"grad_norm": 2.901576117059834,
"learning_rate": 1.7512154145036937e-06,
"loss": 0.9224,
"step": 1702
},
{
"epoch": 0.69,
"grad_norm": 2.6886096502450716,
"learning_rate": 1.7509324491179435e-06,
"loss": 0.9214,
"step": 1703
},
{
"epoch": 0.7,
"grad_norm": 2.506506165705048,
"learning_rate": 1.7506493457914186e-06,
"loss": 0.975,
"step": 1704
},
{
"epoch": 0.7,
"grad_norm": 3.2285345295493078,
"learning_rate": 1.7503661045761229e-06,
"loss": 0.9055,
"step": 1705
},
{
"epoch": 0.7,
"grad_norm": 3.501056402207899,
"learning_rate": 1.7500827255240859e-06,
"loss": 0.9035,
"step": 1706
},
{
"epoch": 0.7,
"grad_norm": 2.982711790840549,
"learning_rate": 1.7497992086873618e-06,
"loss": 0.9639,
"step": 1707
},
{
"epoch": 0.7,
"grad_norm": 2.4820522523568305,
"learning_rate": 1.7495155541180313e-06,
"loss": 0.9462,
"step": 1708
},
{
"epoch": 0.7,
"grad_norm": 2.932150563238868,
"learning_rate": 1.749231761868199e-06,
"loss": 0.8955,
"step": 1709
},
{
"epoch": 0.7,
"grad_norm": 3.085583306518288,
"learning_rate": 1.7489478319899959e-06,
"loss": 0.8441,
"step": 1710
},
{
"epoch": 0.7,
"grad_norm": 3.062520481047023,
"learning_rate": 1.748663764535578e-06,
"loss": 0.9122,
"step": 1711
},
{
"epoch": 0.7,
"grad_norm": 2.949848626822578,
"learning_rate": 1.7483795595571253e-06,
"loss": 0.9154,
"step": 1712
},
{
"epoch": 0.7,
"grad_norm": 3.375488296715614,
"learning_rate": 1.7480952171068455e-06,
"loss": 0.9453,
"step": 1713
},
{
"epoch": 0.7,
"grad_norm": 3.588930119352415,
"learning_rate": 1.7478107372369694e-06,
"loss": 0.8601,
"step": 1714
},
{
"epoch": 0.7,
"grad_norm": 2.7636879488710235,
"learning_rate": 1.7475261199997542e-06,
"loss": 0.8943,
"step": 1715
},
{
"epoch": 0.7,
"grad_norm": 2.447710679386565,
"learning_rate": 1.747241365447482e-06,
"loss": 0.8641,
"step": 1716
},
{
"epoch": 0.7,
"grad_norm": 2.8848642979455095,
"learning_rate": 1.7469564736324597e-06,
"loss": 0.9175,
"step": 1717
},
{
"epoch": 0.7,
"grad_norm": 2.884689411116974,
"learning_rate": 1.7466714446070206e-06,
"loss": 0.9361,
"step": 1718
},
{
"epoch": 0.7,
"grad_norm": 2.315220978736344,
"learning_rate": 1.7463862784235217e-06,
"loss": 0.9185,
"step": 1719
},
{
"epoch": 0.7,
"grad_norm": 2.663896971691663,
"learning_rate": 1.7461009751343463e-06,
"loss": 0.8505,
"step": 1720
},
{
"epoch": 0.7,
"grad_norm": 2.865374555776061,
"learning_rate": 1.7458155347919026e-06,
"loss": 0.9713,
"step": 1721
},
{
"epoch": 0.7,
"grad_norm": 2.7450067752953267,
"learning_rate": 1.7455299574486238e-06,
"loss": 0.9081,
"step": 1722
},
{
"epoch": 0.7,
"grad_norm": 2.822642699698326,
"learning_rate": 1.7452442431569678e-06,
"loss": 0.863,
"step": 1723
},
{
"epoch": 0.7,
"grad_norm": 3.1461256412890912,
"learning_rate": 1.744958391969419e-06,
"loss": 0.8274,
"step": 1724
},
{
"epoch": 0.7,
"grad_norm": 2.681508517198968,
"learning_rate": 1.7446724039384862e-06,
"loss": 0.9141,
"step": 1725
},
{
"epoch": 0.7,
"grad_norm": 3.0195865128684085,
"learning_rate": 1.7443862791167028e-06,
"loss": 0.8959,
"step": 1726
},
{
"epoch": 0.7,
"grad_norm": 2.958172117791613,
"learning_rate": 1.7441000175566278e-06,
"loss": 0.854,
"step": 1727
},
{
"epoch": 0.71,
"grad_norm": 2.805941321506203,
"learning_rate": 1.7438136193108457e-06,
"loss": 0.9407,
"step": 1728
},
{
"epoch": 0.71,
"grad_norm": 2.3565187730728523,
"learning_rate": 1.7435270844319655e-06,
"loss": 0.9043,
"step": 1729
},
{
"epoch": 0.71,
"grad_norm": 2.898650852822176,
"learning_rate": 1.7432404129726218e-06,
"loss": 0.9112,
"step": 1730
},
{
"epoch": 0.71,
"grad_norm": 3.0199868415858173,
"learning_rate": 1.742953604985474e-06,
"loss": 0.8714,
"step": 1731
},
{
"epoch": 0.71,
"grad_norm": 3.348535093220099,
"learning_rate": 1.7426666605232066e-06,
"loss": 0.8606,
"step": 1732
},
{
"epoch": 0.71,
"grad_norm": 2.444451396179888,
"learning_rate": 1.7423795796385288e-06,
"loss": 0.8912,
"step": 1733
},
{
"epoch": 0.71,
"grad_norm": 3.5783323353039,
"learning_rate": 1.7420923623841759e-06,
"loss": 0.8404,
"step": 1734
},
{
"epoch": 0.71,
"grad_norm": 3.155821497433822,
"learning_rate": 1.741805008812907e-06,
"loss": 0.8441,
"step": 1735
},
{
"epoch": 0.71,
"grad_norm": 2.8657282078222464,
"learning_rate": 1.7415175189775071e-06,
"loss": 0.8811,
"step": 1736
},
{
"epoch": 0.71,
"grad_norm": 2.8318530493259444,
"learning_rate": 1.741229892930786e-06,
"loss": 0.8951,
"step": 1737
},
{
"epoch": 0.71,
"grad_norm": 2.6689074530261463,
"learning_rate": 1.7409421307255787e-06,
"loss": 0.8742,
"step": 1738
},
{
"epoch": 0.71,
"grad_norm": 2.9391865063461666,
"learning_rate": 1.7406542324147445e-06,
"loss": 0.9461,
"step": 1739
},
{
"epoch": 0.71,
"grad_norm": 2.447307615582581,
"learning_rate": 1.7403661980511685e-06,
"loss": 1.0011,
"step": 1740
},
{
"epoch": 0.71,
"grad_norm": 2.8834482201481366,
"learning_rate": 1.7400780276877608e-06,
"loss": 0.8856,
"step": 1741
},
{
"epoch": 0.71,
"grad_norm": 2.503698348238097,
"learning_rate": 1.7397897213774556e-06,
"loss": 0.9425,
"step": 1742
},
{
"epoch": 0.71,
"grad_norm": 3.143440317206001,
"learning_rate": 1.7395012791732129e-06,
"loss": 0.8599,
"step": 1743
},
{
"epoch": 0.71,
"grad_norm": 2.8175938116481465,
"learning_rate": 1.7392127011280174e-06,
"loss": 0.9622,
"step": 1744
},
{
"epoch": 0.71,
"grad_norm": 2.7839565266995594,
"learning_rate": 1.738923987294879e-06,
"loss": 0.8917,
"step": 1745
},
{
"epoch": 0.71,
"grad_norm": 3.5607057901354255,
"learning_rate": 1.7386351377268316e-06,
"loss": 0.8377,
"step": 1746
},
{
"epoch": 0.71,
"grad_norm": 3.021695560936154,
"learning_rate": 1.7383461524769357e-06,
"loss": 0.8964,
"step": 1747
},
{
"epoch": 0.71,
"grad_norm": 2.927529857589276,
"learning_rate": 1.7380570315982753e-06,
"loss": 0.9678,
"step": 1748
},
{
"epoch": 0.71,
"grad_norm": 3.1648403803359417,
"learning_rate": 1.7377677751439594e-06,
"loss": 0.8655,
"step": 1749
},
{
"epoch": 0.71,
"grad_norm": 2.8121620459673355,
"learning_rate": 1.7374783831671231e-06,
"loss": 0.9042,
"step": 1750
},
{
"epoch": 0.71,
"grad_norm": 2.6249235575895837,
"learning_rate": 1.737188855720925e-06,
"loss": 0.9944,
"step": 1751
},
{
"epoch": 0.71,
"grad_norm": 3.0462587858159216,
"learning_rate": 1.7368991928585493e-06,
"loss": 0.8668,
"step": 1752
},
{
"epoch": 0.72,
"grad_norm": 2.6781479004610973,
"learning_rate": 1.736609394633205e-06,
"loss": 0.9428,
"step": 1753
},
{
"epoch": 0.72,
"grad_norm": 2.7213927745659507,
"learning_rate": 1.7363194610981258e-06,
"loss": 0.8332,
"step": 1754
},
{
"epoch": 0.72,
"grad_norm": 2.895585212191621,
"learning_rate": 1.7360293923065705e-06,
"loss": 0.9051,
"step": 1755
},
{
"epoch": 0.72,
"grad_norm": 2.886677441994355,
"learning_rate": 1.7357391883118227e-06,
"loss": 0.9012,
"step": 1756
},
{
"epoch": 0.72,
"grad_norm": 2.69748630960738,
"learning_rate": 1.7354488491671901e-06,
"loss": 0.8799,
"step": 1757
},
{
"epoch": 0.72,
"grad_norm": 3.8721329929461468,
"learning_rate": 1.7351583749260068e-06,
"loss": 0.7412,
"step": 1758
},
{
"epoch": 0.72,
"grad_norm": 2.906386839208882,
"learning_rate": 1.73486776564163e-06,
"loss": 0.7983,
"step": 1759
},
{
"epoch": 0.72,
"grad_norm": 2.944934629401425,
"learning_rate": 1.7345770213674432e-06,
"loss": 0.8469,
"step": 1760
},
{
"epoch": 0.72,
"grad_norm": 2.839964733765131,
"learning_rate": 1.7342861421568533e-06,
"loss": 0.9329,
"step": 1761
},
{
"epoch": 0.72,
"grad_norm": 2.6377006970426615,
"learning_rate": 1.733995128063293e-06,
"loss": 0.8575,
"step": 1762
},
{
"epoch": 0.72,
"grad_norm": 2.900975764723041,
"learning_rate": 1.7337039791402197e-06,
"loss": 0.9082,
"step": 1763
},
{
"epoch": 0.72,
"grad_norm": 2.859643959675396,
"learning_rate": 1.7334126954411148e-06,
"loss": 0.9324,
"step": 1764
},
{
"epoch": 0.72,
"grad_norm": 3.501023803271044,
"learning_rate": 1.7331212770194851e-06,
"loss": 0.9094,
"step": 1765
},
{
"epoch": 0.72,
"grad_norm": 3.1795082542275774,
"learning_rate": 1.7328297239288624e-06,
"loss": 0.8976,
"step": 1766
},
{
"epoch": 0.72,
"grad_norm": 3.0085650780780275,
"learning_rate": 1.7325380362228023e-06,
"loss": 0.8888,
"step": 1767
},
{
"epoch": 0.72,
"grad_norm": 2.588998105990209,
"learning_rate": 1.7322462139548866e-06,
"loss": 0.8936,
"step": 1768
},
{
"epoch": 0.72,
"grad_norm": 3.036217097111523,
"learning_rate": 1.73195425717872e-06,
"loss": 0.9106,
"step": 1769
},
{
"epoch": 0.72,
"grad_norm": 2.899954134642203,
"learning_rate": 1.731662165947933e-06,
"loss": 0.8722,
"step": 1770
},
{
"epoch": 0.72,
"grad_norm": 2.8106368361045155,
"learning_rate": 1.7313699403161807e-06,
"loss": 0.9467,
"step": 1771
},
{
"epoch": 0.72,
"grad_norm": 2.8132906151166517,
"learning_rate": 1.7310775803371429e-06,
"loss": 0.9063,
"step": 1772
},
{
"epoch": 0.72,
"grad_norm": 2.918866392977625,
"learning_rate": 1.730785086064524e-06,
"loss": 0.9152,
"step": 1773
},
{
"epoch": 0.72,
"grad_norm": 3.007391425259785,
"learning_rate": 1.730492457552053e-06,
"loss": 0.8762,
"step": 1774
},
{
"epoch": 0.72,
"grad_norm": 2.827404609243976,
"learning_rate": 1.7301996948534834e-06,
"loss": 0.897,
"step": 1775
},
{
"epoch": 0.72,
"grad_norm": 2.2578027281858515,
"learning_rate": 1.7299067980225938e-06,
"loss": 0.9107,
"step": 1776
},
{
"epoch": 0.73,
"grad_norm": 2.601807963529424,
"learning_rate": 1.729613767113187e-06,
"loss": 0.9141,
"step": 1777
},
{
"epoch": 0.73,
"grad_norm": 2.832386891170482,
"learning_rate": 1.7293206021790912e-06,
"loss": 0.8915,
"step": 1778
},
{
"epoch": 0.73,
"grad_norm": 3.200574726858662,
"learning_rate": 1.7290273032741578e-06,
"loss": 0.8992,
"step": 1779
},
{
"epoch": 0.73,
"grad_norm": 2.844500703715354,
"learning_rate": 1.728733870452264e-06,
"loss": 0.8818,
"step": 1780
},
{
"epoch": 0.73,
"grad_norm": 2.8107751014903735,
"learning_rate": 1.7284403037673117e-06,
"loss": 0.9382,
"step": 1781
},
{
"epoch": 0.73,
"grad_norm": 2.571285762889501,
"learning_rate": 1.7281466032732263e-06,
"loss": 0.9701,
"step": 1782
},
{
"epoch": 0.73,
"grad_norm": 2.9726594861736664,
"learning_rate": 1.7278527690239586e-06,
"loss": 0.9124,
"step": 1783
},
{
"epoch": 0.73,
"grad_norm": 2.7632282932363736,
"learning_rate": 1.727558801073484e-06,
"loss": 0.8692,
"step": 1784
},
{
"epoch": 0.73,
"grad_norm": 3.290347104240868,
"learning_rate": 1.727264699475802e-06,
"loss": 0.9323,
"step": 1785
},
{
"epoch": 0.73,
"grad_norm": 3.1724997430911612,
"learning_rate": 1.7269704642849372e-06,
"loss": 0.9127,
"step": 1786
},
{
"epoch": 0.73,
"grad_norm": 2.5311216532452465,
"learning_rate": 1.7266760955549384e-06,
"loss": 0.9088,
"step": 1787
},
{
"epoch": 0.73,
"grad_norm": 3.860194629819168,
"learning_rate": 1.7263815933398785e-06,
"loss": 0.8996,
"step": 1788
},
{
"epoch": 0.73,
"grad_norm": 2.6329936010924486,
"learning_rate": 1.7260869576938556e-06,
"loss": 0.9303,
"step": 1789
},
{
"epoch": 0.73,
"grad_norm": 3.155480540791573,
"learning_rate": 1.7257921886709927e-06,
"loss": 0.9308,
"step": 1790
},
{
"epoch": 0.73,
"grad_norm": 2.8588296109315623,
"learning_rate": 1.725497286325436e-06,
"loss": 0.8994,
"step": 1791
},
{
"epoch": 0.73,
"grad_norm": 3.3678376517325663,
"learning_rate": 1.7252022507113572e-06,
"loss": 0.9199,
"step": 1792
},
{
"epoch": 0.73,
"grad_norm": 3.6700691561612464,
"learning_rate": 1.7249070818829522e-06,
"loss": 0.7954,
"step": 1793
},
{
"epoch": 0.73,
"grad_norm": 4.037952990655294,
"learning_rate": 1.7246117798944408e-06,
"loss": 0.912,
"step": 1794
},
{
"epoch": 0.73,
"grad_norm": 3.271668168284019,
"learning_rate": 1.7243163448000687e-06,
"loss": 0.8638,
"step": 1795
},
{
"epoch": 0.73,
"grad_norm": 2.500089344598476,
"learning_rate": 1.7240207766541048e-06,
"loss": 0.862,
"step": 1796
},
{
"epoch": 0.73,
"grad_norm": 3.757352458570679,
"learning_rate": 1.7237250755108423e-06,
"loss": 0.9231,
"step": 1797
},
{
"epoch": 0.73,
"grad_norm": 3.186674959052312,
"learning_rate": 1.7234292414246e-06,
"loss": 0.8598,
"step": 1798
},
{
"epoch": 0.73,
"grad_norm": 2.758997797886992,
"learning_rate": 1.7231332744497204e-06,
"loss": 0.8985,
"step": 1799
},
{
"epoch": 0.73,
"grad_norm": 3.029093379300519,
"learning_rate": 1.7228371746405702e-06,
"loss": 0.8778,
"step": 1800
},
{
"epoch": 0.73,
"eval_loss": 0.8967553973197937,
"eval_runtime": 465.7244,
"eval_samples_per_second": 74.834,
"eval_steps_per_second": 4.679,
"step": 1800
},
{
"epoch": 0.73,
"grad_norm": 2.6332751280385547,
"learning_rate": 1.7225409420515404e-06,
"loss": 0.9378,
"step": 1801
},
{
"epoch": 0.74,
"grad_norm": 3.2015086148558063,
"learning_rate": 1.7222445767370473e-06,
"loss": 0.8488,
"step": 1802
},
{
"epoch": 0.74,
"grad_norm": 2.83473401739915,
"learning_rate": 1.721948078751531e-06,
"loss": 0.8446,
"step": 1803
},
{
"epoch": 0.74,
"grad_norm": 3.412973911012215,
"learning_rate": 1.721651448149456e-06,
"loss": 0.9438,
"step": 1804
},
{
"epoch": 0.74,
"grad_norm": 2.792190775953681,
"learning_rate": 1.721354684985311e-06,
"loss": 0.8522,
"step": 1805
},
{
"epoch": 0.74,
"grad_norm": 3.1274271618812257,
"learning_rate": 1.721057789313609e-06,
"loss": 0.9025,
"step": 1806
},
{
"epoch": 0.74,
"grad_norm": 3.0668600224575875,
"learning_rate": 1.720760761188888e-06,
"loss": 0.9073,
"step": 1807
},
{
"epoch": 0.74,
"grad_norm": 3.0824533738218935,
"learning_rate": 1.72046360066571e-06,
"loss": 0.8551,
"step": 1808
},
{
"epoch": 0.74,
"grad_norm": 3.062949354773459,
"learning_rate": 1.7201663077986605e-06,
"loss": 0.8542,
"step": 1809
},
{
"epoch": 0.74,
"grad_norm": 2.770914239012348,
"learning_rate": 1.7198688826423506e-06,
"loss": 0.8428,
"step": 1810
},
{
"epoch": 0.74,
"grad_norm": 3.0312717664729742,
"learning_rate": 1.719571325251415e-06,
"loss": 0.9492,
"step": 1811
},
{
"epoch": 0.74,
"grad_norm": 2.879840664737245,
"learning_rate": 1.7192736356805128e-06,
"loss": 0.8938,
"step": 1812
},
{
"epoch": 0.74,
"grad_norm": 2.8403406856474587,
"learning_rate": 1.7189758139843273e-06,
"loss": 0.8822,
"step": 1813
},
{
"epoch": 0.74,
"grad_norm": 2.546927131354577,
"learning_rate": 1.718677860217566e-06,
"loss": 0.93,
"step": 1814
},
{
"epoch": 0.74,
"grad_norm": 3.0398724208079737,
"learning_rate": 1.718379774434961e-06,
"loss": 0.8787,
"step": 1815
},
{
"epoch": 0.74,
"grad_norm": 2.855719842777268,
"learning_rate": 1.7180815566912688e-06,
"loss": 0.8494,
"step": 1816
},
{
"epoch": 0.74,
"grad_norm": 3.7716825530437177,
"learning_rate": 1.7177832070412694e-06,
"loss": 0.9308,
"step": 1817
},
{
"epoch": 0.74,
"grad_norm": 3.518199484210041,
"learning_rate": 1.7174847255397675e-06,
"loss": 0.8417,
"step": 1818
},
{
"epoch": 0.74,
"grad_norm": 3.663128533081115,
"learning_rate": 1.717186112241592e-06,
"loss": 0.8676,
"step": 1819
},
{
"epoch": 0.74,
"grad_norm": 3.3399656149734103,
"learning_rate": 1.716887367201596e-06,
"loss": 0.8801,
"step": 1820
},
{
"epoch": 0.74,
"grad_norm": 2.7667274568734643,
"learning_rate": 1.7165884904746567e-06,
"loss": 0.862,
"step": 1821
},
{
"epoch": 0.74,
"grad_norm": 3.1443192950263437,
"learning_rate": 1.7162894821156754e-06,
"loss": 0.8335,
"step": 1822
},
{
"epoch": 0.74,
"grad_norm": 2.4624220198809192,
"learning_rate": 1.715990342179578e-06,
"loss": 0.8479,
"step": 1823
},
{
"epoch": 0.74,
"grad_norm": 2.8136526060494447,
"learning_rate": 1.7156910707213146e-06,
"loss": 0.8821,
"step": 1824
},
{
"epoch": 0.74,
"grad_norm": 3.037436314154012,
"learning_rate": 1.7153916677958585e-06,
"loss": 0.8478,
"step": 1825
},
{
"epoch": 0.75,
"grad_norm": 3.2127240918423796,
"learning_rate": 1.7150921334582082e-06,
"loss": 0.8942,
"step": 1826
},
{
"epoch": 0.75,
"grad_norm": 4.587075437022675,
"learning_rate": 1.7147924677633859e-06,
"loss": 0.9252,
"step": 1827
},
{
"epoch": 0.75,
"grad_norm": 2.7044717420977693,
"learning_rate": 1.7144926707664377e-06,
"loss": 0.9522,
"step": 1828
},
{
"epoch": 0.75,
"grad_norm": 3.04077007684609,
"learning_rate": 1.7141927425224346e-06,
"loss": 0.8761,
"step": 1829
},
{
"epoch": 0.75,
"grad_norm": 2.7862298451384295,
"learning_rate": 1.713892683086471e-06,
"loss": 0.9035,
"step": 1830
},
{
"epoch": 0.75,
"grad_norm": 3.24503205981993,
"learning_rate": 1.7135924925136656e-06,
"loss": 0.914,
"step": 1831
},
{
"epoch": 0.75,
"grad_norm": 2.806954909349893,
"learning_rate": 1.7132921708591613e-06,
"loss": 0.8461,
"step": 1832
},
{
"epoch": 0.75,
"grad_norm": 2.6160161143086036,
"learning_rate": 1.7129917181781249e-06,
"loss": 0.8646,
"step": 1833
},
{
"epoch": 0.75,
"grad_norm": 3.020930000874223,
"learning_rate": 1.7126911345257472e-06,
"loss": 0.9014,
"step": 1834
},
{
"epoch": 0.75,
"grad_norm": 2.773124803181032,
"learning_rate": 1.7123904199572431e-06,
"loss": 0.959,
"step": 1835
},
{
"epoch": 0.75,
"grad_norm": 2.729814179016735,
"learning_rate": 1.7120895745278525e-06,
"loss": 0.8342,
"step": 1836
},
{
"epoch": 0.75,
"grad_norm": 2.494258047644448,
"learning_rate": 1.7117885982928377e-06,
"loss": 0.9106,
"step": 1837
},
{
"epoch": 0.75,
"grad_norm": 3.2672666437458786,
"learning_rate": 1.7114874913074858e-06,
"loss": 0.8684,
"step": 1838
},
{
"epoch": 0.75,
"grad_norm": 2.132385657335843,
"learning_rate": 1.7111862536271083e-06,
"loss": 0.9297,
"step": 1839
},
{
"epoch": 0.75,
"grad_norm": 3.0419435256459573,
"learning_rate": 1.71088488530704e-06,
"loss": 0.9189,
"step": 1840
},
{
"epoch": 0.75,
"grad_norm": 3.380173458638942,
"learning_rate": 1.7105833864026406e-06,
"loss": 0.8588,
"step": 1841
},
{
"epoch": 0.75,
"grad_norm": 2.774539730857192,
"learning_rate": 1.7102817569692929e-06,
"loss": 0.8674,
"step": 1842
},
{
"epoch": 0.75,
"grad_norm": 2.9629398780124028,
"learning_rate": 1.7099799970624038e-06,
"loss": 0.9573,
"step": 1843
},
{
"epoch": 0.75,
"grad_norm": 3.113628240741703,
"learning_rate": 1.709678106737405e-06,
"loss": 0.9243,
"step": 1844
},
{
"epoch": 0.75,
"grad_norm": 3.085718155033621,
"learning_rate": 1.7093760860497507e-06,
"loss": 0.8315,
"step": 1845
},
{
"epoch": 0.75,
"grad_norm": 3.6363887739942125,
"learning_rate": 1.7090739350549202e-06,
"loss": 0.8729,
"step": 1846
},
{
"epoch": 0.75,
"grad_norm": 2.506808678691133,
"learning_rate": 1.7087716538084168e-06,
"loss": 0.9501,
"step": 1847
},
{
"epoch": 0.75,
"grad_norm": 2.828660810907063,
"learning_rate": 1.7084692423657669e-06,
"loss": 0.8963,
"step": 1848
},
{
"epoch": 0.75,
"grad_norm": 2.9702963980769783,
"learning_rate": 1.7081667007825216e-06,
"loss": 0.8909,
"step": 1849
},
{
"epoch": 0.75,
"grad_norm": 3.4034998723535224,
"learning_rate": 1.7078640291142553e-06,
"loss": 0.8785,
"step": 1850
},
{
"epoch": 0.76,
"grad_norm": 2.6622353626258164,
"learning_rate": 1.7075612274165668e-06,
"loss": 0.958,
"step": 1851
},
{
"epoch": 0.76,
"grad_norm": 2.751498482098041,
"learning_rate": 1.707258295745078e-06,
"loss": 0.8991,
"step": 1852
},
{
"epoch": 0.76,
"grad_norm": 2.3928073717435105,
"learning_rate": 1.706955234155436e-06,
"loss": 0.957,
"step": 1853
},
{
"epoch": 0.76,
"grad_norm": 2.9306726844187834,
"learning_rate": 1.7066520427033107e-06,
"loss": 0.9285,
"step": 1854
},
{
"epoch": 0.76,
"grad_norm": 3.481281784247888,
"learning_rate": 1.7063487214443957e-06,
"loss": 0.9833,
"step": 1855
},
{
"epoch": 0.76,
"grad_norm": 3.8393810195486893,
"learning_rate": 1.7060452704344095e-06,
"loss": 0.9024,
"step": 1856
},
{
"epoch": 0.76,
"grad_norm": 2.5261260143396895,
"learning_rate": 1.7057416897290935e-06,
"loss": 0.8173,
"step": 1857
},
{
"epoch": 0.76,
"grad_norm": 2.536716249427303,
"learning_rate": 1.7054379793842133e-06,
"loss": 0.8692,
"step": 1858
},
{
"epoch": 0.76,
"grad_norm": 3.006484945620636,
"learning_rate": 1.7051341394555583e-06,
"loss": 0.8695,
"step": 1859
},
{
"epoch": 0.76,
"grad_norm": 2.733432409170324,
"learning_rate": 1.7048301699989413e-06,
"loss": 0.9203,
"step": 1860
},
{
"epoch": 0.76,
"grad_norm": 2.7834656091603174,
"learning_rate": 1.7045260710702002e-06,
"loss": 0.9354,
"step": 1861
},
{
"epoch": 0.76,
"grad_norm": 2.555738981733667,
"learning_rate": 1.7042218427251946e-06,
"loss": 0.8976,
"step": 1862
},
{
"epoch": 0.76,
"grad_norm": 2.6498647195338507,
"learning_rate": 1.7039174850198099e-06,
"loss": 0.8979,
"step": 1863
},
{
"epoch": 0.76,
"grad_norm": 2.7747790608098994,
"learning_rate": 1.703612998009954e-06,
"loss": 0.9636,
"step": 1864
},
{
"epoch": 0.76,
"grad_norm": 3.05513395689839,
"learning_rate": 1.7033083817515586e-06,
"loss": 0.8854,
"step": 1865
},
{
"epoch": 0.76,
"grad_norm": 3.0836095470543072,
"learning_rate": 1.70300363630058e-06,
"loss": 0.8759,
"step": 1866
},
{
"epoch": 0.76,
"grad_norm": 3.119242805732746,
"learning_rate": 1.7026987617129977e-06,
"loss": 0.9124,
"step": 1867
},
{
"epoch": 0.76,
"grad_norm": 2.6951320066290934,
"learning_rate": 1.7023937580448144e-06,
"loss": 0.886,
"step": 1868
},
{
"epoch": 0.76,
"grad_norm": 2.8541321407957136,
"learning_rate": 1.7020886253520577e-06,
"loss": 0.8462,
"step": 1869
},
{
"epoch": 0.76,
"grad_norm": 3.235397880115037,
"learning_rate": 1.7017833636907777e-06,
"loss": 0.8829,
"step": 1870
},
{
"epoch": 0.76,
"grad_norm": 3.121746944408749,
"learning_rate": 1.701477973117049e-06,
"loss": 0.8506,
"step": 1871
},
{
"epoch": 0.76,
"grad_norm": 3.402799638845167,
"learning_rate": 1.7011724536869694e-06,
"loss": 0.902,
"step": 1872
},
{
"epoch": 0.76,
"grad_norm": 3.1492265035633853,
"learning_rate": 1.700866805456661e-06,
"loss": 0.8864,
"step": 1873
},
{
"epoch": 0.76,
"grad_norm": 3.1092958412314564,
"learning_rate": 1.7005610284822685e-06,
"loss": 0.8927,
"step": 1874
},
{
"epoch": 0.77,
"grad_norm": 2.6452271926625004,
"learning_rate": 1.7002551228199612e-06,
"loss": 0.9493,
"step": 1875
},
{
"epoch": 0.77,
"grad_norm": 2.6976525638029085,
"learning_rate": 1.699949088525932e-06,
"loss": 0.9199,
"step": 1876
},
{
"epoch": 0.77,
"grad_norm": 2.607738223090653,
"learning_rate": 1.699642925656397e-06,
"loss": 0.9448,
"step": 1877
},
{
"epoch": 0.77,
"grad_norm": 2.9132058741731934,
"learning_rate": 1.6993366342675958e-06,
"loss": 0.8317,
"step": 1878
},
{
"epoch": 0.77,
"grad_norm": 3.3320233740612193,
"learning_rate": 1.699030214415792e-06,
"loss": 0.9119,
"step": 1879
},
{
"epoch": 0.77,
"grad_norm": 2.8630722334436416,
"learning_rate": 1.6987236661572729e-06,
"loss": 0.8877,
"step": 1880
},
{
"epoch": 0.77,
"grad_norm": 2.6645262964016627,
"learning_rate": 1.6984169895483486e-06,
"loss": 0.8874,
"step": 1881
},
{
"epoch": 0.77,
"grad_norm": 2.4624582958662633,
"learning_rate": 1.6981101846453542e-06,
"loss": 0.8999,
"step": 1882
},
{
"epoch": 0.77,
"grad_norm": 2.737894617832523,
"learning_rate": 1.6978032515046467e-06,
"loss": 0.8611,
"step": 1883
},
{
"epoch": 0.77,
"grad_norm": 2.547699997714653,
"learning_rate": 1.697496190182608e-06,
"loss": 0.8817,
"step": 1884
},
{
"epoch": 0.77,
"grad_norm": 2.445197860514478,
"learning_rate": 1.6971890007356428e-06,
"loss": 0.8621,
"step": 1885
},
{
"epoch": 0.77,
"grad_norm": 2.4657072327529748,
"learning_rate": 1.6968816832201794e-06,
"loss": 0.8649,
"step": 1886
},
{
"epoch": 0.77,
"grad_norm": 2.9712262585790437,
"learning_rate": 1.69657423769267e-06,
"loss": 0.8939,
"step": 1887
},
{
"epoch": 0.77,
"grad_norm": 3.259438425003548,
"learning_rate": 1.69626666420959e-06,
"loss": 0.8328,
"step": 1888
},
{
"epoch": 0.77,
"grad_norm": 3.3190382991537173,
"learning_rate": 1.6959589628274385e-06,
"loss": 0.9075,
"step": 1889
},
{
"epoch": 0.77,
"grad_norm": 2.8135945025577898,
"learning_rate": 1.6956511336027376e-06,
"loss": 0.828,
"step": 1890
},
{
"epoch": 0.77,
"grad_norm": 2.7401973469125096,
"learning_rate": 1.695343176592034e-06,
"loss": 0.881,
"step": 1891
},
{
"epoch": 0.77,
"grad_norm": 2.8974736945703174,
"learning_rate": 1.6950350918518964e-06,
"loss": 0.8978,
"step": 1892
},
{
"epoch": 0.77,
"grad_norm": 2.6552005182864447,
"learning_rate": 1.6947268794389181e-06,
"loss": 0.8652,
"step": 1893
},
{
"epoch": 0.77,
"grad_norm": 2.8470289343163,
"learning_rate": 1.6944185394097152e-06,
"loss": 0.8563,
"step": 1894
},
{
"epoch": 0.77,
"grad_norm": 3.0935790140483275,
"learning_rate": 1.6941100718209276e-06,
"loss": 0.9138,
"step": 1895
},
{
"epoch": 0.77,
"grad_norm": 2.668729841125488,
"learning_rate": 1.6938014767292188e-06,
"loss": 0.9417,
"step": 1896
},
{
"epoch": 0.77,
"grad_norm": 2.789502906333908,
"learning_rate": 1.6934927541912752e-06,
"loss": 0.9186,
"step": 1897
},
{
"epoch": 0.77,
"grad_norm": 3.2402227153884198,
"learning_rate": 1.6931839042638067e-06,
"loss": 0.8547,
"step": 1898
},
{
"epoch": 0.77,
"grad_norm": 4.3638785666824305,
"learning_rate": 1.692874927003547e-06,
"loss": 0.9022,
"step": 1899
},
{
"epoch": 0.78,
"grad_norm": 2.661982068430516,
"learning_rate": 1.692565822467253e-06,
"loss": 0.929,
"step": 1900
},
{
"epoch": 0.78,
"eval_loss": 0.8947169184684753,
"eval_runtime": 466.7766,
"eval_samples_per_second": 74.665,
"eval_steps_per_second": 4.668,
"step": 1900
},
{
"epoch": 0.78,
"grad_norm": 3.4117086749566825,
"learning_rate": 1.6922565907117048e-06,
"loss": 0.8126,
"step": 1901
},
{
"epoch": 0.78,
"grad_norm": 2.4900027775455023,
"learning_rate": 1.6919472317937063e-06,
"loss": 0.8748,
"step": 1902
},
{
"epoch": 0.78,
"grad_norm": 2.912149259641751,
"learning_rate": 1.6916377457700839e-06,
"loss": 0.877,
"step": 1903
},
{
"epoch": 0.78,
"grad_norm": 3.0067959115974543,
"learning_rate": 1.6913281326976883e-06,
"loss": 0.7876,
"step": 1904
},
{
"epoch": 0.78,
"grad_norm": 2.7338050401173124,
"learning_rate": 1.691018392633393e-06,
"loss": 0.9299,
"step": 1905
},
{
"epoch": 0.78,
"grad_norm": 3.6156587725732585,
"learning_rate": 1.690708525634095e-06,
"loss": 0.8613,
"step": 1906
},
{
"epoch": 0.78,
"grad_norm": 3.116930182001255,
"learning_rate": 1.6903985317567147e-06,
"loss": 0.8605,
"step": 1907
},
{
"epoch": 0.78,
"grad_norm": 2.765016834637611,
"learning_rate": 1.6900884110581957e-06,
"loss": 0.896,
"step": 1908
},
{
"epoch": 0.78,
"grad_norm": 3.1000359496367818,
"learning_rate": 1.6897781635955046e-06,
"loss": 0.8541,
"step": 1909
},
{
"epoch": 0.78,
"grad_norm": 3.607285567046138,
"learning_rate": 1.689467789425632e-06,
"loss": 0.8491,
"step": 1910
},
{
"epoch": 0.78,
"grad_norm": 3.166065036584536,
"learning_rate": 1.6891572886055912e-06,
"loss": 0.9266,
"step": 1911
},
{
"epoch": 0.78,
"grad_norm": 2.5148570503485543,
"learning_rate": 1.6888466611924188e-06,
"loss": 0.897,
"step": 1912
},
{
"epoch": 0.78,
"grad_norm": 2.943754310874003,
"learning_rate": 1.6885359072431746e-06,
"loss": 0.8984,
"step": 1913
},
{
"epoch": 0.78,
"grad_norm": 3.8349814735706764,
"learning_rate": 1.6882250268149422e-06,
"loss": 0.8897,
"step": 1914
},
{
"epoch": 0.78,
"grad_norm": 3.1439258735825284,
"learning_rate": 1.687914019964828e-06,
"loss": 0.9178,
"step": 1915
},
{
"epoch": 0.78,
"grad_norm": 2.505738300524754,
"learning_rate": 1.687602886749962e-06,
"loss": 0.9271,
"step": 1916
},
{
"epoch": 0.78,
"grad_norm": 2.850386089342914,
"learning_rate": 1.687291627227496e-06,
"loss": 0.9453,
"step": 1917
},
{
"epoch": 0.78,
"grad_norm": 2.43942778999752,
"learning_rate": 1.6869802414546071e-06,
"loss": 0.9374,
"step": 1918
},
{
"epoch": 0.78,
"grad_norm": 2.583763106925088,
"learning_rate": 1.6866687294884941e-06,
"loss": 0.9123,
"step": 1919
},
{
"epoch": 0.78,
"grad_norm": 2.8898245764473613,
"learning_rate": 1.6863570913863793e-06,
"loss": 0.9771,
"step": 1920
},
{
"epoch": 0.78,
"grad_norm": 2.8871073512427974,
"learning_rate": 1.6860453272055093e-06,
"loss": 0.8133,
"step": 1921
},
{
"epoch": 0.78,
"grad_norm": 2.5625004991411573,
"learning_rate": 1.6857334370031517e-06,
"loss": 0.8517,
"step": 1922
},
{
"epoch": 0.78,
"grad_norm": 3.1370834173040323,
"learning_rate": 1.6854214208365994e-06,
"loss": 0.8531,
"step": 1923
},
{
"epoch": 0.79,
"grad_norm": 2.8200634322027005,
"learning_rate": 1.6851092787631667e-06,
"loss": 0.8268,
"step": 1924
},
{
"epoch": 0.79,
"grad_norm": 2.4960175343241398,
"learning_rate": 1.6847970108401924e-06,
"loss": 0.874,
"step": 1925
},
{
"epoch": 0.79,
"grad_norm": 2.842183305285763,
"learning_rate": 1.684484617125037e-06,
"loss": 0.8724,
"step": 1926
},
{
"epoch": 0.79,
"grad_norm": 3.322794707930227,
"learning_rate": 1.6841720976750863e-06,
"loss": 0.9266,
"step": 1927
},
{
"epoch": 0.79,
"grad_norm": 3.5464268672401627,
"learning_rate": 1.6838594525477465e-06,
"loss": 0.8061,
"step": 1928
},
{
"epoch": 0.79,
"grad_norm": 3.076769607545288,
"learning_rate": 1.6835466818004492e-06,
"loss": 0.9243,
"step": 1929
},
{
"epoch": 0.79,
"grad_norm": 2.6869902354580013,
"learning_rate": 1.683233785490647e-06,
"loss": 0.9106,
"step": 1930
},
{
"epoch": 0.79,
"grad_norm": 2.477082179118166,
"learning_rate": 1.6829207636758178e-06,
"loss": 0.8954,
"step": 1931
},
{
"epoch": 0.79,
"grad_norm": 3.033140744387853,
"learning_rate": 1.6826076164134606e-06,
"loss": 0.8991,
"step": 1932
},
{
"epoch": 0.79,
"grad_norm": 2.620622020667726,
"learning_rate": 1.6822943437610988e-06,
"loss": 0.8271,
"step": 1933
},
{
"epoch": 0.79,
"grad_norm": 3.0336318993542437,
"learning_rate": 1.6819809457762775e-06,
"loss": 0.8715,
"step": 1934
},
{
"epoch": 0.79,
"grad_norm": 2.5937073059908253,
"learning_rate": 1.6816674225165666e-06,
"loss": 0.8769,
"step": 1935
},
{
"epoch": 0.79,
"grad_norm": 3.1258902054811224,
"learning_rate": 1.6813537740395574e-06,
"loss": 0.8231,
"step": 1936
},
{
"epoch": 0.79,
"grad_norm": 3.310367339919999,
"learning_rate": 1.681040000402865e-06,
"loss": 0.8725,
"step": 1937
},
{
"epoch": 0.79,
"grad_norm": 2.692255174139625,
"learning_rate": 1.6807261016641274e-06,
"loss": 0.9236,
"step": 1938
},
{
"epoch": 0.79,
"grad_norm": 2.4017360796125566,
"learning_rate": 1.6804120778810052e-06,
"loss": 0.9867,
"step": 1939
},
{
"epoch": 0.79,
"grad_norm": 3.1320829548760023,
"learning_rate": 1.6800979291111826e-06,
"loss": 0.8907,
"step": 1940
},
{
"epoch": 0.79,
"grad_norm": 2.5997045288987644,
"learning_rate": 1.679783655412366e-06,
"loss": 0.8529,
"step": 1941
},
{
"epoch": 0.79,
"grad_norm": 2.8062170140855898,
"learning_rate": 1.6794692568422856e-06,
"loss": 0.8778,
"step": 1942
},
{
"epoch": 0.79,
"grad_norm": 3.7868058578352946,
"learning_rate": 1.6791547334586941e-06,
"loss": 0.8888,
"step": 1943
},
{
"epoch": 0.79,
"grad_norm": 2.861534731927047,
"learning_rate": 1.678840085319367e-06,
"loss": 0.8425,
"step": 1944
},
{
"epoch": 0.79,
"grad_norm": 2.806080951965443,
"learning_rate": 1.6785253124821024e-06,
"loss": 0.8866,
"step": 1945
},
{
"epoch": 0.79,
"grad_norm": 2.3135836582492106,
"learning_rate": 1.6782104150047225e-06,
"loss": 0.9344,
"step": 1946
},
{
"epoch": 0.79,
"grad_norm": 2.890637039979027,
"learning_rate": 1.6778953929450714e-06,
"loss": 0.8917,
"step": 1947
},
{
"epoch": 0.79,
"grad_norm": 3.5331322398225065,
"learning_rate": 1.6775802463610162e-06,
"loss": 0.8996,
"step": 1948
},
{
"epoch": 0.8,
"grad_norm": 3.01999770171339,
"learning_rate": 1.677264975310447e-06,
"loss": 0.8821,
"step": 1949
},
{
"epoch": 0.8,
"grad_norm": 2.718980625499144,
"learning_rate": 1.676949579851277e-06,
"loss": 0.8743,
"step": 1950
},
{
"epoch": 0.8,
"grad_norm": 2.881434293610783,
"learning_rate": 1.676634060041442e-06,
"loss": 0.9488,
"step": 1951
},
{
"epoch": 0.8,
"grad_norm": 2.8098517402215912,
"learning_rate": 1.6763184159389002e-06,
"loss": 0.9098,
"step": 1952
},
{
"epoch": 0.8,
"grad_norm": 2.8241232356749535,
"learning_rate": 1.6760026476016336e-06,
"loss": 0.8944,
"step": 1953
},
{
"epoch": 0.8,
"grad_norm": 2.632032989242208,
"learning_rate": 1.6756867550876463e-06,
"loss": 0.8948,
"step": 1954
},
{
"epoch": 0.8,
"grad_norm": 3.24479451159178,
"learning_rate": 1.675370738454966e-06,
"loss": 0.9326,
"step": 1955
},
{
"epoch": 0.8,
"grad_norm": 2.734514428870467,
"learning_rate": 1.6750545977616417e-06,
"loss": 0.9781,
"step": 1956
},
{
"epoch": 0.8,
"grad_norm": 3.7917518846629044,
"learning_rate": 1.6747383330657468e-06,
"loss": 0.9037,
"step": 1957
},
{
"epoch": 0.8,
"grad_norm": 3.53440400257828,
"learning_rate": 1.6744219444253761e-06,
"loss": 0.8394,
"step": 1958
},
{
"epoch": 0.8,
"grad_norm": 3.300836958549457,
"learning_rate": 1.6741054318986491e-06,
"loss": 0.9948,
"step": 1959
},
{
"epoch": 0.8,
"grad_norm": 2.8385711027040665,
"learning_rate": 1.6737887955437055e-06,
"loss": 0.8965,
"step": 1960
},
{
"epoch": 0.8,
"grad_norm": 2.840217115565452,
"learning_rate": 1.6734720354187102e-06,
"loss": 0.8816,
"step": 1961
},
{
"epoch": 0.8,
"grad_norm": 2.670942271382101,
"learning_rate": 1.6731551515818487e-06,
"loss": 0.9185,
"step": 1962
},
{
"epoch": 0.8,
"grad_norm": 2.5954925296474283,
"learning_rate": 1.6728381440913309e-06,
"loss": 0.8915,
"step": 1963
},
{
"epoch": 0.8,
"grad_norm": 3.491914737641242,
"learning_rate": 1.6725210130053886e-06,
"loss": 0.9088,
"step": 1964
},
{
"epoch": 0.8,
"grad_norm": 2.5937000189970005,
"learning_rate": 1.672203758382276e-06,
"loss": 0.9098,
"step": 1965
},
{
"epoch": 0.8,
"grad_norm": 2.402334474872088,
"learning_rate": 1.6718863802802713e-06,
"loss": 0.9036,
"step": 1966
},
{
"epoch": 0.8,
"grad_norm": 3.1375430277742558,
"learning_rate": 1.671568878757674e-06,
"loss": 0.9011,
"step": 1967
},
{
"epoch": 0.8,
"grad_norm": 3.3076437043386315,
"learning_rate": 1.671251253872807e-06,
"loss": 0.871,
"step": 1968
},
{
"epoch": 0.8,
"grad_norm": 2.7464532955888874,
"learning_rate": 1.670933505684015e-06,
"loss": 0.8478,
"step": 1969
},
{
"epoch": 0.8,
"grad_norm": 2.581323546498945,
"learning_rate": 1.6706156342496669e-06,
"loss": 0.8632,
"step": 1970
},
{
"epoch": 0.8,
"grad_norm": 2.777783247113796,
"learning_rate": 1.6702976396281528e-06,
"loss": 0.9126,
"step": 1971
},
{
"epoch": 0.8,
"grad_norm": 3.132459298863436,
"learning_rate": 1.6699795218778862e-06,
"loss": 0.858,
"step": 1972
},
{
"epoch": 0.81,
"grad_norm": 2.9403318689928613,
"learning_rate": 1.669661281057303e-06,
"loss": 0.8422,
"step": 1973
},
{
"epoch": 0.81,
"grad_norm": 3.3095096046743446,
"learning_rate": 1.6693429172248616e-06,
"loss": 0.8975,
"step": 1974
},
{
"epoch": 0.81,
"grad_norm": 2.9886258858597783,
"learning_rate": 1.6690244304390434e-06,
"loss": 0.8902,
"step": 1975
},
{
"epoch": 0.81,
"grad_norm": 2.676907635110452,
"learning_rate": 1.6687058207583515e-06,
"loss": 0.936,
"step": 1976
},
{
"epoch": 0.81,
"grad_norm": 3.2031422043211113,
"learning_rate": 1.6683870882413124e-06,
"loss": 0.8813,
"step": 1977
},
{
"epoch": 0.81,
"grad_norm": 2.699032715014665,
"learning_rate": 1.6680682329464752e-06,
"loss": 0.8954,
"step": 1978
},
{
"epoch": 0.81,
"grad_norm": 3.014482131652684,
"learning_rate": 1.6677492549324112e-06,
"loss": 0.902,
"step": 1979
},
{
"epoch": 0.81,
"grad_norm": 3.475663537778148,
"learning_rate": 1.667430154257714e-06,
"loss": 0.9119,
"step": 1980
},
{
"epoch": 0.81,
"grad_norm": 3.1337758979389148,
"learning_rate": 1.6671109309810002e-06,
"loss": 1.0245,
"step": 1981
},
{
"epoch": 0.81,
"grad_norm": 2.860578638230439,
"learning_rate": 1.6667915851609089e-06,
"loss": 0.8928,
"step": 1982
},
{
"epoch": 0.81,
"grad_norm": 2.3790264265420866,
"learning_rate": 1.6664721168561018e-06,
"loss": 0.8745,
"step": 1983
},
{
"epoch": 0.81,
"grad_norm": 3.0390048404526078,
"learning_rate": 1.6661525261252622e-06,
"loss": 0.8564,
"step": 1984
},
{
"epoch": 0.81,
"grad_norm": 2.605507736182945,
"learning_rate": 1.6658328130270971e-06,
"loss": 0.92,
"step": 1985
},
{
"epoch": 0.81,
"grad_norm": 3.0578214909839403,
"learning_rate": 1.665512977620335e-06,
"loss": 0.8668,
"step": 1986
},
{
"epoch": 0.81,
"grad_norm": 2.990569140615956,
"learning_rate": 1.6651930199637284e-06,
"loss": 0.9559,
"step": 1987
},
{
"epoch": 0.81,
"grad_norm": 2.690406250581072,
"learning_rate": 1.6648729401160495e-06,
"loss": 0.8659,
"step": 1988
},
{
"epoch": 0.81,
"grad_norm": 2.8130067611231753,
"learning_rate": 1.664552738136096e-06,
"loss": 0.8613,
"step": 1989
},
{
"epoch": 0.81,
"grad_norm": 2.792320764579432,
"learning_rate": 1.664232414082686e-06,
"loss": 0.9377,
"step": 1990
},
{
"epoch": 0.81,
"grad_norm": 3.442974711464397,
"learning_rate": 1.663911968014661e-06,
"loss": 0.93,
"step": 1991
},
{
"epoch": 0.81,
"grad_norm": 2.8569400159142155,
"learning_rate": 1.6635913999908842e-06,
"loss": 0.8745,
"step": 1992
},
{
"epoch": 0.81,
"grad_norm": 2.8283793705264486,
"learning_rate": 1.6632707100702418e-06,
"loss": 0.8504,
"step": 1993
},
{
"epoch": 0.81,
"grad_norm": 2.680462022148769,
"learning_rate": 1.6629498983116422e-06,
"loss": 0.819,
"step": 1994
},
{
"epoch": 0.81,
"grad_norm": 2.511366930256453,
"learning_rate": 1.6626289647740164e-06,
"loss": 0.8815,
"step": 1995
},
{
"epoch": 0.81,
"grad_norm": 2.775278541385518,
"learning_rate": 1.6623079095163171e-06,
"loss": 0.9563,
"step": 1996
},
{
"epoch": 0.81,
"grad_norm": 2.8875388005815026,
"learning_rate": 1.6619867325975198e-06,
"loss": 0.8805,
"step": 1997
},
{
"epoch": 0.82,
"grad_norm": 3.137745088848395,
"learning_rate": 1.6616654340766227e-06,
"loss": 0.8892,
"step": 1998
},
{
"epoch": 0.82,
"grad_norm": 3.8353454985931674,
"learning_rate": 1.6613440140126459e-06,
"loss": 0.8484,
"step": 1999
},
{
"epoch": 0.82,
"grad_norm": 2.66108325935434,
"learning_rate": 1.6610224724646317e-06,
"loss": 0.8865,
"step": 2000
},
{
"epoch": 0.82,
"eval_loss": 0.8936405777931213,
"eval_runtime": 466.2846,
"eval_samples_per_second": 74.744,
"eval_steps_per_second": 4.673,
"step": 2000
},
{
"epoch": 0.82,
"grad_norm": 2.7590052699229592,
"learning_rate": 1.660700809491645e-06,
"loss": 0.8403,
"step": 2001
},
{
"epoch": 0.82,
"grad_norm": 3.2877686109758337,
"learning_rate": 1.6603790251527733e-06,
"loss": 0.9062,
"step": 2002
},
{
"epoch": 0.82,
"grad_norm": 3.703341746065545,
"learning_rate": 1.6600571195071252e-06,
"loss": 0.8891,
"step": 2003
},
{
"epoch": 0.82,
"grad_norm": 3.359350333799082,
"learning_rate": 1.6597350926138331e-06,
"loss": 0.8809,
"step": 2004
},
{
"epoch": 0.82,
"grad_norm": 3.1686603388237407,
"learning_rate": 1.6594129445320505e-06,
"loss": 0.815,
"step": 2005
},
{
"epoch": 0.82,
"grad_norm": 3.5391247104646935,
"learning_rate": 1.6590906753209542e-06,
"loss": 0.9178,
"step": 2006
},
{
"epoch": 0.82,
"grad_norm": 3.1677991915361465,
"learning_rate": 1.658768285039742e-06,
"loss": 0.8886,
"step": 2007
},
{
"epoch": 0.82,
"grad_norm": 3.3801024432902804,
"learning_rate": 1.6584457737476355e-06,
"loss": 0.9532,
"step": 2008
},
{
"epoch": 0.82,
"grad_norm": 3.1911325610512815,
"learning_rate": 1.658123141503877e-06,
"loss": 0.8938,
"step": 2009
},
{
"epoch": 0.82,
"grad_norm": 2.704810572134154,
"learning_rate": 1.6578003883677317e-06,
"loss": 0.864,
"step": 2010
},
{
"epoch": 0.82,
"grad_norm": 3.245685841029581,
"learning_rate": 1.6574775143984868e-06,
"loss": 0.879,
"step": 2011
},
{
"epoch": 0.82,
"grad_norm": 2.490448722497763,
"learning_rate": 1.6571545196554528e-06,
"loss": 0.981,
"step": 2012
},
{
"epoch": 0.82,
"grad_norm": 2.96528557140544,
"learning_rate": 1.6568314041979607e-06,
"loss": 0.9289,
"step": 2013
},
{
"epoch": 0.82,
"grad_norm": 2.6685235823481848,
"learning_rate": 1.6565081680853643e-06,
"loss": 0.8656,
"step": 2014
},
{
"epoch": 0.82,
"grad_norm": 2.8594112321975858,
"learning_rate": 1.6561848113770402e-06,
"loss": 0.9594,
"step": 2015
},
{
"epoch": 0.82,
"grad_norm": 2.74459130711699,
"learning_rate": 1.6558613341323864e-06,
"loss": 0.9149,
"step": 2016
},
{
"epoch": 0.82,
"grad_norm": 2.9607974284655487,
"learning_rate": 1.6555377364108232e-06,
"loss": 0.9741,
"step": 2017
},
{
"epoch": 0.82,
"grad_norm": 2.832139361675203,
"learning_rate": 1.6552140182717933e-06,
"loss": 0.855,
"step": 2018
},
{
"epoch": 0.82,
"grad_norm": 2.4737912438426446,
"learning_rate": 1.6548901797747612e-06,
"loss": 0.8973,
"step": 2019
},
{
"epoch": 0.82,
"grad_norm": 3.196827771885827,
"learning_rate": 1.654566220979214e-06,
"loss": 0.9429,
"step": 2020
},
{
"epoch": 0.82,
"grad_norm": 2.701451836038249,
"learning_rate": 1.65424214194466e-06,
"loss": 0.9164,
"step": 2021
},
{
"epoch": 0.83,
"grad_norm": 2.6459120397319857,
"learning_rate": 1.6539179427306307e-06,
"loss": 0.862,
"step": 2022
},
{
"epoch": 0.83,
"grad_norm": 2.927281532712312,
"learning_rate": 1.6535936233966784e-06,
"loss": 0.9164,
"step": 2023
},
{
"epoch": 0.83,
"grad_norm": 3.393178904998137,
"learning_rate": 1.6532691840023795e-06,
"loss": 0.914,
"step": 2024
},
{
"epoch": 0.83,
"grad_norm": 2.955971616584532,
"learning_rate": 1.6529446246073296e-06,
"loss": 0.8348,
"step": 2025
},
{
"epoch": 0.83,
"grad_norm": 2.8677370019899446,
"learning_rate": 1.6526199452711485e-06,
"loss": 0.921,
"step": 2026
},
{
"epoch": 0.83,
"grad_norm": 7.367491556438016,
"learning_rate": 1.6522951460534777e-06,
"loss": 0.8723,
"step": 2027
},
{
"epoch": 0.83,
"grad_norm": 2.5481509433983756,
"learning_rate": 1.65197022701398e-06,
"loss": 0.8768,
"step": 2028
},
{
"epoch": 0.83,
"grad_norm": 2.802499226443521,
"learning_rate": 1.651645188212341e-06,
"loss": 0.8885,
"step": 2029
},
{
"epoch": 0.83,
"grad_norm": 2.4724882984299192,
"learning_rate": 1.6513200297082677e-06,
"loss": 0.8963,
"step": 2030
},
{
"epoch": 0.83,
"grad_norm": 3.6139472941414903,
"learning_rate": 1.6509947515614893e-06,
"loss": 0.9266,
"step": 2031
},
{
"epoch": 0.83,
"grad_norm": 2.98491536974976,
"learning_rate": 1.6506693538317574e-06,
"loss": 0.8739,
"step": 2032
},
{
"epoch": 0.83,
"grad_norm": 3.1969763753482274,
"learning_rate": 1.6503438365788442e-06,
"loss": 0.9191,
"step": 2033
},
{
"epoch": 0.83,
"grad_norm": 2.768369226760884,
"learning_rate": 1.6500181998625463e-06,
"loss": 0.9073,
"step": 2034
},
{
"epoch": 0.83,
"grad_norm": 3.359344293969231,
"learning_rate": 1.6496924437426794e-06,
"loss": 0.8928,
"step": 2035
},
{
"epoch": 0.83,
"grad_norm": 2.5984751344198176,
"learning_rate": 1.6493665682790835e-06,
"loss": 0.8879,
"step": 2036
},
{
"epoch": 0.83,
"grad_norm": 3.045349359611623,
"learning_rate": 1.6490405735316183e-06,
"loss": 0.9508,
"step": 2037
},
{
"epoch": 0.83,
"grad_norm": 2.6643754299816784,
"learning_rate": 1.6487144595601681e-06,
"loss": 0.9379,
"step": 2038
},
{
"epoch": 0.83,
"grad_norm": 3.226897244201286,
"learning_rate": 1.6483882264246371e-06,
"loss": 0.8397,
"step": 2039
},
{
"epoch": 0.83,
"grad_norm": 2.944568239280113,
"learning_rate": 1.6480618741849515e-06,
"loss": 0.8434,
"step": 2040
},
{
"epoch": 0.83,
"grad_norm": 3.335200680723088,
"learning_rate": 1.6477354029010603e-06,
"loss": 0.8299,
"step": 2041
},
{
"epoch": 0.83,
"grad_norm": 2.8908463979295353,
"learning_rate": 1.6474088126329336e-06,
"loss": 0.8873,
"step": 2042
},
{
"epoch": 0.83,
"grad_norm": 2.7691100156583586,
"learning_rate": 1.6470821034405637e-06,
"loss": 0.8986,
"step": 2043
},
{
"epoch": 0.83,
"grad_norm": 2.3632544951101826,
"learning_rate": 1.6467552753839648e-06,
"loss": 0.9162,
"step": 2044
},
{
"epoch": 0.83,
"grad_norm": 2.434072333518799,
"learning_rate": 1.6464283285231727e-06,
"loss": 0.9286,
"step": 2045
},
{
"epoch": 0.83,
"grad_norm": 2.8150347329031784,
"learning_rate": 1.646101262918245e-06,
"loss": 0.9134,
"step": 2046
},
{
"epoch": 0.84,
"grad_norm": 3.1541160566076623,
"learning_rate": 1.6457740786292618e-06,
"loss": 0.8548,
"step": 2047
},
{
"epoch": 0.84,
"grad_norm": 2.4837088971857493,
"learning_rate": 1.6454467757163238e-06,
"loss": 1.0346,
"step": 2048
},
{
"epoch": 0.84,
"grad_norm": 2.4813225162437034,
"learning_rate": 1.6451193542395545e-06,
"loss": 0.8945,
"step": 2049
},
{
"epoch": 0.84,
"grad_norm": 3.1750958616715828,
"learning_rate": 1.6447918142590983e-06,
"loss": 0.8443,
"step": 2050
},
{
"epoch": 0.84,
"grad_norm": 3.031092135502076,
"learning_rate": 1.644464155835123e-06,
"loss": 1.0246,
"step": 2051
},
{
"epoch": 0.84,
"grad_norm": 3.7276701999740336,
"learning_rate": 1.644136379027816e-06,
"loss": 0.8843,
"step": 2052
},
{
"epoch": 0.84,
"grad_norm": 3.500023378991977,
"learning_rate": 1.6438084838973878e-06,
"loss": 0.8759,
"step": 2053
},
{
"epoch": 0.84,
"grad_norm": 3.5721169298079594,
"learning_rate": 1.6434804705040705e-06,
"loss": 0.8545,
"step": 2054
},
{
"epoch": 0.84,
"grad_norm": 2.7200136630622804,
"learning_rate": 1.6431523389081177e-06,
"loss": 0.9811,
"step": 2055
},
{
"epoch": 0.84,
"grad_norm": 2.6492133004046816,
"learning_rate": 1.6428240891698045e-06,
"loss": 0.8935,
"step": 2056
},
{
"epoch": 0.84,
"grad_norm": 2.781226833705946,
"learning_rate": 1.6424957213494282e-06,
"loss": 0.979,
"step": 2057
},
{
"epoch": 0.84,
"grad_norm": 2.9840288240769253,
"learning_rate": 1.6421672355073075e-06,
"loss": 0.8744,
"step": 2058
},
{
"epoch": 0.84,
"grad_norm": 3.0144377881564277,
"learning_rate": 1.641838631703783e-06,
"loss": 0.86,
"step": 2059
},
{
"epoch": 0.84,
"grad_norm": 2.4290045187821434,
"learning_rate": 1.6415099099992165e-06,
"loss": 0.896,
"step": 2060
},
{
"epoch": 0.84,
"grad_norm": 2.7484955089509224,
"learning_rate": 1.6411810704539919e-06,
"loss": 0.9446,
"step": 2061
},
{
"epoch": 0.84,
"grad_norm": 4.034934996279932,
"learning_rate": 1.6408521131285147e-06,
"loss": 0.8924,
"step": 2062
},
{
"epoch": 0.84,
"grad_norm": 3.0508269248812914,
"learning_rate": 1.6405230380832118e-06,
"loss": 0.861,
"step": 2063
},
{
"epoch": 0.84,
"grad_norm": 2.7740103433290617,
"learning_rate": 1.640193845378532e-06,
"loss": 0.8822,
"step": 2064
},
{
"epoch": 0.84,
"grad_norm": 2.92693968986525,
"learning_rate": 1.6398645350749454e-06,
"loss": 0.8718,
"step": 2065
},
{
"epoch": 0.84,
"grad_norm": 3.522213939542427,
"learning_rate": 1.6395351072329441e-06,
"loss": 0.8885,
"step": 2066
},
{
"epoch": 0.84,
"grad_norm": 2.8658397986608097,
"learning_rate": 1.6392055619130417e-06,
"loss": 0.9278,
"step": 2067
},
{
"epoch": 0.84,
"grad_norm": 2.584238179237454,
"learning_rate": 1.6388758991757725e-06,
"loss": 0.8405,
"step": 2068
},
{
"epoch": 0.84,
"grad_norm": 2.514564391778897,
"learning_rate": 1.638546119081694e-06,
"loss": 0.9162,
"step": 2069
},
{
"epoch": 0.84,
"grad_norm": 3.0352439266853835,
"learning_rate": 1.6382162216913842e-06,
"loss": 0.9402,
"step": 2070
},
{
"epoch": 0.85,
"grad_norm": 2.995775416177161,
"learning_rate": 1.6378862070654425e-06,
"loss": 0.8839,
"step": 2071
},
{
"epoch": 0.85,
"grad_norm": 4.096642330178395,
"learning_rate": 1.63755607526449e-06,
"loss": 0.9098,
"step": 2072
},
{
"epoch": 0.85,
"grad_norm": 3.307579349198495,
"learning_rate": 1.6372258263491703e-06,
"loss": 0.9267,
"step": 2073
},
{
"epoch": 0.85,
"grad_norm": 2.935830723413875,
"learning_rate": 1.6368954603801471e-06,
"loss": 0.8385,
"step": 2074
},
{
"epoch": 0.85,
"grad_norm": 2.6149614530304066,
"learning_rate": 1.6365649774181063e-06,
"loss": 0.8913,
"step": 2075
},
{
"epoch": 0.85,
"grad_norm": 3.8295400900489507,
"learning_rate": 1.6362343775237548e-06,
"loss": 0.8651,
"step": 2076
},
{
"epoch": 0.85,
"grad_norm": 3.173002783823531,
"learning_rate": 1.6359036607578224e-06,
"loss": 0.929,
"step": 2077
},
{
"epoch": 0.85,
"grad_norm": 3.4531672420098976,
"learning_rate": 1.6355728271810584e-06,
"loss": 0.8618,
"step": 2078
},
{
"epoch": 0.85,
"grad_norm": 3.091599631402553,
"learning_rate": 1.635241876854235e-06,
"loss": 0.8941,
"step": 2079
},
{
"epoch": 0.85,
"grad_norm": 3.0857444455509464,
"learning_rate": 1.6349108098381448e-06,
"loss": 0.9144,
"step": 2080
},
{
"epoch": 0.85,
"grad_norm": 3.1124586884314107,
"learning_rate": 1.634579626193603e-06,
"loss": 0.8545,
"step": 2081
},
{
"epoch": 0.85,
"grad_norm": 3.24738463204807,
"learning_rate": 1.6342483259814452e-06,
"loss": 0.9157,
"step": 2082
},
{
"epoch": 0.85,
"grad_norm": 2.914587863594466,
"learning_rate": 1.6339169092625291e-06,
"loss": 0.8991,
"step": 2083
},
{
"epoch": 0.85,
"grad_norm": 3.084649003663344,
"learning_rate": 1.6335853760977334e-06,
"loss": 0.8542,
"step": 2084
},
{
"epoch": 0.85,
"grad_norm": 2.9255677572103127,
"learning_rate": 1.633253726547958e-06,
"loss": 0.8505,
"step": 2085
},
{
"epoch": 0.85,
"grad_norm": 2.767198761091259,
"learning_rate": 1.632921960674125e-06,
"loss": 0.956,
"step": 2086
},
{
"epoch": 0.85,
"grad_norm": 3.3823571676882334,
"learning_rate": 1.6325900785371771e-06,
"loss": 0.8477,
"step": 2087
},
{
"epoch": 0.85,
"grad_norm": 2.6383657896551367,
"learning_rate": 1.632258080198079e-06,
"loss": 0.8751,
"step": 2088
},
{
"epoch": 0.85,
"grad_norm": 2.600982440050627,
"learning_rate": 1.6319259657178152e-06,
"loss": 0.8716,
"step": 2089
},
{
"epoch": 0.85,
"grad_norm": 2.8095015717434,
"learning_rate": 1.631593735157394e-06,
"loss": 0.8776,
"step": 2090
},
{
"epoch": 0.85,
"grad_norm": 2.8565118632285253,
"learning_rate": 1.6312613885778432e-06,
"loss": 0.928,
"step": 2091
},
{
"epoch": 0.85,
"grad_norm": 2.9368721322625495,
"learning_rate": 1.6309289260402123e-06,
"loss": 0.8479,
"step": 2092
},
{
"epoch": 0.85,
"grad_norm": 2.930291801361324,
"learning_rate": 1.6305963476055723e-06,
"loss": 0.8624,
"step": 2093
},
{
"epoch": 0.85,
"grad_norm": 2.727173273080247,
"learning_rate": 1.6302636533350156e-06,
"loss": 0.8731,
"step": 2094
},
{
"epoch": 0.85,
"grad_norm": 3.1437286722373528,
"learning_rate": 1.6299308432896552e-06,
"loss": 0.8837,
"step": 2095
},
{
"epoch": 0.86,
"grad_norm": 2.6692324028061702,
"learning_rate": 1.6295979175306265e-06,
"loss": 0.9547,
"step": 2096
},
{
"epoch": 0.86,
"grad_norm": 2.7444046981654946,
"learning_rate": 1.6292648761190853e-06,
"loss": 0.8878,
"step": 2097
},
{
"epoch": 0.86,
"grad_norm": 3.5127051787976584,
"learning_rate": 1.6289317191162086e-06,
"loss": 0.8481,
"step": 2098
},
{
"epoch": 0.86,
"grad_norm": 3.327259748871107,
"learning_rate": 1.6285984465831954e-06,
"loss": 0.9208,
"step": 2099
},
{
"epoch": 0.86,
"grad_norm": 2.783568092237267,
"learning_rate": 1.628265058581265e-06,
"loss": 0.8629,
"step": 2100
},
{
"epoch": 0.86,
"eval_loss": 0.8921508193016052,
"eval_runtime": 465.6913,
"eval_samples_per_second": 74.839,
"eval_steps_per_second": 4.679,
"step": 2100
},
{
"epoch": 0.86,
"grad_norm": 3.0812676781734827,
"learning_rate": 1.6279315551716581e-06,
"loss": 0.9052,
"step": 2101
},
{
"epoch": 0.86,
"grad_norm": 2.956169988221401,
"learning_rate": 1.6275979364156374e-06,
"loss": 0.884,
"step": 2102
},
{
"epoch": 0.86,
"grad_norm": 3.30943408977074,
"learning_rate": 1.6272642023744858e-06,
"loss": 0.9563,
"step": 2103
},
{
"epoch": 0.86,
"grad_norm": 3.2327278758691746,
"learning_rate": 1.626930353109508e-06,
"loss": 0.8748,
"step": 2104
},
{
"epoch": 0.86,
"grad_norm": 3.9150786068824495,
"learning_rate": 1.6265963886820297e-06,
"loss": 0.8493,
"step": 2105
},
{
"epoch": 0.86,
"grad_norm": 3.149671225959013,
"learning_rate": 1.6262623091533978e-06,
"loss": 0.8558,
"step": 2106
},
{
"epoch": 0.86,
"grad_norm": 2.7145623021545813,
"learning_rate": 1.6259281145849801e-06,
"loss": 0.884,
"step": 2107
},
{
"epoch": 0.86,
"grad_norm": 2.7500040657866043,
"learning_rate": 1.6255938050381652e-06,
"loss": 0.8871,
"step": 2108
},
{
"epoch": 0.86,
"grad_norm": 3.578567968071669,
"learning_rate": 1.625259380574364e-06,
"loss": 0.89,
"step": 2109
},
{
"epoch": 0.86,
"grad_norm": 3.3427768124877013,
"learning_rate": 1.6249248412550079e-06,
"loss": 0.8918,
"step": 2110
},
{
"epoch": 0.86,
"grad_norm": 2.6964012427249426,
"learning_rate": 1.624590187141549e-06,
"loss": 0.8546,
"step": 2111
},
{
"epoch": 0.86,
"grad_norm": 2.7173624204041573,
"learning_rate": 1.6242554182954605e-06,
"loss": 0.8714,
"step": 2112
},
{
"epoch": 0.86,
"grad_norm": 2.7820168971670265,
"learning_rate": 1.6239205347782371e-06,
"loss": 0.9158,
"step": 2113
},
{
"epoch": 0.86,
"grad_norm": 2.9690168032730426,
"learning_rate": 1.623585536651395e-06,
"loss": 0.8375,
"step": 2114
},
{
"epoch": 0.86,
"grad_norm": 3.3910380119276575,
"learning_rate": 1.6232504239764706e-06,
"loss": 0.874,
"step": 2115
},
{
"epoch": 0.86,
"grad_norm": 2.976796376053272,
"learning_rate": 1.6229151968150215e-06,
"loss": 0.9121,
"step": 2116
},
{
"epoch": 0.86,
"grad_norm": 2.8634928901170213,
"learning_rate": 1.6225798552286265e-06,
"loss": 0.857,
"step": 2117
},
{
"epoch": 0.86,
"grad_norm": 2.8231552816602963,
"learning_rate": 1.6222443992788853e-06,
"loss": 0.905,
"step": 2118
},
{
"epoch": 0.86,
"grad_norm": 3.0279326806056575,
"learning_rate": 1.621908829027419e-06,
"loss": 0.9541,
"step": 2119
},
{
"epoch": 0.87,
"grad_norm": 2.7869184573252284,
"learning_rate": 1.621573144535869e-06,
"loss": 0.9405,
"step": 2120
},
{
"epoch": 0.87,
"grad_norm": 2.4070274864578454,
"learning_rate": 1.6212373458658984e-06,
"loss": 0.9372,
"step": 2121
},
{
"epoch": 0.87,
"grad_norm": 2.702760330504293,
"learning_rate": 1.620901433079191e-06,
"loss": 0.8733,
"step": 2122
},
{
"epoch": 0.87,
"grad_norm": 2.6522187681554996,
"learning_rate": 1.6205654062374512e-06,
"loss": 0.8626,
"step": 2123
},
{
"epoch": 0.87,
"grad_norm": 2.5426324583415356,
"learning_rate": 1.6202292654024045e-06,
"loss": 0.8997,
"step": 2124
},
{
"epoch": 0.87,
"grad_norm": 2.6854578565861105,
"learning_rate": 1.6198930106357982e-06,
"loss": 0.8659,
"step": 2125
},
{
"epoch": 0.87,
"grad_norm": 2.4810283788475815,
"learning_rate": 1.6195566419993992e-06,
"loss": 0.8866,
"step": 2126
},
{
"epoch": 0.87,
"grad_norm": 2.9939844352867566,
"learning_rate": 1.6192201595549964e-06,
"loss": 0.9102,
"step": 2127
},
{
"epoch": 0.87,
"grad_norm": 2.5482553767980582,
"learning_rate": 1.6188835633643987e-06,
"loss": 0.864,
"step": 2128
},
{
"epoch": 0.87,
"grad_norm": 2.8285659469451847,
"learning_rate": 1.6185468534894367e-06,
"loss": 0.8603,
"step": 2129
},
{
"epoch": 0.87,
"grad_norm": 2.4287381675048434,
"learning_rate": 1.6182100299919614e-06,
"loss": 0.9942,
"step": 2130
},
{
"epoch": 0.87,
"grad_norm": 2.800259272382821,
"learning_rate": 1.617873092933845e-06,
"loss": 0.8238,
"step": 2131
},
{
"epoch": 0.87,
"grad_norm": 2.9784752367596563,
"learning_rate": 1.61753604237698e-06,
"loss": 0.8366,
"step": 2132
},
{
"epoch": 0.87,
"grad_norm": 2.5906117611750883,
"learning_rate": 1.6171988783832804e-06,
"loss": 0.8864,
"step": 2133
},
{
"epoch": 0.87,
"grad_norm": 2.4910663017611383,
"learning_rate": 1.6168616010146802e-06,
"loss": 0.8909,
"step": 2134
},
{
"epoch": 0.87,
"grad_norm": 2.8107032811825685,
"learning_rate": 1.6165242103331358e-06,
"loss": 0.8761,
"step": 2135
},
{
"epoch": 0.87,
"grad_norm": 2.695901115409725,
"learning_rate": 1.6161867064006228e-06,
"loss": 0.8802,
"step": 2136
},
{
"epoch": 0.87,
"grad_norm": 3.1158575294300146,
"learning_rate": 1.615849089279138e-06,
"loss": 0.8695,
"step": 2137
},
{
"epoch": 0.87,
"grad_norm": 2.832074682049337,
"learning_rate": 1.615511359030699e-06,
"loss": 0.9187,
"step": 2138
},
{
"epoch": 0.87,
"grad_norm": 2.9325622837041214,
"learning_rate": 1.6151735157173452e-06,
"loss": 0.9023,
"step": 2139
},
{
"epoch": 0.87,
"grad_norm": 3.1457636270635603,
"learning_rate": 1.6148355594011354e-06,
"loss": 0.8474,
"step": 2140
},
{
"epoch": 0.87,
"grad_norm": 2.799900401934471,
"learning_rate": 1.6144974901441497e-06,
"loss": 0.9171,
"step": 2141
},
{
"epoch": 0.87,
"grad_norm": 3.9287159019831517,
"learning_rate": 1.6141593080084893e-06,
"loss": 0.9086,
"step": 2142
},
{
"epoch": 0.87,
"grad_norm": 3.273983922440811,
"learning_rate": 1.613821013056275e-06,
"loss": 0.8696,
"step": 2143
},
{
"epoch": 0.87,
"grad_norm": 3.2661503678102064,
"learning_rate": 1.61348260534965e-06,
"loss": 0.9185,
"step": 2144
},
{
"epoch": 0.88,
"grad_norm": 2.9182126285001107,
"learning_rate": 1.6131440849507768e-06,
"loss": 0.7898,
"step": 2145
},
{
"epoch": 0.88,
"grad_norm": 2.70601037927509,
"learning_rate": 1.6128054519218393e-06,
"loss": 0.9439,
"step": 2146
},
{
"epoch": 0.88,
"grad_norm": 2.7217848018407804,
"learning_rate": 1.6124667063250414e-06,
"loss": 0.8755,
"step": 2147
},
{
"epoch": 0.88,
"grad_norm": 2.4427722359573947,
"learning_rate": 1.6121278482226092e-06,
"loss": 0.8546,
"step": 2148
},
{
"epoch": 0.88,
"grad_norm": 2.984906891935736,
"learning_rate": 1.6117888776767873e-06,
"loss": 0.8525,
"step": 2149
},
{
"epoch": 0.88,
"grad_norm": 3.132500900394273,
"learning_rate": 1.6114497947498428e-06,
"loss": 0.8053,
"step": 2150
},
{
"epoch": 0.88,
"grad_norm": 2.97930352813734,
"learning_rate": 1.6111105995040627e-06,
"loss": 0.9876,
"step": 2151
},
{
"epoch": 0.88,
"grad_norm": 2.3160794519372074,
"learning_rate": 1.6107712920017544e-06,
"loss": 0.9734,
"step": 2152
},
{
"epoch": 0.88,
"grad_norm": 2.8744602832979362,
"learning_rate": 1.6104318723052464e-06,
"loss": 0.8581,
"step": 2153
},
{
"epoch": 0.88,
"grad_norm": 2.5625233365454285,
"learning_rate": 1.6100923404768874e-06,
"loss": 0.9289,
"step": 2154
},
{
"epoch": 0.88,
"grad_norm": 2.8438378837630687,
"learning_rate": 1.6097526965790471e-06,
"loss": 0.8654,
"step": 2155
},
{
"epoch": 0.88,
"grad_norm": 3.4237661549777174,
"learning_rate": 1.6094129406741158e-06,
"loss": 0.9356,
"step": 2156
},
{
"epoch": 0.88,
"grad_norm": 2.592251173595049,
"learning_rate": 1.6090730728245037e-06,
"loss": 0.8544,
"step": 2157
},
{
"epoch": 0.88,
"grad_norm": 2.6985344894109153,
"learning_rate": 1.6087330930926424e-06,
"loss": 0.9375,
"step": 2158
},
{
"epoch": 0.88,
"grad_norm": 3.4921834620352277,
"learning_rate": 1.6083930015409833e-06,
"loss": 0.8063,
"step": 2159
},
{
"epoch": 0.88,
"grad_norm": 3.637209411049718,
"learning_rate": 1.608052798231999e-06,
"loss": 0.8118,
"step": 2160
},
{
"epoch": 0.88,
"grad_norm": 3.1608248436692303,
"learning_rate": 1.6077124832281821e-06,
"loss": 0.8474,
"step": 2161
},
{
"epoch": 0.88,
"grad_norm": 2.8959421952584705,
"learning_rate": 1.6073720565920463e-06,
"loss": 0.8887,
"step": 2162
},
{
"epoch": 0.88,
"grad_norm": 2.95297628613181,
"learning_rate": 1.6070315183861257e-06,
"loss": 0.8747,
"step": 2163
},
{
"epoch": 0.88,
"grad_norm": 3.374694203731936,
"learning_rate": 1.6066908686729737e-06,
"loss": 0.8385,
"step": 2164
},
{
"epoch": 0.88,
"grad_norm": 3.1565399040186506,
"learning_rate": 1.606350107515166e-06,
"loss": 0.8514,
"step": 2165
},
{
"epoch": 0.88,
"grad_norm": 3.14041199221672,
"learning_rate": 1.6060092349752977e-06,
"loss": 0.8445,
"step": 2166
},
{
"epoch": 0.88,
"grad_norm": 3.103671184854047,
"learning_rate": 1.6056682511159845e-06,
"loss": 0.8834,
"step": 2167
},
{
"epoch": 0.88,
"grad_norm": 2.9888897681873288,
"learning_rate": 1.6053271559998625e-06,
"loss": 0.8897,
"step": 2168
},
{
"epoch": 0.89,
"grad_norm": 2.526586707993972,
"learning_rate": 1.6049859496895886e-06,
"loss": 0.9141,
"step": 2169
},
{
"epoch": 0.89,
"grad_norm": 3.4462515459833383,
"learning_rate": 1.6046446322478397e-06,
"loss": 0.8899,
"step": 2170
},
{
"epoch": 0.89,
"grad_norm": 2.6801757057580997,
"learning_rate": 1.604303203737314e-06,
"loss": 0.9215,
"step": 2171
},
{
"epoch": 0.89,
"grad_norm": 2.890797347504238,
"learning_rate": 1.6039616642207284e-06,
"loss": 0.873,
"step": 2172
},
{
"epoch": 0.89,
"grad_norm": 3.087791144895048,
"learning_rate": 1.6036200137608216e-06,
"loss": 0.9012,
"step": 2173
},
{
"epoch": 0.89,
"grad_norm": 3.429210988653777,
"learning_rate": 1.6032782524203524e-06,
"loss": 0.8873,
"step": 2174
},
{
"epoch": 0.89,
"grad_norm": 2.4514418769151294,
"learning_rate": 1.6029363802621e-06,
"loss": 0.8621,
"step": 2175
},
{
"epoch": 0.89,
"grad_norm": 2.89641935150371,
"learning_rate": 1.6025943973488635e-06,
"loss": 0.9448,
"step": 2176
},
{
"epoch": 0.89,
"grad_norm": 2.883961443283617,
"learning_rate": 1.6022523037434624e-06,
"loss": 0.8793,
"step": 2177
},
{
"epoch": 0.89,
"grad_norm": 3.723825796382935,
"learning_rate": 1.6019100995087373e-06,
"loss": 0.8108,
"step": 2178
},
{
"epoch": 0.89,
"grad_norm": 2.5571352194412085,
"learning_rate": 1.6015677847075483e-06,
"loss": 0.872,
"step": 2179
},
{
"epoch": 0.89,
"grad_norm": 2.45416737946924,
"learning_rate": 1.6012253594027764e-06,
"loss": 0.8391,
"step": 2180
},
{
"epoch": 0.89,
"grad_norm": 2.77970901574232,
"learning_rate": 1.6008828236573224e-06,
"loss": 0.917,
"step": 2181
},
{
"epoch": 0.89,
"grad_norm": 2.888034786413823,
"learning_rate": 1.6005401775341078e-06,
"loss": 0.8903,
"step": 2182
},
{
"epoch": 0.89,
"grad_norm": 3.0117217761658805,
"learning_rate": 1.6001974210960737e-06,
"loss": 0.907,
"step": 2183
},
{
"epoch": 0.89,
"grad_norm": 3.353425481787497,
"learning_rate": 1.5998545544061824e-06,
"loss": 0.89,
"step": 2184
},
{
"epoch": 0.89,
"grad_norm": 2.6054193662382907,
"learning_rate": 1.5995115775274157e-06,
"loss": 0.878,
"step": 2185
},
{
"epoch": 0.89,
"grad_norm": 2.8911160375539344,
"learning_rate": 1.5991684905227762e-06,
"loss": 0.9395,
"step": 2186
},
{
"epoch": 0.89,
"grad_norm": 2.802065993377508,
"learning_rate": 1.5988252934552864e-06,
"loss": 0.9626,
"step": 2187
},
{
"epoch": 0.89,
"grad_norm": 2.5110631319438257,
"learning_rate": 1.598481986387989e-06,
"loss": 0.8123,
"step": 2188
},
{
"epoch": 0.89,
"grad_norm": 2.489169257517133,
"learning_rate": 1.5981385693839472e-06,
"loss": 0.8889,
"step": 2189
},
{
"epoch": 0.89,
"grad_norm": 3.063423842912826,
"learning_rate": 1.5977950425062438e-06,
"loss": 0.9025,
"step": 2190
},
{
"epoch": 0.89,
"grad_norm": 2.652544233155348,
"learning_rate": 1.5974514058179822e-06,
"loss": 0.9672,
"step": 2191
},
{
"epoch": 0.89,
"grad_norm": 2.7095034329037357,
"learning_rate": 1.5971076593822864e-06,
"loss": 0.8761,
"step": 2192
},
{
"epoch": 0.89,
"grad_norm": 2.7227517494075526,
"learning_rate": 1.5967638032622994e-06,
"loss": 0.8593,
"step": 2193
},
{
"epoch": 0.9,
"grad_norm": 2.898266771279744,
"learning_rate": 1.5964198375211857e-06,
"loss": 0.9214,
"step": 2194
},
{
"epoch": 0.9,
"grad_norm": 3.026950425386447,
"learning_rate": 1.5960757622221293e-06,
"loss": 0.9168,
"step": 2195
},
{
"epoch": 0.9,
"grad_norm": 2.9590910837260846,
"learning_rate": 1.5957315774283332e-06,
"loss": 0.8502,
"step": 2196
},
{
"epoch": 0.9,
"grad_norm": 3.042912727826984,
"learning_rate": 1.5953872832030234e-06,
"loss": 0.8861,
"step": 2197
},
{
"epoch": 0.9,
"grad_norm": 2.356846082393032,
"learning_rate": 1.5950428796094425e-06,
"loss": 0.8955,
"step": 2198
},
{
"epoch": 0.9,
"grad_norm": 2.8021259955367293,
"learning_rate": 1.594698366710856e-06,
"loss": 0.8161,
"step": 2199
},
{
"epoch": 0.9,
"grad_norm": 2.514245964047131,
"learning_rate": 1.594353744570548e-06,
"loss": 0.9059,
"step": 2200
},
{
"epoch": 0.9,
"eval_loss": 0.8905588388442993,
"eval_runtime": 464.1497,
"eval_samples_per_second": 75.088,
"eval_steps_per_second": 4.695,
"step": 2200
},
{
"epoch": 0.9,
"grad_norm": 2.3822053398468497,
"learning_rate": 1.594009013251823e-06,
"loss": 0.836,
"step": 2201
},
{
"epoch": 0.9,
"grad_norm": 2.834639554773577,
"learning_rate": 1.5936641728180061e-06,
"loss": 0.8557,
"step": 2202
},
{
"epoch": 0.9,
"grad_norm": 3.370600957082851,
"learning_rate": 1.593319223332441e-06,
"loss": 0.8797,
"step": 2203
},
{
"epoch": 0.9,
"grad_norm": 2.782780985079358,
"learning_rate": 1.5929741648584934e-06,
"loss": 0.9028,
"step": 2204
},
{
"epoch": 0.9,
"grad_norm": 3.393000266295313,
"learning_rate": 1.5926289974595472e-06,
"loss": 0.8852,
"step": 2205
},
{
"epoch": 0.9,
"grad_norm": 2.446406918278143,
"learning_rate": 1.5922837211990078e-06,
"loss": 0.889,
"step": 2206
},
{
"epoch": 0.9,
"grad_norm": 2.8062128415337373,
"learning_rate": 1.5919383361402995e-06,
"loss": 0.8496,
"step": 2207
},
{
"epoch": 0.9,
"grad_norm": 2.9957719866692534,
"learning_rate": 1.591592842346867e-06,
"loss": 0.8173,
"step": 2208
},
{
"epoch": 0.9,
"grad_norm": 2.554308075685344,
"learning_rate": 1.591247239882175e-06,
"loss": 0.829,
"step": 2209
},
{
"epoch": 0.9,
"grad_norm": 2.6854340140313955,
"learning_rate": 1.5909015288097086e-06,
"loss": 0.9276,
"step": 2210
},
{
"epoch": 0.9,
"grad_norm": 3.079099702644832,
"learning_rate": 1.5905557091929715e-06,
"loss": 0.8656,
"step": 2211
},
{
"epoch": 0.9,
"grad_norm": 2.760671150176172,
"learning_rate": 1.5902097810954888e-06,
"loss": 0.9242,
"step": 2212
},
{
"epoch": 0.9,
"grad_norm": 2.8434276030210652,
"learning_rate": 1.5898637445808052e-06,
"loss": 0.9041,
"step": 2213
},
{
"epoch": 0.9,
"grad_norm": 2.765383708056856,
"learning_rate": 1.5895175997124843e-06,
"loss": 0.9408,
"step": 2214
},
{
"epoch": 0.9,
"grad_norm": 2.418366294232195,
"learning_rate": 1.589171346554111e-06,
"loss": 0.903,
"step": 2215
},
{
"epoch": 0.9,
"grad_norm": 2.929303218362304,
"learning_rate": 1.588824985169289e-06,
"loss": 0.8768,
"step": 2216
},
{
"epoch": 0.9,
"grad_norm": 2.5696787436017057,
"learning_rate": 1.5884785156216432e-06,
"loss": 0.8889,
"step": 2217
},
{
"epoch": 0.91,
"grad_norm": 2.547936584197333,
"learning_rate": 1.5881319379748163e-06,
"loss": 0.8911,
"step": 2218
},
{
"epoch": 0.91,
"grad_norm": 2.9578709904111893,
"learning_rate": 1.587785252292473e-06,
"loss": 0.8903,
"step": 2219
},
{
"epoch": 0.91,
"grad_norm": 2.7117494831406614,
"learning_rate": 1.5874384586382967e-06,
"loss": 0.9318,
"step": 2220
},
{
"epoch": 0.91,
"grad_norm": 2.9308026247002004,
"learning_rate": 1.5870915570759907e-06,
"loss": 0.862,
"step": 2221
},
{
"epoch": 0.91,
"grad_norm": 2.489195993999161,
"learning_rate": 1.5867445476692777e-06,
"loss": 0.9093,
"step": 2222
},
{
"epoch": 0.91,
"grad_norm": 3.606514586270724,
"learning_rate": 1.586397430481902e-06,
"loss": 0.8814,
"step": 2223
},
{
"epoch": 0.91,
"grad_norm": 2.8447895290884744,
"learning_rate": 1.5860502055776257e-06,
"loss": 0.8822,
"step": 2224
},
{
"epoch": 0.91,
"grad_norm": 2.5763716055438217,
"learning_rate": 1.5857028730202314e-06,
"loss": 0.9012,
"step": 2225
},
{
"epoch": 0.91,
"grad_norm": 3.1435938248105733,
"learning_rate": 1.5853554328735222e-06,
"loss": 0.875,
"step": 2226
},
{
"epoch": 0.91,
"grad_norm": 2.5171994104563455,
"learning_rate": 1.5850078852013198e-06,
"loss": 0.8084,
"step": 2227
},
{
"epoch": 0.91,
"grad_norm": 2.2802515135984294,
"learning_rate": 1.584660230067466e-06,
"loss": 0.8584,
"step": 2228
},
{
"epoch": 0.91,
"grad_norm": 2.70149111931541,
"learning_rate": 1.5843124675358227e-06,
"loss": 0.8689,
"step": 2229
},
{
"epoch": 0.91,
"grad_norm": 3.0807737256238723,
"learning_rate": 1.5839645976702714e-06,
"loss": 0.8788,
"step": 2230
},
{
"epoch": 0.91,
"grad_norm": 2.657241981834731,
"learning_rate": 1.583616620534713e-06,
"loss": 0.8522,
"step": 2231
},
{
"epoch": 0.91,
"grad_norm": 2.839629733059881,
"learning_rate": 1.5832685361930688e-06,
"loss": 0.8594,
"step": 2232
},
{
"epoch": 0.91,
"grad_norm": 2.6514713666182326,
"learning_rate": 1.582920344709279e-06,
"loss": 0.8663,
"step": 2233
},
{
"epoch": 0.91,
"grad_norm": 2.6805151043460476,
"learning_rate": 1.582572046147304e-06,
"loss": 0.9004,
"step": 2234
},
{
"epoch": 0.91,
"grad_norm": 3.0303723610212843,
"learning_rate": 1.582223640571123e-06,
"loss": 0.9039,
"step": 2235
},
{
"epoch": 0.91,
"grad_norm": 2.30597496497862,
"learning_rate": 1.5818751280447367e-06,
"loss": 0.8773,
"step": 2236
},
{
"epoch": 0.91,
"grad_norm": 2.5405553260084983,
"learning_rate": 1.5815265086321636e-06,
"loss": 0.8571,
"step": 2237
},
{
"epoch": 0.91,
"grad_norm": 3.1981578455179087,
"learning_rate": 1.5811777823974426e-06,
"loss": 0.8954,
"step": 2238
},
{
"epoch": 0.91,
"grad_norm": 2.69412376036328,
"learning_rate": 1.580828949404632e-06,
"loss": 0.8793,
"step": 2239
},
{
"epoch": 0.91,
"grad_norm": 2.85808271101662,
"learning_rate": 1.5804800097178102e-06,
"loss": 0.9606,
"step": 2240
},
{
"epoch": 0.91,
"grad_norm": 2.4685626916665604,
"learning_rate": 1.5801309634010744e-06,
"loss": 0.9118,
"step": 2241
},
{
"epoch": 0.91,
"grad_norm": 3.767339641477171,
"learning_rate": 1.5797818105185425e-06,
"loss": 0.9049,
"step": 2242
},
{
"epoch": 0.92,
"grad_norm": 2.710630509442484,
"learning_rate": 1.5794325511343507e-06,
"loss": 0.9326,
"step": 2243
},
{
"epoch": 0.92,
"grad_norm": 3.054123407243192,
"learning_rate": 1.5790831853126557e-06,
"loss": 0.9132,
"step": 2244
},
{
"epoch": 0.92,
"grad_norm": 2.420403355316319,
"learning_rate": 1.5787337131176334e-06,
"loss": 0.8747,
"step": 2245
},
{
"epoch": 0.92,
"grad_norm": 2.865755036587207,
"learning_rate": 1.5783841346134791e-06,
"loss": 0.9116,
"step": 2246
},
{
"epoch": 0.92,
"grad_norm": 2.5254541130138644,
"learning_rate": 1.5780344498644082e-06,
"loss": 0.7733,
"step": 2247
},
{
"epoch": 0.92,
"grad_norm": 3.048460529137758,
"learning_rate": 1.5776846589346549e-06,
"loss": 0.8672,
"step": 2248
},
{
"epoch": 0.92,
"grad_norm": 2.7913043126738892,
"learning_rate": 1.577334761888473e-06,
"loss": 1.0177,
"step": 2249
},
{
"epoch": 0.92,
"grad_norm": 3.3281024206473844,
"learning_rate": 1.5769847587901368e-06,
"loss": 0.8642,
"step": 2250
},
{
"epoch": 0.92,
"grad_norm": 2.98054313031752,
"learning_rate": 1.5766346497039385e-06,
"loss": 0.8651,
"step": 2251
},
{
"epoch": 0.92,
"grad_norm": 3.179362972871479,
"learning_rate": 1.5762844346941908e-06,
"loss": 0.8474,
"step": 2252
},
{
"epoch": 0.92,
"grad_norm": 2.5647077858900733,
"learning_rate": 1.5759341138252259e-06,
"loss": 0.7723,
"step": 2253
},
{
"epoch": 0.92,
"grad_norm": 3.9846386928161555,
"learning_rate": 1.5755836871613947e-06,
"loss": 0.9387,
"step": 2254
},
{
"epoch": 0.92,
"grad_norm": 2.822954768931439,
"learning_rate": 1.5752331547670688e-06,
"loss": 0.9228,
"step": 2255
},
{
"epoch": 0.92,
"grad_norm": 2.6962390036122326,
"learning_rate": 1.5748825167066375e-06,
"loss": 0.8669,
"step": 2256
},
{
"epoch": 0.92,
"grad_norm": 3.061863373847459,
"learning_rate": 1.5745317730445108e-06,
"loss": 0.9483,
"step": 2257
},
{
"epoch": 0.92,
"grad_norm": 2.6258790575209225,
"learning_rate": 1.5741809238451178e-06,
"loss": 0.8431,
"step": 2258
},
{
"epoch": 0.92,
"grad_norm": 3.4398858218150674,
"learning_rate": 1.5738299691729068e-06,
"loss": 0.961,
"step": 2259
},
{
"epoch": 0.92,
"grad_norm": 2.7868898881986603,
"learning_rate": 1.573478909092346e-06,
"loss": 0.8388,
"step": 2260
},
{
"epoch": 0.92,
"grad_norm": 2.4801792380287386,
"learning_rate": 1.573127743667922e-06,
"loss": 0.8571,
"step": 2261
},
{
"epoch": 0.92,
"grad_norm": 3.6679352644968954,
"learning_rate": 1.5727764729641418e-06,
"loss": 0.8748,
"step": 2262
},
{
"epoch": 0.92,
"grad_norm": 2.407559638013297,
"learning_rate": 1.5724250970455306e-06,
"loss": 0.9002,
"step": 2263
},
{
"epoch": 0.92,
"grad_norm": 2.8671015519582257,
"learning_rate": 1.5720736159766343e-06,
"loss": 0.8768,
"step": 2264
},
{
"epoch": 0.92,
"grad_norm": 3.0510900879692127,
"learning_rate": 1.571722029822017e-06,
"loss": 0.8586,
"step": 2265
},
{
"epoch": 0.92,
"grad_norm": 3.1747317334121576,
"learning_rate": 1.5713703386462625e-06,
"loss": 0.8593,
"step": 2266
},
{
"epoch": 0.93,
"grad_norm": 2.917413992252316,
"learning_rate": 1.5710185425139739e-06,
"loss": 0.9532,
"step": 2267
},
{
"epoch": 0.93,
"grad_norm": 2.559650340247892,
"learning_rate": 1.5706666414897738e-06,
"loss": 0.9318,
"step": 2268
},
{
"epoch": 0.93,
"grad_norm": 2.4830186774095346,
"learning_rate": 1.5703146356383036e-06,
"loss": 0.93,
"step": 2269
},
{
"epoch": 0.93,
"grad_norm": 3.0846117844774943,
"learning_rate": 1.5699625250242242e-06,
"loss": 0.9451,
"step": 2270
},
{
"epoch": 0.93,
"grad_norm": 2.699329434312517,
"learning_rate": 1.5696103097122156e-06,
"loss": 0.8656,
"step": 2271
},
{
"epoch": 0.93,
"grad_norm": 2.7652189311277207,
"learning_rate": 1.5692579897669772e-06,
"loss": 0.9774,
"step": 2272
},
{
"epoch": 0.93,
"grad_norm": 4.017389654439979,
"learning_rate": 1.5689055652532282e-06,
"loss": 0.896,
"step": 2273
},
{
"epoch": 0.93,
"grad_norm": 2.502785589812816,
"learning_rate": 1.5685530362357056e-06,
"loss": 0.8397,
"step": 2274
},
{
"epoch": 0.93,
"grad_norm": 2.5651974318917943,
"learning_rate": 1.5682004027791668e-06,
"loss": 0.8889,
"step": 2275
},
{
"epoch": 0.93,
"grad_norm": 2.945803742599342,
"learning_rate": 1.5678476649483878e-06,
"loss": 0.9312,
"step": 2276
},
{
"epoch": 0.93,
"grad_norm": 2.9393904644101396,
"learning_rate": 1.5674948228081642e-06,
"loss": 0.9166,
"step": 2277
},
{
"epoch": 0.93,
"grad_norm": 2.840877234220877,
"learning_rate": 1.5671418764233103e-06,
"loss": 0.9022,
"step": 2278
},
{
"epoch": 0.93,
"grad_norm": 2.933566396200748,
"learning_rate": 1.5667888258586595e-06,
"loss": 0.8898,
"step": 2279
},
{
"epoch": 0.93,
"grad_norm": 2.55016437363315,
"learning_rate": 1.5664356711790652e-06,
"loss": 0.8802,
"step": 2280
},
{
"epoch": 0.93,
"grad_norm": 2.8437996324877255,
"learning_rate": 1.5660824124493987e-06,
"loss": 0.9626,
"step": 2281
},
{
"epoch": 0.93,
"grad_norm": 4.113793485031338,
"learning_rate": 1.5657290497345518e-06,
"loss": 0.8624,
"step": 2282
},
{
"epoch": 0.93,
"grad_norm": 3.251865177764506,
"learning_rate": 1.565375583099434e-06,
"loss": 0.8835,
"step": 2283
},
{
"epoch": 0.93,
"grad_norm": 2.4133992711727483,
"learning_rate": 1.5650220126089746e-06,
"loss": 0.9192,
"step": 2284
},
{
"epoch": 0.93,
"grad_norm": 2.7475387241111684,
"learning_rate": 1.564668338328122e-06,
"loss": 0.9003,
"step": 2285
},
{
"epoch": 0.93,
"grad_norm": 3.0891460875048646,
"learning_rate": 1.5643145603218436e-06,
"loss": 0.8922,
"step": 2286
},
{
"epoch": 0.93,
"grad_norm": 2.971135441201397,
"learning_rate": 1.5639606786551256e-06,
"loss": 0.7842,
"step": 2287
},
{
"epoch": 0.93,
"grad_norm": 2.5446757946915657,
"learning_rate": 1.5636066933929741e-06,
"loss": 0.867,
"step": 2288
},
{
"epoch": 0.93,
"grad_norm": 2.8259046142375683,
"learning_rate": 1.5632526046004127e-06,
"loss": 0.8927,
"step": 2289
},
{
"epoch": 0.93,
"grad_norm": 2.9365753814451394,
"learning_rate": 1.5628984123424856e-06,
"loss": 0.897,
"step": 2290
},
{
"epoch": 0.93,
"grad_norm": 2.9900733013464347,
"learning_rate": 1.562544116684255e-06,
"loss": 0.8806,
"step": 2291
},
{
"epoch": 0.94,
"grad_norm": 2.9894757638860256,
"learning_rate": 1.5621897176908025e-06,
"loss": 0.831,
"step": 2292
},
{
"epoch": 0.94,
"grad_norm": 2.4661235289626817,
"learning_rate": 1.5618352154272288e-06,
"loss": 0.8849,
"step": 2293
},
{
"epoch": 0.94,
"grad_norm": 2.6488950712366246,
"learning_rate": 1.561480609958653e-06,
"loss": 0.9403,
"step": 2294
},
{
"epoch": 0.94,
"grad_norm": 2.841425059099907,
"learning_rate": 1.5611259013502134e-06,
"loss": 0.9378,
"step": 2295
},
{
"epoch": 0.94,
"grad_norm": 2.761691382270886,
"learning_rate": 1.5607710896670679e-06,
"loss": 0.8661,
"step": 2296
},
{
"epoch": 0.94,
"grad_norm": 2.35213135005833,
"learning_rate": 1.5604161749743923e-06,
"loss": 0.9479,
"step": 2297
},
{
"epoch": 0.94,
"grad_norm": 2.516335110198306,
"learning_rate": 1.5600611573373822e-06,
"loss": 0.9388,
"step": 2298
},
{
"epoch": 0.94,
"grad_norm": 2.590858404052936,
"learning_rate": 1.5597060368212515e-06,
"loss": 0.8613,
"step": 2299
},
{
"epoch": 0.94,
"grad_norm": 2.545911651047052,
"learning_rate": 1.5593508134912334e-06,
"loss": 0.9137,
"step": 2300
},
{
"epoch": 0.94,
"eval_loss": 0.8885095119476318,
"eval_runtime": 465.376,
"eval_samples_per_second": 74.89,
"eval_steps_per_second": 4.682,
"step": 2300
},
{
"epoch": 0.94,
"grad_norm": 2.6905304806926145,
"learning_rate": 1.5589954874125794e-06,
"loss": 0.87,
"step": 2301
},
{
"epoch": 0.94,
"grad_norm": 2.8464689291323007,
"learning_rate": 1.558640058650561e-06,
"loss": 0.8471,
"step": 2302
},
{
"epoch": 0.94,
"grad_norm": 2.8891658106713267,
"learning_rate": 1.5582845272704674e-06,
"loss": 0.9281,
"step": 2303
},
{
"epoch": 0.94,
"grad_norm": 3.1179610475181736,
"learning_rate": 1.557928893337607e-06,
"loss": 0.9458,
"step": 2304
},
{
"epoch": 0.94,
"grad_norm": 2.3928105964731983,
"learning_rate": 1.5575731569173076e-06,
"loss": 0.9343,
"step": 2305
},
{
"epoch": 0.94,
"grad_norm": 2.2501080020275923,
"learning_rate": 1.5572173180749148e-06,
"loss": 0.89,
"step": 2306
},
{
"epoch": 0.94,
"grad_norm": 2.8004842287577043,
"learning_rate": 1.556861376875794e-06,
"loss": 0.8686,
"step": 2307
},
{
"epoch": 0.94,
"grad_norm": 2.958172668672793,
"learning_rate": 1.5565053333853289e-06,
"loss": 0.9209,
"step": 2308
},
{
"epoch": 0.94,
"grad_norm": 2.608862071613659,
"learning_rate": 1.556149187668922e-06,
"loss": 0.9369,
"step": 2309
},
{
"epoch": 0.94,
"grad_norm": 2.879142225412582,
"learning_rate": 1.5557929397919943e-06,
"loss": 0.871,
"step": 2310
},
{
"epoch": 0.94,
"grad_norm": 2.78614748824832,
"learning_rate": 1.5554365898199868e-06,
"loss": 0.8138,
"step": 2311
},
{
"epoch": 0.94,
"grad_norm": 3.225101957328606,
"learning_rate": 1.5550801378183575e-06,
"loss": 0.9139,
"step": 2312
},
{
"epoch": 0.94,
"grad_norm": 3.2071169892079485,
"learning_rate": 1.5547235838525844e-06,
"loss": 0.8465,
"step": 2313
},
{
"epoch": 0.94,
"grad_norm": 2.5591573726798926,
"learning_rate": 1.5543669279881642e-06,
"loss": 0.9138,
"step": 2314
},
{
"epoch": 0.94,
"grad_norm": 2.87778474054276,
"learning_rate": 1.5540101702906112e-06,
"loss": 0.8686,
"step": 2315
},
{
"epoch": 0.95,
"grad_norm": 2.8200058610884677,
"learning_rate": 1.5536533108254597e-06,
"loss": 0.8697,
"step": 2316
},
{
"epoch": 0.95,
"grad_norm": 3.0308448614837302,
"learning_rate": 1.553296349658262e-06,
"loss": 0.8975,
"step": 2317
},
{
"epoch": 0.95,
"grad_norm": 2.7348803142617575,
"learning_rate": 1.552939286854589e-06,
"loss": 0.8544,
"step": 2318
},
{
"epoch": 0.95,
"grad_norm": 2.9582745571129156,
"learning_rate": 1.552582122480031e-06,
"loss": 0.8931,
"step": 2319
},
{
"epoch": 0.95,
"grad_norm": 3.1908003009558894,
"learning_rate": 1.5522248566001962e-06,
"loss": 0.788,
"step": 2320
},
{
"epoch": 0.95,
"grad_norm": 3.6730002566693605,
"learning_rate": 1.551867489280712e-06,
"loss": 0.8627,
"step": 2321
},
{
"epoch": 0.95,
"grad_norm": 3.097817046509011,
"learning_rate": 1.5515100205872238e-06,
"loss": 0.97,
"step": 2322
},
{
"epoch": 0.95,
"grad_norm": 3.4436111308491384,
"learning_rate": 1.5511524505853956e-06,
"loss": 0.8605,
"step": 2323
},
{
"epoch": 0.95,
"grad_norm": 3.2231281026774963,
"learning_rate": 1.5507947793409115e-06,
"loss": 0.8547,
"step": 2324
},
{
"epoch": 0.95,
"grad_norm": 2.5252241295962894,
"learning_rate": 1.550437006919472e-06,
"loss": 0.9188,
"step": 2325
},
{
"epoch": 0.95,
"grad_norm": 4.0439045020141355,
"learning_rate": 1.550079133386798e-06,
"loss": 0.8318,
"step": 2326
},
{
"epoch": 0.95,
"grad_norm": 3.7323865478706595,
"learning_rate": 1.5497211588086279e-06,
"loss": 0.9251,
"step": 2327
},
{
"epoch": 0.95,
"grad_norm": 4.462772603086909,
"learning_rate": 1.549363083250719e-06,
"loss": 0.9057,
"step": 2328
},
{
"epoch": 0.95,
"grad_norm": 2.987491493003971,
"learning_rate": 1.5490049067788471e-06,
"loss": 0.9785,
"step": 2329
},
{
"epoch": 0.95,
"grad_norm": 2.5486178897145315,
"learning_rate": 1.548646629458807e-06,
"loss": 0.8559,
"step": 2330
},
{
"epoch": 0.95,
"grad_norm": 2.6438169912821157,
"learning_rate": 1.548288251356411e-06,
"loss": 0.9751,
"step": 2331
},
{
"epoch": 0.95,
"grad_norm": 3.060225425490553,
"learning_rate": 1.5479297725374907e-06,
"loss": 0.847,
"step": 2332
},
{
"epoch": 0.95,
"grad_norm": 2.6405785013557113,
"learning_rate": 1.5475711930678963e-06,
"loss": 0.9467,
"step": 2333
},
{
"epoch": 0.95,
"grad_norm": 2.983869507121559,
"learning_rate": 1.5472125130134962e-06,
"loss": 0.8739,
"step": 2334
},
{
"epoch": 0.95,
"grad_norm": 2.7544422606580814,
"learning_rate": 1.5468537324401772e-06,
"loss": 0.8975,
"step": 2335
},
{
"epoch": 0.95,
"grad_norm": 2.789098982199318,
"learning_rate": 1.5464948514138442e-06,
"loss": 0.8953,
"step": 2336
},
{
"epoch": 0.95,
"grad_norm": 2.5505582524533206,
"learning_rate": 1.5461358700004218e-06,
"loss": 0.8972,
"step": 2337
},
{
"epoch": 0.95,
"grad_norm": 3.030609849725633,
"learning_rate": 1.5457767882658516e-06,
"loss": 0.8568,
"step": 2338
},
{
"epoch": 0.95,
"grad_norm": 2.742665321664662,
"learning_rate": 1.545417606276095e-06,
"loss": 0.7688,
"step": 2339
},
{
"epoch": 0.95,
"grad_norm": 3.1137182879049043,
"learning_rate": 1.5450583240971304e-06,
"loss": 0.816,
"step": 2340
},
{
"epoch": 0.96,
"grad_norm": 3.217998220372437,
"learning_rate": 1.5446989417949555e-06,
"loss": 0.8945,
"step": 2341
},
{
"epoch": 0.96,
"grad_norm": 2.8655646900854683,
"learning_rate": 1.5443394594355861e-06,
"loss": 0.8426,
"step": 2342
},
{
"epoch": 0.96,
"grad_norm": 2.777912054357338,
"learning_rate": 1.5439798770850566e-06,
"loss": 0.8909,
"step": 2343
},
{
"epoch": 0.96,
"grad_norm": 3.459332653539971,
"learning_rate": 1.54362019480942e-06,
"loss": 0.9371,
"step": 2344
},
{
"epoch": 0.96,
"grad_norm": 3.1532819400392813,
"learning_rate": 1.5432604126747465e-06,
"loss": 0.9504,
"step": 2345
},
{
"epoch": 0.96,
"grad_norm": 4.865915691500783,
"learning_rate": 1.5429005307471261e-06,
"loss": 0.8672,
"step": 2346
},
{
"epoch": 0.96,
"grad_norm": 3.099694518631143,
"learning_rate": 1.5425405490926661e-06,
"loss": 0.9157,
"step": 2347
},
{
"epoch": 0.96,
"grad_norm": 3.672915490675007,
"learning_rate": 1.5421804677774928e-06,
"loss": 0.9342,
"step": 2348
},
{
"epoch": 0.96,
"grad_norm": 2.5673746024043003,
"learning_rate": 1.54182028686775e-06,
"loss": 0.9042,
"step": 2349
},
{
"epoch": 0.96,
"grad_norm": 2.479547530951453,
"learning_rate": 1.5414600064296007e-06,
"loss": 0.8873,
"step": 2350
},
{
"epoch": 0.96,
"grad_norm": 3.0223354773427724,
"learning_rate": 1.5410996265292256e-06,
"loss": 0.8812,
"step": 2351
},
{
"epoch": 0.96,
"grad_norm": 3.212291511355554,
"learning_rate": 1.5407391472328239e-06,
"loss": 0.8822,
"step": 2352
},
{
"epoch": 0.96,
"grad_norm": 3.453795310729497,
"learning_rate": 1.5403785686066133e-06,
"loss": 0.9763,
"step": 2353
},
{
"epoch": 0.96,
"grad_norm": 2.564820240908072,
"learning_rate": 1.540017890716829e-06,
"loss": 0.9251,
"step": 2354
},
{
"epoch": 0.96,
"grad_norm": 3.1499966422017733,
"learning_rate": 1.5396571136297248e-06,
"loss": 0.8231,
"step": 2355
},
{
"epoch": 0.96,
"grad_norm": 3.3439308278071804,
"learning_rate": 1.5392962374115731e-06,
"loss": 0.9005,
"step": 2356
},
{
"epoch": 0.96,
"grad_norm": 2.941316972999271,
"learning_rate": 1.5389352621286645e-06,
"loss": 0.9323,
"step": 2357
},
{
"epoch": 0.96,
"grad_norm": 2.3590718448006927,
"learning_rate": 1.5385741878473066e-06,
"loss": 0.9103,
"step": 2358
},
{
"epoch": 0.96,
"grad_norm": 3.0625612379466083,
"learning_rate": 1.5382130146338268e-06,
"loss": 0.8277,
"step": 2359
},
{
"epoch": 0.96,
"grad_norm": 3.1216531481844645,
"learning_rate": 1.53785174255457e-06,
"loss": 0.9067,
"step": 2360
},
{
"epoch": 0.96,
"grad_norm": 2.6423203596667855,
"learning_rate": 1.5374903716758988e-06,
"loss": 0.8347,
"step": 2361
},
{
"epoch": 0.96,
"grad_norm": 2.8797987278801207,
"learning_rate": 1.5371289020641946e-06,
"loss": 0.8664,
"step": 2362
},
{
"epoch": 0.96,
"grad_norm": 2.5406694893336734,
"learning_rate": 1.5367673337858569e-06,
"loss": 0.8411,
"step": 2363
},
{
"epoch": 0.96,
"grad_norm": 2.9276490504867274,
"learning_rate": 1.536405666907303e-06,
"loss": 0.8793,
"step": 2364
},
{
"epoch": 0.97,
"grad_norm": 2.950872275980001,
"learning_rate": 1.5360439014949683e-06,
"loss": 0.8826,
"step": 2365
},
{
"epoch": 0.97,
"grad_norm": 3.1706643327335184,
"learning_rate": 1.5356820376153063e-06,
"loss": 0.8352,
"step": 2366
},
{
"epoch": 0.97,
"grad_norm": 2.480094224261953,
"learning_rate": 1.5353200753347894e-06,
"loss": 0.8255,
"step": 2367
},
{
"epoch": 0.97,
"grad_norm": 5.4333849686671325,
"learning_rate": 1.5349580147199065e-06,
"loss": 0.8976,
"step": 2368
},
{
"epoch": 0.97,
"grad_norm": 3.347973211896213,
"learning_rate": 1.5345958558371665e-06,
"loss": 0.8414,
"step": 2369
},
{
"epoch": 0.97,
"grad_norm": 3.6216090161046712,
"learning_rate": 1.5342335987530946e-06,
"loss": 0.866,
"step": 2370
},
{
"epoch": 0.97,
"grad_norm": 2.5573808593688567,
"learning_rate": 1.533871243534235e-06,
"loss": 0.9028,
"step": 2371
},
{
"epoch": 0.97,
"grad_norm": 2.634713644908199,
"learning_rate": 1.5335087902471493e-06,
"loss": 0.9336,
"step": 2372
},
{
"epoch": 0.97,
"grad_norm": 3.265318148284081,
"learning_rate": 1.5331462389584184e-06,
"loss": 0.8672,
"step": 2373
},
{
"epoch": 0.97,
"grad_norm": 2.9490160532698053,
"learning_rate": 1.5327835897346396e-06,
"loss": 0.8173,
"step": 2374
},
{
"epoch": 0.97,
"grad_norm": 2.784745926176486,
"learning_rate": 1.532420842642429e-06,
"loss": 0.8435,
"step": 2375
},
{
"epoch": 0.97,
"grad_norm": 2.4588356358847316,
"learning_rate": 1.5320579977484202e-06,
"loss": 0.8953,
"step": 2376
},
{
"epoch": 0.97,
"grad_norm": 2.515770714063543,
"learning_rate": 1.5316950551192663e-06,
"loss": 0.8693,
"step": 2377
},
{
"epoch": 0.97,
"grad_norm": 2.563118729714713,
"learning_rate": 1.5313320148216363e-06,
"loss": 0.9236,
"step": 2378
},
{
"epoch": 0.97,
"grad_norm": 2.5841071557688475,
"learning_rate": 1.530968876922218e-06,
"loss": 0.8905,
"step": 2379
},
{
"epoch": 0.97,
"grad_norm": 3.2736071096191575,
"learning_rate": 1.5306056414877176e-06,
"loss": 0.9105,
"step": 2380
},
{
"epoch": 0.97,
"grad_norm": 3.398081077774067,
"learning_rate": 1.5302423085848584e-06,
"loss": 0.8398,
"step": 2381
},
{
"epoch": 0.97,
"grad_norm": 2.653925752455571,
"learning_rate": 1.5298788782803823e-06,
"loss": 0.8708,
"step": 2382
},
{
"epoch": 0.97,
"grad_norm": 2.9903168958105577,
"learning_rate": 1.5295153506410484e-06,
"loss": 0.8817,
"step": 2383
},
{
"epoch": 0.97,
"grad_norm": 2.7847924330317566,
"learning_rate": 1.5291517257336345e-06,
"loss": 0.9728,
"step": 2384
},
{
"epoch": 0.97,
"grad_norm": 2.767253924246785,
"learning_rate": 1.5287880036249357e-06,
"loss": 0.9112,
"step": 2385
},
{
"epoch": 0.97,
"grad_norm": 2.623318694047327,
"learning_rate": 1.5284241843817649e-06,
"loss": 0.874,
"step": 2386
},
{
"epoch": 0.97,
"grad_norm": 2.931987954398909,
"learning_rate": 1.5280602680709529e-06,
"loss": 0.9209,
"step": 2387
},
{
"epoch": 0.97,
"grad_norm": 2.5509947158768833,
"learning_rate": 1.527696254759349e-06,
"loss": 0.8318,
"step": 2388
},
{
"epoch": 0.97,
"grad_norm": 2.676906009188457,
"learning_rate": 1.527332144513819e-06,
"loss": 0.9335,
"step": 2389
},
{
"epoch": 0.98,
"grad_norm": 2.5134866979870694,
"learning_rate": 1.526967937401248e-06,
"loss": 0.8961,
"step": 2390
},
{
"epoch": 0.98,
"grad_norm": 3.6101632067797786,
"learning_rate": 1.526603633488538e-06,
"loss": 0.8909,
"step": 2391
},
{
"epoch": 0.98,
"grad_norm": 2.627145596884011,
"learning_rate": 1.5262392328426086e-06,
"loss": 0.8001,
"step": 2392
},
{
"epoch": 0.98,
"grad_norm": 2.9895306256389187,
"learning_rate": 1.5258747355303979e-06,
"loss": 0.8854,
"step": 2393
},
{
"epoch": 0.98,
"grad_norm": 3.514650899845729,
"learning_rate": 1.5255101416188612e-06,
"loss": 0.9324,
"step": 2394
},
{
"epoch": 0.98,
"grad_norm": 2.553912023484815,
"learning_rate": 1.5251454511749716e-06,
"loss": 0.8967,
"step": 2395
},
{
"epoch": 0.98,
"grad_norm": 2.892851407784536,
"learning_rate": 1.5247806642657203e-06,
"loss": 0.9,
"step": 2396
},
{
"epoch": 0.98,
"grad_norm": 2.9698764799046042,
"learning_rate": 1.524415780958116e-06,
"loss": 0.8092,
"step": 2397
},
{
"epoch": 0.98,
"grad_norm": 3.0431882442128666,
"learning_rate": 1.5240508013191848e-06,
"loss": 0.8096,
"step": 2398
},
{
"epoch": 0.98,
"grad_norm": 2.623361331627465,
"learning_rate": 1.5236857254159712e-06,
"loss": 0.8585,
"step": 2399
},
{
"epoch": 0.98,
"grad_norm": 2.777930091692707,
"learning_rate": 1.5233205533155365e-06,
"loss": 0.8102,
"step": 2400
},
{
"epoch": 0.98,
"eval_loss": 0.8862309455871582,
"eval_runtime": 466.5897,
"eval_samples_per_second": 74.695,
"eval_steps_per_second": 4.67,
"step": 2400
},
{
"epoch": 0.98,
"grad_norm": 3.1892978405945374,
"learning_rate": 1.5229552850849606e-06,
"loss": 0.9461,
"step": 2401
},
{
"epoch": 0.98,
"grad_norm": 3.1867428869648817,
"learning_rate": 1.52258992079134e-06,
"loss": 0.9034,
"step": 2402
},
{
"epoch": 0.98,
"grad_norm": 3.6716269138381024,
"learning_rate": 1.52222446050179e-06,
"loss": 0.8274,
"step": 2403
},
{
"epoch": 0.98,
"grad_norm": 3.0099906034845003,
"learning_rate": 1.5218589042834426e-06,
"loss": 0.8584,
"step": 2404
},
{
"epoch": 0.98,
"grad_norm": 2.509513183658091,
"learning_rate": 1.5214932522034478e-06,
"loss": 0.8885,
"step": 2405
},
{
"epoch": 0.98,
"grad_norm": 2.303228178121596,
"learning_rate": 1.521127504328974e-06,
"loss": 0.8933,
"step": 2406
},
{
"epoch": 0.98,
"grad_norm": 2.9965951967454054,
"learning_rate": 1.520761660727205e-06,
"loss": 0.893,
"step": 2407
},
{
"epoch": 0.98,
"grad_norm": 2.952630836962747,
"learning_rate": 1.5203957214653446e-06,
"loss": 0.799,
"step": 2408
},
{
"epoch": 0.98,
"grad_norm": 2.495816414896098,
"learning_rate": 1.5200296866106129e-06,
"loss": 0.9112,
"step": 2409
},
{
"epoch": 0.98,
"grad_norm": 3.151220565206038,
"learning_rate": 1.5196635562302479e-06,
"loss": 0.8535,
"step": 2410
},
{
"epoch": 0.98,
"grad_norm": 2.8553031796059836,
"learning_rate": 1.5192973303915046e-06,
"loss": 0.9039,
"step": 2411
},
{
"epoch": 0.98,
"grad_norm": 3.1731009795639946,
"learning_rate": 1.5189310091616568e-06,
"loss": 0.9031,
"step": 2412
},
{
"epoch": 0.98,
"grad_norm": 2.671718832020298,
"learning_rate": 1.5185645926079942e-06,
"loss": 0.8672,
"step": 2413
},
{
"epoch": 0.99,
"grad_norm": 2.3786837638461447,
"learning_rate": 1.5181980807978256e-06,
"loss": 0.9337,
"step": 2414
},
{
"epoch": 0.99,
"grad_norm": 2.917618874586411,
"learning_rate": 1.5178314737984755e-06,
"loss": 0.8687,
"step": 2415
},
{
"epoch": 0.99,
"grad_norm": 3.207429404935404,
"learning_rate": 1.517464771677288e-06,
"loss": 0.9405,
"step": 2416
},
{
"epoch": 0.99,
"grad_norm": 2.614683760166066,
"learning_rate": 1.517097974501623e-06,
"loss": 0.8565,
"step": 2417
},
{
"epoch": 0.99,
"grad_norm": 2.3170059397510694,
"learning_rate": 1.5167310823388586e-06,
"loss": 0.9031,
"step": 2418
},
{
"epoch": 0.99,
"grad_norm": 2.6260334371104257,
"learning_rate": 1.5163640952563902e-06,
"loss": 0.8568,
"step": 2419
},
{
"epoch": 0.99,
"grad_norm": 2.882920416077479,
"learning_rate": 1.5159970133216305e-06,
"loss": 0.9378,
"step": 2420
},
{
"epoch": 0.99,
"grad_norm": 3.075366303526867,
"learning_rate": 1.5156298366020098e-06,
"loss": 0.8628,
"step": 2421
},
{
"epoch": 0.99,
"grad_norm": 2.6677916439919853,
"learning_rate": 1.5152625651649757e-06,
"loss": 0.8753,
"step": 2422
},
{
"epoch": 0.99,
"grad_norm": 3.2102847238793863,
"learning_rate": 1.5148951990779937e-06,
"loss": 0.9489,
"step": 2423
},
{
"epoch": 0.99,
"grad_norm": 2.7609158501761817,
"learning_rate": 1.5145277384085455e-06,
"loss": 0.8635,
"step": 2424
},
{
"epoch": 0.99,
"grad_norm": 2.367401850186432,
"learning_rate": 1.5141601832241318e-06,
"loss": 0.8989,
"step": 2425
},
{
"epoch": 0.99,
"grad_norm": 2.927538540130683,
"learning_rate": 1.513792533592269e-06,
"loss": 0.9453,
"step": 2426
},
{
"epoch": 0.99,
"grad_norm": 2.661832591696076,
"learning_rate": 1.5134247895804927e-06,
"loss": 0.8336,
"step": 2427
},
{
"epoch": 0.99,
"grad_norm": 2.4722418368369006,
"learning_rate": 1.513056951256353e-06,
"loss": 0.952,
"step": 2428
},
{
"epoch": 0.99,
"grad_norm": 3.1632152856466758,
"learning_rate": 1.512689018687421e-06,
"loss": 0.916,
"step": 2429
},
{
"epoch": 0.99,
"grad_norm": 3.3560986803794877,
"learning_rate": 1.5123209919412818e-06,
"loss": 0.8579,
"step": 2430
},
{
"epoch": 0.99,
"grad_norm": 2.9197726793901366,
"learning_rate": 1.51195287108554e-06,
"loss": 0.9336,
"step": 2431
},
{
"epoch": 0.99,
"grad_norm": 3.0966380981004877,
"learning_rate": 1.5115846561878165e-06,
"loss": 0.885,
"step": 2432
},
{
"epoch": 0.99,
"grad_norm": 2.3671609857276903,
"learning_rate": 1.5112163473157495e-06,
"loss": 0.8912,
"step": 2433
},
{
"epoch": 0.99,
"grad_norm": 2.9964135268930185,
"learning_rate": 1.5108479445369948e-06,
"loss": 0.8294,
"step": 2434
},
{
"epoch": 0.99,
"grad_norm": 2.8004280762884197,
"learning_rate": 1.5104794479192254e-06,
"loss": 0.8594,
"step": 2435
},
{
"epoch": 0.99,
"grad_norm": 3.3504420964963764,
"learning_rate": 1.5101108575301313e-06,
"loss": 0.8689,
"step": 2436
},
{
"epoch": 0.99,
"grad_norm": 2.7690020232538957,
"learning_rate": 1.5097421734374196e-06,
"loss": 0.8425,
"step": 2437
},
{
"epoch": 0.99,
"grad_norm": 2.741595280940412,
"learning_rate": 1.5093733957088152e-06,
"loss": 0.9322,
"step": 2438
},
{
"epoch": 1.0,
"grad_norm": 3.32161516492172,
"learning_rate": 1.50900452441206e-06,
"loss": 0.9049,
"step": 2439
},
{
"epoch": 1.0,
"grad_norm": 3.0719352953851247,
"learning_rate": 1.5086355596149124e-06,
"loss": 0.8302,
"step": 2440
},
{
"epoch": 1.0,
"grad_norm": 3.1387865869327074,
"learning_rate": 1.5082665013851488e-06,
"loss": 0.9506,
"step": 2441
},
{
"epoch": 1.0,
"grad_norm": 2.4709299932996442,
"learning_rate": 1.5078973497905628e-06,
"loss": 0.9201,
"step": 2442
},
{
"epoch": 1.0,
"grad_norm": 3.0453879399494292,
"learning_rate": 1.5075281048989644e-06,
"loss": 0.8536,
"step": 2443
},
{
"epoch": 1.0,
"grad_norm": 3.5483986674606425,
"learning_rate": 1.5071587667781811e-06,
"loss": 0.9961,
"step": 2444
},
{
"epoch": 1.0,
"grad_norm": 2.4863440937897656,
"learning_rate": 1.506789335496058e-06,
"loss": 0.9392,
"step": 2445
},
{
"epoch": 1.0,
"grad_norm": 4.0307714510132735,
"learning_rate": 1.5064198111204566e-06,
"loss": 0.8423,
"step": 2446
},
{
"epoch": 1.0,
"grad_norm": 2.470872786473922,
"learning_rate": 1.506050193719256e-06,
"loss": 0.8637,
"step": 2447
},
{
"epoch": 1.0,
"grad_norm": 3.272147752613857,
"learning_rate": 1.505680483360352e-06,
"loss": 0.8722,
"step": 2448
},
{
"epoch": 1.0,
"grad_norm": 2.857031001121624,
"learning_rate": 1.5053106801116582e-06,
"loss": 0.8767,
"step": 2449
},
{
"epoch": 1.0,
"grad_norm": 3.6221112309806642,
"learning_rate": 1.5049407840411039e-06,
"loss": 0.926,
"step": 2450
},
{
"epoch": 1.0,
"grad_norm": 2.608113310027252,
"learning_rate": 1.5045707952166372e-06,
"loss": 0.8223,
"step": 2451
},
{
"epoch": 1.0,
"grad_norm": 2.473420063320589,
"learning_rate": 1.5042007137062216e-06,
"loss": 0.8093,
"step": 2452
},
{
"epoch": 1.0,
"grad_norm": 2.6052327247239955,
"learning_rate": 1.5038305395778387e-06,
"loss": 0.8231,
"step": 2453
},
{
"epoch": 1.0,
"grad_norm": 2.560798206846995,
"learning_rate": 1.5034602728994865e-06,
"loss": 0.7946,
"step": 2454
},
{
"epoch": 1.0,
"grad_norm": 3.1012195063378503,
"learning_rate": 1.5030899137391809e-06,
"loss": 0.739,
"step": 2455
},
{
"epoch": 1.0,
"grad_norm": 2.9547616536360852,
"learning_rate": 1.5027194621649534e-06,
"loss": 0.8446,
"step": 2456
},
{
"epoch": 1.0,
"grad_norm": 2.8367189832340762,
"learning_rate": 1.5023489182448537e-06,
"loss": 0.8046,
"step": 2457
},
{
"epoch": 1.0,
"grad_norm": 2.7912790743424876,
"learning_rate": 1.5019782820469478e-06,
"loss": 0.8448,
"step": 2458
},
{
"epoch": 1.0,
"grad_norm": 2.7173635575574044,
"learning_rate": 1.501607553639319e-06,
"loss": 0.8131,
"step": 2459
},
{
"epoch": 1.0,
"grad_norm": 3.287994801358043,
"learning_rate": 1.501236733090067e-06,
"loss": 0.801,
"step": 2460
},
{
"epoch": 1.0,
"grad_norm": 2.3650184983666294,
"learning_rate": 1.5008658204673093e-06,
"loss": 0.8423,
"step": 2461
},
{
"epoch": 1.0,
"grad_norm": 2.892473650201963,
"learning_rate": 1.5004948158391797e-06,
"loss": 0.7875,
"step": 2462
},
{
"epoch": 1.01,
"grad_norm": 3.408817144640001,
"learning_rate": 1.5001237192738292e-06,
"loss": 0.8217,
"step": 2463
},
{
"epoch": 1.01,
"grad_norm": 2.6840569517106294,
"learning_rate": 1.4997525308394245e-06,
"loss": 0.8709,
"step": 2464
},
{
"epoch": 1.01,
"grad_norm": 2.4134553425310665,
"learning_rate": 1.4993812506041515e-06,
"loss": 0.8257,
"step": 2465
},
{
"epoch": 1.01,
"grad_norm": 2.4756685969788346,
"learning_rate": 1.4990098786362111e-06,
"loss": 0.7881,
"step": 2466
},
{
"epoch": 1.01,
"grad_norm": 2.740885414040201,
"learning_rate": 1.4986384150038214e-06,
"loss": 0.8641,
"step": 2467
},
{
"epoch": 1.01,
"grad_norm": 2.712591457798475,
"learning_rate": 1.498266859775218e-06,
"loss": 0.8578,
"step": 2468
},
{
"epoch": 1.01,
"grad_norm": 2.9593961727387788,
"learning_rate": 1.4978952130186525e-06,
"loss": 0.8379,
"step": 2469
},
{
"epoch": 1.01,
"grad_norm": 3.1236376828446235,
"learning_rate": 1.4975234748023936e-06,
"loss": 0.8567,
"step": 2470
},
{
"epoch": 1.01,
"grad_norm": 2.774549072076336,
"learning_rate": 1.4971516451947272e-06,
"loss": 0.7999,
"step": 2471
},
{
"epoch": 1.01,
"grad_norm": 2.85432356845237,
"learning_rate": 1.4967797242639555e-06,
"loss": 0.8442,
"step": 2472
},
{
"epoch": 1.01,
"grad_norm": 4.410589461686802,
"learning_rate": 1.4964077120783978e-06,
"loss": 0.7807,
"step": 2473
},
{
"epoch": 1.01,
"grad_norm": 2.624663490718534,
"learning_rate": 1.4960356087063896e-06,
"loss": 0.8221,
"step": 2474
},
{
"epoch": 1.01,
"grad_norm": 2.9947129764627443,
"learning_rate": 1.4956634142162843e-06,
"loss": 0.8784,
"step": 2475
},
{
"epoch": 1.01,
"grad_norm": 2.43273658150499,
"learning_rate": 1.4952911286764508e-06,
"loss": 0.8128,
"step": 2476
},
{
"epoch": 1.01,
"grad_norm": 2.447125195419745,
"learning_rate": 1.4949187521552746e-06,
"loss": 0.8673,
"step": 2477
},
{
"epoch": 1.01,
"grad_norm": 2.9204083377063688,
"learning_rate": 1.4945462847211598e-06,
"loss": 0.8061,
"step": 2478
},
{
"epoch": 1.01,
"grad_norm": 2.4281473618652316,
"learning_rate": 1.4941737264425252e-06,
"loss": 0.847,
"step": 2479
},
{
"epoch": 1.01,
"grad_norm": 2.810222886039672,
"learning_rate": 1.4938010773878068e-06,
"loss": 0.8697,
"step": 2480
},
{
"epoch": 1.01,
"grad_norm": 2.5807545286967715,
"learning_rate": 1.4934283376254584e-06,
"loss": 0.8514,
"step": 2481
},
{
"epoch": 1.01,
"grad_norm": 3.044240473167376,
"learning_rate": 1.4930555072239487e-06,
"loss": 0.8802,
"step": 2482
},
{
"epoch": 1.01,
"grad_norm": 2.6127242532841746,
"learning_rate": 1.4926825862517643e-06,
"loss": 0.7916,
"step": 2483
},
{
"epoch": 1.01,
"grad_norm": 2.6346659711279874,
"learning_rate": 1.4923095747774078e-06,
"loss": 0.7939,
"step": 2484
},
{
"epoch": 1.01,
"grad_norm": 3.3468355501404763,
"learning_rate": 1.4919364728693989e-06,
"loss": 0.7894,
"step": 2485
},
{
"epoch": 1.01,
"grad_norm": 3.5383882425725877,
"learning_rate": 1.4915632805962735e-06,
"loss": 0.7693,
"step": 2486
},
{
"epoch": 1.01,
"grad_norm": 2.8576747620351903,
"learning_rate": 1.4911899980265843e-06,
"loss": 0.8137,
"step": 2487
},
{
"epoch": 1.02,
"grad_norm": 3.0859628934843495,
"learning_rate": 1.4908166252289007e-06,
"loss": 0.8562,
"step": 2488
},
{
"epoch": 1.02,
"grad_norm": 2.885821960123096,
"learning_rate": 1.4904431622718087e-06,
"loss": 0.8651,
"step": 2489
},
{
"epoch": 1.02,
"grad_norm": 2.460276331271547,
"learning_rate": 1.49006960922391e-06,
"loss": 0.8327,
"step": 2490
},
{
"epoch": 1.02,
"grad_norm": 3.107806426788133,
"learning_rate": 1.4896959661538247e-06,
"loss": 0.7834,
"step": 2491
},
{
"epoch": 1.02,
"grad_norm": 2.7895762474012042,
"learning_rate": 1.4893222331301872e-06,
"loss": 0.7888,
"step": 2492
},
{
"epoch": 1.02,
"grad_norm": 3.301138368120173,
"learning_rate": 1.4889484102216496e-06,
"loss": 0.8495,
"step": 2493
},
{
"epoch": 1.02,
"grad_norm": 4.299276902154659,
"learning_rate": 1.4885744974968813e-06,
"loss": 0.818,
"step": 2494
},
{
"epoch": 1.02,
"grad_norm": 3.4604745363759326,
"learning_rate": 1.4882004950245665e-06,
"loss": 0.7844,
"step": 2495
},
{
"epoch": 1.02,
"grad_norm": 3.158193752537624,
"learning_rate": 1.4878264028734068e-06,
"loss": 0.8592,
"step": 2496
},
{
"epoch": 1.02,
"grad_norm": 3.194869779642749,
"learning_rate": 1.4874522211121209e-06,
"loss": 0.8086,
"step": 2497
},
{
"epoch": 1.02,
"grad_norm": 2.9704005039951817,
"learning_rate": 1.4870779498094421e-06,
"loss": 0.795,
"step": 2498
},
{
"epoch": 1.02,
"grad_norm": 3.7122445623726925,
"learning_rate": 1.4867035890341222e-06,
"loss": 0.8493,
"step": 2499
},
{
"epoch": 1.02,
"grad_norm": 2.7601035241913174,
"learning_rate": 1.486329138854928e-06,
"loss": 0.8341,
"step": 2500
},
{
"epoch": 1.02,
"eval_loss": 0.8889659643173218,
"eval_runtime": 466.5173,
"eval_samples_per_second": 74.707,
"eval_steps_per_second": 4.671,
"step": 2500
},
{
"epoch": 1.02,
"grad_norm": 2.76674988079019,
"learning_rate": 1.4859545993406436e-06,
"loss": 0.8052,
"step": 2501
},
{
"epoch": 1.02,
"grad_norm": 2.5991310473486156,
"learning_rate": 1.4855799705600692e-06,
"loss": 0.9282,
"step": 2502
},
{
"epoch": 1.02,
"grad_norm": 2.58496539115846,
"learning_rate": 1.4852052525820207e-06,
"loss": 0.8145,
"step": 2503
},
{
"epoch": 1.02,
"grad_norm": 2.302796931469877,
"learning_rate": 1.484830445475332e-06,
"loss": 0.8579,
"step": 2504
},
{
"epoch": 1.02,
"grad_norm": 3.3913731387582513,
"learning_rate": 1.4844555493088517e-06,
"loss": 0.858,
"step": 2505
},
{
"epoch": 1.02,
"grad_norm": 3.1185999518103964,
"learning_rate": 1.4840805641514456e-06,
"loss": 0.8206,
"step": 2506
},
{
"epoch": 1.02,
"grad_norm": 2.57292121113726,
"learning_rate": 1.4837054900719958e-06,
"loss": 0.7922,
"step": 2507
},
{
"epoch": 1.02,
"grad_norm": 2.8879823684956745,
"learning_rate": 1.4833303271394007e-06,
"loss": 0.8443,
"step": 2508
},
{
"epoch": 1.02,
"grad_norm": 3.182394287709047,
"learning_rate": 1.4829550754225752e-06,
"loss": 0.7815,
"step": 2509
},
{
"epoch": 1.02,
"grad_norm": 4.028172342460221,
"learning_rate": 1.4825797349904496e-06,
"loss": 0.7905,
"step": 2510
},
{
"epoch": 1.02,
"grad_norm": 2.698381201936925,
"learning_rate": 1.482204305911972e-06,
"loss": 0.8083,
"step": 2511
},
{
"epoch": 1.03,
"grad_norm": 2.452951639451854,
"learning_rate": 1.4818287882561053e-06,
"loss": 0.8785,
"step": 2512
},
{
"epoch": 1.03,
"grad_norm": 2.7380171206283292,
"learning_rate": 1.4814531820918297e-06,
"loss": 0.8313,
"step": 2513
},
{
"epoch": 1.03,
"grad_norm": 3.085622729898539,
"learning_rate": 1.4810774874881411e-06,
"loss": 0.8136,
"step": 2514
},
{
"epoch": 1.03,
"grad_norm": 3.047169965638033,
"learning_rate": 1.4807017045140521e-06,
"loss": 0.8364,
"step": 2515
},
{
"epoch": 1.03,
"grad_norm": 2.645537425811084,
"learning_rate": 1.4803258332385908e-06,
"loss": 0.8425,
"step": 2516
},
{
"epoch": 1.03,
"grad_norm": 4.300622001152736,
"learning_rate": 1.4799498737308026e-06,
"loss": 0.7615,
"step": 2517
},
{
"epoch": 1.03,
"grad_norm": 2.7176099377892857,
"learning_rate": 1.4795738260597481e-06,
"loss": 0.7946,
"step": 2518
},
{
"epoch": 1.03,
"grad_norm": 2.9220423113997263,
"learning_rate": 1.4791976902945047e-06,
"loss": 0.7409,
"step": 2519
},
{
"epoch": 1.03,
"grad_norm": 2.833837600288801,
"learning_rate": 1.4788214665041653e-06,
"loss": 0.8485,
"step": 2520
},
{
"epoch": 1.03,
"grad_norm": 3.3116530896832157,
"learning_rate": 1.4784451547578402e-06,
"loss": 0.8522,
"step": 2521
},
{
"epoch": 1.03,
"grad_norm": 3.1172828798843937,
"learning_rate": 1.4780687551246546e-06,
"loss": 0.8107,
"step": 2522
},
{
"epoch": 1.03,
"grad_norm": 2.9263241539183404,
"learning_rate": 1.4776922676737504e-06,
"loss": 0.8303,
"step": 2523
},
{
"epoch": 1.03,
"grad_norm": 2.5382361961766695,
"learning_rate": 1.4773156924742857e-06,
"loss": 0.8826,
"step": 2524
},
{
"epoch": 1.03,
"grad_norm": 3.004234583046955,
"learning_rate": 1.4769390295954345e-06,
"loss": 0.825,
"step": 2525
},
{
"epoch": 1.03,
"grad_norm": 3.13932250360166,
"learning_rate": 1.4765622791063872e-06,
"loss": 0.8318,
"step": 2526
},
{
"epoch": 1.03,
"grad_norm": 2.997588668180501,
"learning_rate": 1.4761854410763496e-06,
"loss": 0.8323,
"step": 2527
},
{
"epoch": 1.03,
"grad_norm": 2.33295799973737,
"learning_rate": 1.475808515574545e-06,
"loss": 0.7699,
"step": 2528
},
{
"epoch": 1.03,
"grad_norm": 3.63758620557116,
"learning_rate": 1.4754315026702106e-06,
"loss": 0.8198,
"step": 2529
},
{
"epoch": 1.03,
"grad_norm": 2.7832360665944216,
"learning_rate": 1.475054402432602e-06,
"loss": 0.8034,
"step": 2530
},
{
"epoch": 1.03,
"grad_norm": 3.2755828995707867,
"learning_rate": 1.4746772149309891e-06,
"loss": 0.8499,
"step": 2531
},
{
"epoch": 1.03,
"grad_norm": 3.173767325182273,
"learning_rate": 1.4742999402346588e-06,
"loss": 0.8501,
"step": 2532
},
{
"epoch": 1.03,
"grad_norm": 4.752680666258145,
"learning_rate": 1.4739225784129137e-06,
"loss": 0.8589,
"step": 2533
},
{
"epoch": 1.03,
"grad_norm": 2.875457307567514,
"learning_rate": 1.473545129535072e-06,
"loss": 0.7803,
"step": 2534
},
{
"epoch": 1.03,
"grad_norm": 4.834194393333386,
"learning_rate": 1.473167593670469e-06,
"loss": 0.8879,
"step": 2535
},
{
"epoch": 1.03,
"grad_norm": 3.748427344663051,
"learning_rate": 1.472789970888454e-06,
"loss": 0.8991,
"step": 2536
},
{
"epoch": 1.04,
"grad_norm": 2.5113699556130853,
"learning_rate": 1.4724122612583952e-06,
"loss": 0.8555,
"step": 2537
},
{
"epoch": 1.04,
"grad_norm": 2.903171417590896,
"learning_rate": 1.4720344648496737e-06,
"loss": 0.8452,
"step": 2538
},
{
"epoch": 1.04,
"grad_norm": 3.23025400004249,
"learning_rate": 1.4716565817316886e-06,
"loss": 0.8203,
"step": 2539
},
{
"epoch": 1.04,
"grad_norm": 2.471227208594443,
"learning_rate": 1.4712786119738543e-06,
"loss": 0.8137,
"step": 2540
},
{
"epoch": 1.04,
"grad_norm": 2.898434792808942,
"learning_rate": 1.4709005556456009e-06,
"loss": 0.8686,
"step": 2541
},
{
"epoch": 1.04,
"grad_norm": 3.7358650742465698,
"learning_rate": 1.4705224128163743e-06,
"loss": 0.787,
"step": 2542
},
{
"epoch": 1.04,
"grad_norm": 2.7507299080711647,
"learning_rate": 1.4701441835556369e-06,
"loss": 0.7941,
"step": 2543
},
{
"epoch": 1.04,
"grad_norm": 2.5813755433425842,
"learning_rate": 1.4697658679328667e-06,
"loss": 0.8507,
"step": 2544
},
{
"epoch": 1.04,
"grad_norm": 2.572001973722218,
"learning_rate": 1.4693874660175572e-06,
"loss": 0.8645,
"step": 2545
},
{
"epoch": 1.04,
"grad_norm": 3.4917653533739066,
"learning_rate": 1.469008977879218e-06,
"loss": 0.7949,
"step": 2546
},
{
"epoch": 1.04,
"grad_norm": 3.5489283155236775,
"learning_rate": 1.468630403587375e-06,
"loss": 0.7591,
"step": 2547
},
{
"epoch": 1.04,
"grad_norm": 3.38341557428432,
"learning_rate": 1.4682517432115695e-06,
"loss": 0.8183,
"step": 2548
},
{
"epoch": 1.04,
"grad_norm": 2.7013920766469517,
"learning_rate": 1.4678729968213582e-06,
"loss": 0.8508,
"step": 2549
},
{
"epoch": 1.04,
"grad_norm": 2.9347179691183944,
"learning_rate": 1.4674941644863144e-06,
"loss": 0.8615,
"step": 2550
},
{
"epoch": 1.04,
"grad_norm": 3.098128852150094,
"learning_rate": 1.4671152462760268e-06,
"loss": 0.8152,
"step": 2551
},
{
"epoch": 1.04,
"grad_norm": 2.949431496478963,
"learning_rate": 1.4667362422600993e-06,
"loss": 0.834,
"step": 2552
},
{
"epoch": 1.04,
"grad_norm": 2.6116273589380574,
"learning_rate": 1.4663571525081531e-06,
"loss": 0.8652,
"step": 2553
},
{
"epoch": 1.04,
"grad_norm": 2.4075813482650616,
"learning_rate": 1.4659779770898237e-06,
"loss": 0.8823,
"step": 2554
},
{
"epoch": 1.04,
"grad_norm": 3.4685627508059778,
"learning_rate": 1.4655987160747625e-06,
"loss": 0.8077,
"step": 2555
},
{
"epoch": 1.04,
"grad_norm": 3.157750376858168,
"learning_rate": 1.4652193695326376e-06,
"loss": 0.7627,
"step": 2556
},
{
"epoch": 1.04,
"grad_norm": 3.206861336807424,
"learning_rate": 1.4648399375331317e-06,
"loss": 0.8796,
"step": 2557
},
{
"epoch": 1.04,
"grad_norm": 4.057039132419901,
"learning_rate": 1.4644604201459443e-06,
"loss": 0.8009,
"step": 2558
},
{
"epoch": 1.04,
"grad_norm": 2.3810135909476324,
"learning_rate": 1.4640808174407892e-06,
"loss": 0.8194,
"step": 2559
},
{
"epoch": 1.04,
"grad_norm": 3.467248569878302,
"learning_rate": 1.4637011294873972e-06,
"loss": 0.9363,
"step": 2560
},
{
"epoch": 1.05,
"grad_norm": 5.5587397326510555,
"learning_rate": 1.4633213563555137e-06,
"loss": 0.747,
"step": 2561
},
{
"epoch": 1.05,
"grad_norm": 3.0916261216529426,
"learning_rate": 1.4629414981149006e-06,
"loss": 0.8353,
"step": 2562
},
{
"epoch": 1.05,
"grad_norm": 3.521636134607523,
"learning_rate": 1.462561554835335e-06,
"loss": 0.8141,
"step": 2563
},
{
"epoch": 1.05,
"grad_norm": 3.0245256527254907,
"learning_rate": 1.46218152658661e-06,
"loss": 0.8411,
"step": 2564
},
{
"epoch": 1.05,
"grad_norm": 3.615310564414657,
"learning_rate": 1.461801413438533e-06,
"loss": 0.8473,
"step": 2565
},
{
"epoch": 1.05,
"grad_norm": 2.7420517082588827,
"learning_rate": 1.4614212154609292e-06,
"loss": 0.9219,
"step": 2566
},
{
"epoch": 1.05,
"grad_norm": 2.5979970253720546,
"learning_rate": 1.4610409327236373e-06,
"loss": 0.8416,
"step": 2567
},
{
"epoch": 1.05,
"grad_norm": 2.4120657583086578,
"learning_rate": 1.4606605652965124e-06,
"loss": 0.8588,
"step": 2568
},
{
"epoch": 1.05,
"grad_norm": 2.507035341414559,
"learning_rate": 1.4602801132494262e-06,
"loss": 0.857,
"step": 2569
},
{
"epoch": 1.05,
"grad_norm": 2.694393892995022,
"learning_rate": 1.4598995766522638e-06,
"loss": 0.7711,
"step": 2570
},
{
"epoch": 1.05,
"grad_norm": 3.1681288008900896,
"learning_rate": 1.4595189555749279e-06,
"loss": 0.7977,
"step": 2571
},
{
"epoch": 1.05,
"grad_norm": 3.090912500066552,
"learning_rate": 1.4591382500873352e-06,
"loss": 0.8625,
"step": 2572
},
{
"epoch": 1.05,
"grad_norm": 3.939112140380514,
"learning_rate": 1.4587574602594188e-06,
"loss": 0.7739,
"step": 2573
},
{
"epoch": 1.05,
"grad_norm": 3.6858333974950086,
"learning_rate": 1.4583765861611265e-06,
"loss": 0.8317,
"step": 2574
},
{
"epoch": 1.05,
"grad_norm": 3.245576099340975,
"learning_rate": 1.4579956278624229e-06,
"loss": 0.8772,
"step": 2575
},
{
"epoch": 1.05,
"grad_norm": 3.252371797866354,
"learning_rate": 1.4576145854332867e-06,
"loss": 0.8341,
"step": 2576
},
{
"epoch": 1.05,
"grad_norm": 3.8176337858964606,
"learning_rate": 1.4572334589437127e-06,
"loss": 0.8238,
"step": 2577
},
{
"epoch": 1.05,
"grad_norm": 2.3935285824232135,
"learning_rate": 1.4568522484637107e-06,
"loss": 0.7727,
"step": 2578
},
{
"epoch": 1.05,
"grad_norm": 3.7068888707817744,
"learning_rate": 1.4564709540633069e-06,
"loss": 0.7832,
"step": 2579
},
{
"epoch": 1.05,
"grad_norm": 3.4966404228018817,
"learning_rate": 1.456089575812542e-06,
"loss": 0.8255,
"step": 2580
},
{
"epoch": 1.05,
"grad_norm": 2.942546712201021,
"learning_rate": 1.4557081137814722e-06,
"loss": 0.8127,
"step": 2581
},
{
"epoch": 1.05,
"grad_norm": 3.068863373132351,
"learning_rate": 1.4553265680401694e-06,
"loss": 0.8286,
"step": 2582
},
{
"epoch": 1.05,
"grad_norm": 2.9272198140481316,
"learning_rate": 1.4549449386587206e-06,
"loss": 0.8942,
"step": 2583
},
{
"epoch": 1.05,
"grad_norm": 2.6013470324157484,
"learning_rate": 1.4545632257072286e-06,
"loss": 0.9684,
"step": 2584
},
{
"epoch": 1.05,
"grad_norm": 4.68498268815913,
"learning_rate": 1.4541814292558114e-06,
"loss": 0.8181,
"step": 2585
},
{
"epoch": 1.06,
"grad_norm": 3.1342182835957644,
"learning_rate": 1.4537995493746017e-06,
"loss": 0.7696,
"step": 2586
},
{
"epoch": 1.06,
"grad_norm": 3.279350910813539,
"learning_rate": 1.4534175861337486e-06,
"loss": 0.8349,
"step": 2587
},
{
"epoch": 1.06,
"grad_norm": 2.706492007762849,
"learning_rate": 1.4530355396034152e-06,
"loss": 0.8311,
"step": 2588
},
{
"epoch": 1.06,
"grad_norm": 3.7360171821454515,
"learning_rate": 1.4526534098537814e-06,
"loss": 0.7869,
"step": 2589
},
{
"epoch": 1.06,
"grad_norm": 2.907548198861126,
"learning_rate": 1.4522711969550415e-06,
"loss": 0.8201,
"step": 2590
},
{
"epoch": 1.06,
"grad_norm": 3.9591308253753645,
"learning_rate": 1.4518889009774044e-06,
"loss": 0.7906,
"step": 2591
},
{
"epoch": 1.06,
"grad_norm": 3.282109913278299,
"learning_rate": 1.4515065219910963e-06,
"loss": 0.8276,
"step": 2592
},
{
"epoch": 1.06,
"grad_norm": 3.6055889796633576,
"learning_rate": 1.4511240600663563e-06,
"loss": 0.8047,
"step": 2593
},
{
"epoch": 1.06,
"grad_norm": 3.703838767050441,
"learning_rate": 1.4507415152734403e-06,
"loss": 0.8159,
"step": 2594
},
{
"epoch": 1.06,
"grad_norm": 2.9264356408650682,
"learning_rate": 1.4503588876826192e-06,
"loss": 0.8805,
"step": 2595
},
{
"epoch": 1.06,
"grad_norm": 3.5573461998058242,
"learning_rate": 1.4499761773641784e-06,
"loss": 0.7666,
"step": 2596
},
{
"epoch": 1.06,
"grad_norm": 2.757556221896587,
"learning_rate": 1.4495933843884197e-06,
"loss": 0.9031,
"step": 2597
},
{
"epoch": 1.06,
"grad_norm": 3.2607722556182215,
"learning_rate": 1.4492105088256585e-06,
"loss": 0.8183,
"step": 2598
},
{
"epoch": 1.06,
"grad_norm": 2.5683438595755366,
"learning_rate": 1.448827550746227e-06,
"loss": 0.8678,
"step": 2599
},
{
"epoch": 1.06,
"grad_norm": 4.253434147792728,
"learning_rate": 1.4484445102204709e-06,
"loss": 0.7832,
"step": 2600
},
{
"epoch": 1.06,
"eval_loss": 0.8873896598815918,
"eval_runtime": 466.3298,
"eval_samples_per_second": 74.737,
"eval_steps_per_second": 4.673,
"step": 2600
},
{
"epoch": 1.06,
"grad_norm": 3.1672138326374255,
"learning_rate": 1.4480613873187526e-06,
"loss": 0.7916,
"step": 2601
},
{
"epoch": 1.06,
"grad_norm": 2.4353847159429005,
"learning_rate": 1.447678182111449e-06,
"loss": 0.836,
"step": 2602
},
{
"epoch": 1.06,
"grad_norm": 3.857756152486758,
"learning_rate": 1.4472948946689521e-06,
"loss": 0.8091,
"step": 2603
},
{
"epoch": 1.06,
"grad_norm": 2.623518201074271,
"learning_rate": 1.446911525061668e-06,
"loss": 0.8826,
"step": 2604
},
{
"epoch": 1.06,
"grad_norm": 3.2507410778116768,
"learning_rate": 1.4465280733600205e-06,
"loss": 0.7881,
"step": 2605
},
{
"epoch": 1.06,
"grad_norm": 2.3650595522768953,
"learning_rate": 1.4461445396344456e-06,
"loss": 0.8589,
"step": 2606
},
{
"epoch": 1.06,
"grad_norm": 3.0975289742793595,
"learning_rate": 1.445760923955396e-06,
"loss": 0.8265,
"step": 2607
},
{
"epoch": 1.06,
"grad_norm": 19.24607863490518,
"learning_rate": 1.4453772263933392e-06,
"loss": 0.8095,
"step": 2608
},
{
"epoch": 1.06,
"grad_norm": 2.9191891529260747,
"learning_rate": 1.4449934470187577e-06,
"loss": 0.8609,
"step": 2609
},
{
"epoch": 1.07,
"grad_norm": 3.4827323477340144,
"learning_rate": 1.4446095859021486e-06,
"loss": 0.8331,
"step": 2610
},
{
"epoch": 1.07,
"grad_norm": 3.175523723328953,
"learning_rate": 1.4442256431140244e-06,
"loss": 0.8454,
"step": 2611
},
{
"epoch": 1.07,
"grad_norm": 3.154413706279885,
"learning_rate": 1.443841618724913e-06,
"loss": 0.8792,
"step": 2612
},
{
"epoch": 1.07,
"grad_norm": 3.5195494092018595,
"learning_rate": 1.4434575128053569e-06,
"loss": 0.8217,
"step": 2613
},
{
"epoch": 1.07,
"grad_norm": 2.433077821432321,
"learning_rate": 1.4430733254259126e-06,
"loss": 0.8386,
"step": 2614
},
{
"epoch": 1.07,
"grad_norm": 3.0939124423282656,
"learning_rate": 1.4426890566571534e-06,
"loss": 0.7542,
"step": 2615
},
{
"epoch": 1.07,
"grad_norm": 2.3811351539413197,
"learning_rate": 1.4423047065696668e-06,
"loss": 0.8257,
"step": 2616
},
{
"epoch": 1.07,
"grad_norm": 4.768571847372881,
"learning_rate": 1.4419202752340537e-06,
"loss": 0.764,
"step": 2617
},
{
"epoch": 1.07,
"grad_norm": 2.8901788667706145,
"learning_rate": 1.4415357627209333e-06,
"loss": 0.8573,
"step": 2618
},
{
"epoch": 1.07,
"grad_norm": 2.8995140492059535,
"learning_rate": 1.441151169100936e-06,
"loss": 0.8493,
"step": 2619
},
{
"epoch": 1.07,
"grad_norm": 3.8222934962505617,
"learning_rate": 1.44076649444471e-06,
"loss": 0.8226,
"step": 2620
},
{
"epoch": 1.07,
"grad_norm": 3.3309143204375995,
"learning_rate": 1.4403817388229165e-06,
"loss": 0.8133,
"step": 2621
},
{
"epoch": 1.07,
"grad_norm": 2.950049202966287,
"learning_rate": 1.4399969023062327e-06,
"loss": 0.8227,
"step": 2622
},
{
"epoch": 1.07,
"grad_norm": 3.408142913340388,
"learning_rate": 1.43961198496535e-06,
"loss": 0.8649,
"step": 2623
},
{
"epoch": 1.07,
"grad_norm": 3.197581041769516,
"learning_rate": 1.4392269868709747e-06,
"loss": 0.8421,
"step": 2624
},
{
"epoch": 1.07,
"grad_norm": 2.580737491774892,
"learning_rate": 1.4388419080938285e-06,
"loss": 0.9105,
"step": 2625
},
{
"epoch": 1.07,
"grad_norm": 2.833938445225599,
"learning_rate": 1.4384567487046474e-06,
"loss": 0.8002,
"step": 2626
},
{
"epoch": 1.07,
"grad_norm": 3.015732281861534,
"learning_rate": 1.4380715087741824e-06,
"loss": 0.8437,
"step": 2627
},
{
"epoch": 1.07,
"grad_norm": 3.1617087701109936,
"learning_rate": 1.4376861883731992e-06,
"loss": 0.8279,
"step": 2628
},
{
"epoch": 1.07,
"grad_norm": 2.6957478099387626,
"learning_rate": 1.4373007875724784e-06,
"loss": 0.8844,
"step": 2629
},
{
"epoch": 1.07,
"grad_norm": 3.9671123345681965,
"learning_rate": 1.4369153064428153e-06,
"loss": 0.8305,
"step": 2630
},
{
"epoch": 1.07,
"grad_norm": 3.2928957438630824,
"learning_rate": 1.4365297450550198e-06,
"loss": 0.7881,
"step": 2631
},
{
"epoch": 1.07,
"grad_norm": 3.1654690547316093,
"learning_rate": 1.4361441034799167e-06,
"loss": 0.859,
"step": 2632
},
{
"epoch": 1.07,
"grad_norm": 4.068569981652718,
"learning_rate": 1.4357583817883456e-06,
"loss": 0.8761,
"step": 2633
},
{
"epoch": 1.07,
"grad_norm": 2.6797252750540874,
"learning_rate": 1.435372580051161e-06,
"loss": 0.9404,
"step": 2634
},
{
"epoch": 1.08,
"grad_norm": 4.071667795754453,
"learning_rate": 1.4349866983392315e-06,
"loss": 0.8064,
"step": 2635
},
{
"epoch": 1.08,
"grad_norm": 3.0594618842399597,
"learning_rate": 1.4346007367234406e-06,
"loss": 0.8758,
"step": 2636
},
{
"epoch": 1.08,
"grad_norm": 3.079051088127495,
"learning_rate": 1.4342146952746869e-06,
"loss": 0.8486,
"step": 2637
},
{
"epoch": 1.08,
"grad_norm": 3.022601111049257,
"learning_rate": 1.4338285740638833e-06,
"loss": 0.7909,
"step": 2638
},
{
"epoch": 1.08,
"grad_norm": 2.640098830574861,
"learning_rate": 1.4334423731619578e-06,
"loss": 0.7848,
"step": 2639
},
{
"epoch": 1.08,
"grad_norm": 2.4954993407853854,
"learning_rate": 1.433056092639852e-06,
"loss": 0.8252,
"step": 2640
},
{
"epoch": 1.08,
"grad_norm": 3.0648687057218624,
"learning_rate": 1.4326697325685228e-06,
"loss": 0.8147,
"step": 2641
},
{
"epoch": 1.08,
"grad_norm": 2.657347644472562,
"learning_rate": 1.4322832930189427e-06,
"loss": 0.7596,
"step": 2642
},
{
"epoch": 1.08,
"grad_norm": 2.5368452383953097,
"learning_rate": 1.4318967740620966e-06,
"loss": 0.8587,
"step": 2643
},
{
"epoch": 1.08,
"grad_norm": 2.372978377941748,
"learning_rate": 1.4315101757689862e-06,
"loss": 0.83,
"step": 2644
},
{
"epoch": 1.08,
"grad_norm": 2.747407339894967,
"learning_rate": 1.4311234982106258e-06,
"loss": 0.8541,
"step": 2645
},
{
"epoch": 1.08,
"grad_norm": 2.691778076393842,
"learning_rate": 1.430736741458046e-06,
"loss": 0.8256,
"step": 2646
},
{
"epoch": 1.08,
"grad_norm": 2.831767036849166,
"learning_rate": 1.4303499055822909e-06,
"loss": 0.8096,
"step": 2647
},
{
"epoch": 1.08,
"grad_norm": 4.7332662599287305,
"learning_rate": 1.4299629906544192e-06,
"loss": 0.8385,
"step": 2648
},
{
"epoch": 1.08,
"grad_norm": 4.865032971388529,
"learning_rate": 1.4295759967455046e-06,
"loss": 0.8418,
"step": 2649
},
{
"epoch": 1.08,
"grad_norm": 3.831464230352611,
"learning_rate": 1.429188923926635e-06,
"loss": 0.8117,
"step": 2650
},
{
"epoch": 1.08,
"grad_norm": 2.821537020707176,
"learning_rate": 1.4288017722689127e-06,
"loss": 0.8205,
"step": 2651
},
{
"epoch": 1.08,
"grad_norm": 3.6776874029234654,
"learning_rate": 1.428414541843455e-06,
"loss": 0.8135,
"step": 2652
},
{
"epoch": 1.08,
"grad_norm": 4.417176753986449,
"learning_rate": 1.4280272327213928e-06,
"loss": 0.7695,
"step": 2653
},
{
"epoch": 1.08,
"grad_norm": 3.3110745981998204,
"learning_rate": 1.4276398449738723e-06,
"loss": 0.8256,
"step": 2654
},
{
"epoch": 1.08,
"grad_norm": 2.8236634226064914,
"learning_rate": 1.4272523786720534e-06,
"loss": 0.8955,
"step": 2655
},
{
"epoch": 1.08,
"grad_norm": 2.7607573732658195,
"learning_rate": 1.4268648338871108e-06,
"loss": 0.8725,
"step": 2656
},
{
"epoch": 1.08,
"grad_norm": 3.4336428672076966,
"learning_rate": 1.4264772106902344e-06,
"loss": 0.873,
"step": 2657
},
{
"epoch": 1.08,
"grad_norm": 2.582477708944191,
"learning_rate": 1.4260895091526266e-06,
"loss": 0.791,
"step": 2658
},
{
"epoch": 1.09,
"grad_norm": 2.5175117473444573,
"learning_rate": 1.4257017293455064e-06,
"loss": 0.8856,
"step": 2659
},
{
"epoch": 1.09,
"grad_norm": 2.930343505229495,
"learning_rate": 1.4253138713401057e-06,
"loss": 0.8323,
"step": 2660
},
{
"epoch": 1.09,
"grad_norm": 3.3370714026831667,
"learning_rate": 1.4249259352076707e-06,
"loss": 0.8729,
"step": 2661
},
{
"epoch": 1.09,
"grad_norm": 3.082309408684867,
"learning_rate": 1.424537921019463e-06,
"loss": 0.742,
"step": 2662
},
{
"epoch": 1.09,
"grad_norm": 2.599346667165896,
"learning_rate": 1.4241498288467576e-06,
"loss": 0.8581,
"step": 2663
},
{
"epoch": 1.09,
"grad_norm": 2.8870422026664797,
"learning_rate": 1.4237616587608447e-06,
"loss": 0.8389,
"step": 2664
},
{
"epoch": 1.09,
"grad_norm": 3.164434673535275,
"learning_rate": 1.423373410833028e-06,
"loss": 0.8114,
"step": 2665
},
{
"epoch": 1.09,
"grad_norm": 2.7919603348181417,
"learning_rate": 1.4229850851346253e-06,
"loss": 0.8128,
"step": 2666
},
{
"epoch": 1.09,
"grad_norm": 2.8528747931675293,
"learning_rate": 1.42259668173697e-06,
"loss": 0.8611,
"step": 2667
},
{
"epoch": 1.09,
"grad_norm": 2.9919591689651477,
"learning_rate": 1.422208200711408e-06,
"loss": 0.8135,
"step": 2668
},
{
"epoch": 1.09,
"grad_norm": 2.9020515093755397,
"learning_rate": 1.4218196421293017e-06,
"loss": 0.8358,
"step": 2669
},
{
"epoch": 1.09,
"grad_norm": 3.784533422990284,
"learning_rate": 1.4214310060620251e-06,
"loss": 0.8135,
"step": 2670
},
{
"epoch": 1.09,
"grad_norm": 2.832736143374342,
"learning_rate": 1.4210422925809686e-06,
"loss": 0.8334,
"step": 2671
},
{
"epoch": 1.09,
"grad_norm": 3.093323714451922,
"learning_rate": 1.420653501757536e-06,
"loss": 0.8841,
"step": 2672
},
{
"epoch": 1.09,
"grad_norm": 2.797939928237102,
"learning_rate": 1.420264633663145e-06,
"loss": 0.8714,
"step": 2673
},
{
"epoch": 1.09,
"grad_norm": 2.8839703440799043,
"learning_rate": 1.419875688369228e-06,
"loss": 0.7239,
"step": 2674
},
{
"epoch": 1.09,
"grad_norm": 2.5240554303074223,
"learning_rate": 1.4194866659472316e-06,
"loss": 0.779,
"step": 2675
},
{
"epoch": 1.09,
"grad_norm": 2.8168267223734893,
"learning_rate": 1.4190975664686158e-06,
"loss": 0.7785,
"step": 2676
},
{
"epoch": 1.09,
"grad_norm": 3.538832018593143,
"learning_rate": 1.4187083900048556e-06,
"loss": 0.8344,
"step": 2677
},
{
"epoch": 1.09,
"grad_norm": 2.5543505564108298,
"learning_rate": 1.4183191366274403e-06,
"loss": 0.8258,
"step": 2678
},
{
"epoch": 1.09,
"grad_norm": 3.2240103892808825,
"learning_rate": 1.417929806407872e-06,
"loss": 0.7865,
"step": 2679
},
{
"epoch": 1.09,
"grad_norm": 2.816842128845322,
"learning_rate": 1.4175403994176687e-06,
"loss": 0.8265,
"step": 2680
},
{
"epoch": 1.09,
"grad_norm": 3.22584981460906,
"learning_rate": 1.4171509157283606e-06,
"loss": 0.8361,
"step": 2681
},
{
"epoch": 1.09,
"grad_norm": 2.4405963162441346,
"learning_rate": 1.4167613554114941e-06,
"loss": 0.7726,
"step": 2682
},
{
"epoch": 1.09,
"grad_norm": 2.6882634953829188,
"learning_rate": 1.4163717185386278e-06,
"loss": 0.791,
"step": 2683
},
{
"epoch": 1.1,
"grad_norm": 2.870938281303194,
"learning_rate": 1.4159820051813357e-06,
"loss": 0.8537,
"step": 2684
},
{
"epoch": 1.1,
"grad_norm": 2.8666751420669905,
"learning_rate": 1.4155922154112045e-06,
"loss": 0.8139,
"step": 2685
},
{
"epoch": 1.1,
"grad_norm": 3.1757269082450232,
"learning_rate": 1.4152023492998365e-06,
"loss": 0.8315,
"step": 2686
},
{
"epoch": 1.1,
"grad_norm": 3.282968139948733,
"learning_rate": 1.4148124069188468e-06,
"loss": 0.8266,
"step": 2687
},
{
"epoch": 1.1,
"grad_norm": 2.856215459383275,
"learning_rate": 1.4144223883398648e-06,
"loss": 0.8733,
"step": 2688
},
{
"epoch": 1.1,
"grad_norm": 3.157870547861428,
"learning_rate": 1.4140322936345345e-06,
"loss": 0.8652,
"step": 2689
},
{
"epoch": 1.1,
"grad_norm": 3.199960564344617,
"learning_rate": 1.4136421228745135e-06,
"loss": 0.8583,
"step": 2690
},
{
"epoch": 1.1,
"grad_norm": 2.7803232129115028,
"learning_rate": 1.4132518761314729e-06,
"loss": 0.8667,
"step": 2691
},
{
"epoch": 1.1,
"grad_norm": 2.678485948306619,
"learning_rate": 1.4128615534770983e-06,
"loss": 0.8129,
"step": 2692
},
{
"epoch": 1.1,
"grad_norm": 3.3529125341436044,
"learning_rate": 1.412471154983089e-06,
"loss": 0.7859,
"step": 2693
},
{
"epoch": 1.1,
"grad_norm": 2.9273005595913375,
"learning_rate": 1.4120806807211586e-06,
"loss": 0.7722,
"step": 2694
},
{
"epoch": 1.1,
"grad_norm": 2.7134454231682925,
"learning_rate": 1.4116901307630341e-06,
"loss": 0.8612,
"step": 2695
},
{
"epoch": 1.1,
"grad_norm": 3.381642281062258,
"learning_rate": 1.4112995051804567e-06,
"loss": 0.8499,
"step": 2696
},
{
"epoch": 1.1,
"grad_norm": 2.8747955485725574,
"learning_rate": 1.4109088040451815e-06,
"loss": 0.802,
"step": 2697
},
{
"epoch": 1.1,
"grad_norm": 3.986529792574988,
"learning_rate": 1.4105180274289777e-06,
"loss": 0.7751,
"step": 2698
},
{
"epoch": 1.1,
"grad_norm": 2.964841249981792,
"learning_rate": 1.4101271754036278e-06,
"loss": 0.802,
"step": 2699
},
{
"epoch": 1.1,
"grad_norm": 2.799740584631272,
"learning_rate": 1.409736248040928e-06,
"loss": 0.803,
"step": 2700
},
{
"epoch": 1.1,
"eval_loss": 0.8861318826675415,
"eval_runtime": 466.2106,
"eval_samples_per_second": 74.756,
"eval_steps_per_second": 4.674,
"step": 2700
},
{
"epoch": 1.1,
"grad_norm": 2.629492921139844,
"learning_rate": 1.4093452454126898e-06,
"loss": 0.8328,
"step": 2701
},
{
"epoch": 1.1,
"grad_norm": 3.0754430692532897,
"learning_rate": 1.4089541675907365e-06,
"loss": 0.8072,
"step": 2702
},
{
"epoch": 1.1,
"grad_norm": 3.284974307082588,
"learning_rate": 1.4085630146469073e-06,
"loss": 0.8329,
"step": 2703
},
{
"epoch": 1.1,
"grad_norm": 2.935525542240611,
"learning_rate": 1.4081717866530531e-06,
"loss": 0.8446,
"step": 2704
},
{
"epoch": 1.1,
"grad_norm": 3.0632563822032584,
"learning_rate": 1.4077804836810398e-06,
"loss": 0.7989,
"step": 2705
},
{
"epoch": 1.1,
"grad_norm": 3.3559872303010683,
"learning_rate": 1.4073891058027476e-06,
"loss": 0.8698,
"step": 2706
},
{
"epoch": 1.1,
"grad_norm": 2.690456242785317,
"learning_rate": 1.4069976530900691e-06,
"loss": 0.9183,
"step": 2707
},
{
"epoch": 1.11,
"grad_norm": 4.040934410922592,
"learning_rate": 1.4066061256149115e-06,
"loss": 0.8573,
"step": 2708
},
{
"epoch": 1.11,
"grad_norm": 2.7899247158764755,
"learning_rate": 1.4062145234491954e-06,
"loss": 0.8603,
"step": 2709
},
{
"epoch": 1.11,
"grad_norm": 2.7305516155517227,
"learning_rate": 1.4058228466648552e-06,
"loss": 0.7705,
"step": 2710
},
{
"epoch": 1.11,
"grad_norm": 3.139312988279179,
"learning_rate": 1.4054310953338391e-06,
"loss": 0.8165,
"step": 2711
},
{
"epoch": 1.11,
"grad_norm": 3.514883206276104,
"learning_rate": 1.405039269528109e-06,
"loss": 0.8836,
"step": 2712
},
{
"epoch": 1.11,
"grad_norm": 3.1792089444369607,
"learning_rate": 1.4046473693196409e-06,
"loss": 0.7584,
"step": 2713
},
{
"epoch": 1.11,
"grad_norm": 2.662443436239651,
"learning_rate": 1.404255394780423e-06,
"loss": 0.829,
"step": 2714
},
{
"epoch": 1.11,
"grad_norm": 2.589020172482419,
"learning_rate": 1.4038633459824584e-06,
"loss": 0.8089,
"step": 2715
},
{
"epoch": 1.11,
"grad_norm": 2.831906977964017,
"learning_rate": 1.4034712229977644e-06,
"loss": 0.8335,
"step": 2716
},
{
"epoch": 1.11,
"grad_norm": 2.7127803699164685,
"learning_rate": 1.4030790258983703e-06,
"loss": 0.8279,
"step": 2717
},
{
"epoch": 1.11,
"grad_norm": 2.930086607570953,
"learning_rate": 1.4026867547563201e-06,
"loss": 0.8493,
"step": 2718
},
{
"epoch": 1.11,
"grad_norm": 3.0678297714966756,
"learning_rate": 1.4022944096436714e-06,
"loss": 0.8147,
"step": 2719
},
{
"epoch": 1.11,
"grad_norm": 2.215622696071573,
"learning_rate": 1.401901990632494e-06,
"loss": 0.8101,
"step": 2720
},
{
"epoch": 1.11,
"grad_norm": 3.5247714309322076,
"learning_rate": 1.401509497794874e-06,
"loss": 0.8116,
"step": 2721
},
{
"epoch": 1.11,
"grad_norm": 3.6363219453103004,
"learning_rate": 1.4011169312029084e-06,
"loss": 0.8653,
"step": 2722
},
{
"epoch": 1.11,
"grad_norm": 3.4349931906820657,
"learning_rate": 1.400724290928709e-06,
"loss": 0.8026,
"step": 2723
},
{
"epoch": 1.11,
"grad_norm": 4.003343504108915,
"learning_rate": 1.4003315770444009e-06,
"loss": 0.7728,
"step": 2724
},
{
"epoch": 1.11,
"grad_norm": 4.377001434029206,
"learning_rate": 1.399938789622123e-06,
"loss": 0.8065,
"step": 2725
},
{
"epoch": 1.11,
"grad_norm": 3.4280335456496958,
"learning_rate": 1.3995459287340272e-06,
"loss": 0.8108,
"step": 2726
},
{
"epoch": 1.11,
"grad_norm": 3.3408532249579403,
"learning_rate": 1.3991529944522792e-06,
"loss": 0.8385,
"step": 2727
},
{
"epoch": 1.11,
"grad_norm": 3.0373346379239345,
"learning_rate": 1.3987599868490584e-06,
"loss": 0.8253,
"step": 2728
},
{
"epoch": 1.11,
"grad_norm": 2.5114791169126893,
"learning_rate": 1.398366905996557e-06,
"loss": 0.8666,
"step": 2729
},
{
"epoch": 1.11,
"grad_norm": 2.946266557023997,
"learning_rate": 1.3979737519669815e-06,
"loss": 0.897,
"step": 2730
},
{
"epoch": 1.11,
"grad_norm": 2.412556006281693,
"learning_rate": 1.3975805248325506e-06,
"loss": 0.8119,
"step": 2731
},
{
"epoch": 1.11,
"grad_norm": 3.053128853163046,
"learning_rate": 1.3971872246654987e-06,
"loss": 0.7935,
"step": 2732
},
{
"epoch": 1.12,
"grad_norm": 3.2830726325656436,
"learning_rate": 1.3967938515380706e-06,
"loss": 0.8462,
"step": 2733
},
{
"epoch": 1.12,
"grad_norm": 2.9055430146649317,
"learning_rate": 1.3964004055225274e-06,
"loss": 0.8212,
"step": 2734
},
{
"epoch": 1.12,
"grad_norm": 3.8667704992692644,
"learning_rate": 1.396006886691141e-06,
"loss": 0.8973,
"step": 2735
},
{
"epoch": 1.12,
"grad_norm": 2.4938872457840184,
"learning_rate": 1.3956132951161988e-06,
"loss": 0.7557,
"step": 2736
},
{
"epoch": 1.12,
"grad_norm": 2.8030384266023662,
"learning_rate": 1.39521963087e-06,
"loss": 0.8707,
"step": 2737
},
{
"epoch": 1.12,
"grad_norm": 2.8783639256873097,
"learning_rate": 1.3948258940248586e-06,
"loss": 0.7949,
"step": 2738
},
{
"epoch": 1.12,
"grad_norm": 3.2420527853062038,
"learning_rate": 1.3944320846531006e-06,
"loss": 0.8006,
"step": 2739
},
{
"epoch": 1.12,
"grad_norm": 2.7916360488152234,
"learning_rate": 1.394038202827066e-06,
"loss": 0.8917,
"step": 2740
},
{
"epoch": 1.12,
"grad_norm": 3.1439258191052524,
"learning_rate": 1.393644248619108e-06,
"loss": 0.873,
"step": 2741
},
{
"epoch": 1.12,
"grad_norm": 2.4400138111246616,
"learning_rate": 1.3932502221015935e-06,
"loss": 0.8332,
"step": 2742
},
{
"epoch": 1.12,
"grad_norm": 2.7703449088602565,
"learning_rate": 1.3928561233469013e-06,
"loss": 0.8546,
"step": 2743
},
{
"epoch": 1.12,
"grad_norm": 2.596378708497507,
"learning_rate": 1.3924619524274254e-06,
"loss": 0.7669,
"step": 2744
},
{
"epoch": 1.12,
"grad_norm": 3.2311831111194427,
"learning_rate": 1.3920677094155722e-06,
"loss": 0.8728,
"step": 2745
},
{
"epoch": 1.12,
"grad_norm": 3.1787012057165747,
"learning_rate": 1.3916733943837601e-06,
"loss": 0.7763,
"step": 2746
},
{
"epoch": 1.12,
"grad_norm": 2.481365670061211,
"learning_rate": 1.3912790074044232e-06,
"loss": 0.7872,
"step": 2747
},
{
"epoch": 1.12,
"grad_norm": 3.1195585343188807,
"learning_rate": 1.3908845485500064e-06,
"loss": 0.8262,
"step": 2748
},
{
"epoch": 1.12,
"grad_norm": 3.2109171810301835,
"learning_rate": 1.3904900178929696e-06,
"loss": 0.7683,
"step": 2749
},
{
"epoch": 1.12,
"grad_norm": 3.1778887631466417,
"learning_rate": 1.3900954155057852e-06,
"loss": 0.8544,
"step": 2750
},
{
"epoch": 1.12,
"grad_norm": 3.1633792079177483,
"learning_rate": 1.389700741460938e-06,
"loss": 0.7447,
"step": 2751
},
{
"epoch": 1.12,
"grad_norm": 2.6225038675974433,
"learning_rate": 1.3893059958309277e-06,
"loss": 0.8137,
"step": 2752
},
{
"epoch": 1.12,
"grad_norm": 2.5203313399859426,
"learning_rate": 1.3889111786882658e-06,
"loss": 0.7715,
"step": 2753
},
{
"epoch": 1.12,
"grad_norm": 2.779193161521782,
"learning_rate": 1.3885162901054765e-06,
"loss": 0.8048,
"step": 2754
},
{
"epoch": 1.12,
"grad_norm": 2.73343472271838,
"learning_rate": 1.3881213301550993e-06,
"loss": 0.8246,
"step": 2755
},
{
"epoch": 1.12,
"grad_norm": 2.496964255997303,
"learning_rate": 1.3877262989096846e-06,
"loss": 0.8951,
"step": 2756
},
{
"epoch": 1.13,
"grad_norm": 2.6631031522822886,
"learning_rate": 1.387331196441797e-06,
"loss": 0.8027,
"step": 2757
},
{
"epoch": 1.13,
"grad_norm": 2.975439585078698,
"learning_rate": 1.3869360228240138e-06,
"loss": 0.8481,
"step": 2758
},
{
"epoch": 1.13,
"grad_norm": 3.1449328801569143,
"learning_rate": 1.3865407781289253e-06,
"loss": 0.8352,
"step": 2759
},
{
"epoch": 1.13,
"grad_norm": 3.003168473358607,
"learning_rate": 1.3861454624291358e-06,
"loss": 0.8582,
"step": 2760
},
{
"epoch": 1.13,
"grad_norm": 2.6542383791524897,
"learning_rate": 1.385750075797261e-06,
"loss": 0.8047,
"step": 2761
},
{
"epoch": 1.13,
"grad_norm": 2.78446777213948,
"learning_rate": 1.3853546183059312e-06,
"loss": 0.7698,
"step": 2762
},
{
"epoch": 1.13,
"grad_norm": 2.9428150613899637,
"learning_rate": 1.3849590900277885e-06,
"loss": 0.8038,
"step": 2763
},
{
"epoch": 1.13,
"grad_norm": 4.347338951989065,
"learning_rate": 1.384563491035489e-06,
"loss": 0.7174,
"step": 2764
},
{
"epoch": 1.13,
"grad_norm": 2.771035872754003,
"learning_rate": 1.384167821401701e-06,
"loss": 0.7574,
"step": 2765
},
{
"epoch": 1.13,
"grad_norm": 3.3441399650540844,
"learning_rate": 1.3837720811991064e-06,
"loss": 0.8192,
"step": 2766
},
{
"epoch": 1.13,
"grad_norm": 2.7295661937322824,
"learning_rate": 1.383376270500399e-06,
"loss": 0.7468,
"step": 2767
},
{
"epoch": 1.13,
"grad_norm": 3.7790277854561243,
"learning_rate": 1.3829803893782877e-06,
"loss": 0.8146,
"step": 2768
},
{
"epoch": 1.13,
"grad_norm": 3.063031321809145,
"learning_rate": 1.3825844379054914e-06,
"loss": 0.8703,
"step": 2769
},
{
"epoch": 1.13,
"grad_norm": 2.4870554393365145,
"learning_rate": 1.3821884161547449e-06,
"loss": 0.825,
"step": 2770
},
{
"epoch": 1.13,
"grad_norm": 2.973794535346506,
"learning_rate": 1.3817923241987935e-06,
"loss": 0.8367,
"step": 2771
},
{
"epoch": 1.13,
"grad_norm": 2.9823366410658614,
"learning_rate": 1.3813961621103966e-06,
"loss": 0.8064,
"step": 2772
},
{
"epoch": 1.13,
"grad_norm": 3.6525460238494163,
"learning_rate": 1.3809999299623263e-06,
"loss": 0.8097,
"step": 2773
},
{
"epoch": 1.13,
"grad_norm": 2.7196172183857956,
"learning_rate": 1.380603627827368e-06,
"loss": 0.7818,
"step": 2774
},
{
"epoch": 1.13,
"grad_norm": 2.478716270132258,
"learning_rate": 1.3802072557783186e-06,
"loss": 0.8645,
"step": 2775
},
{
"epoch": 1.13,
"grad_norm": 2.841459251880193,
"learning_rate": 1.3798108138879896e-06,
"loss": 0.7929,
"step": 2776
},
{
"epoch": 1.13,
"grad_norm": 4.013271759654359,
"learning_rate": 1.379414302229204e-06,
"loss": 0.7859,
"step": 2777
},
{
"epoch": 1.13,
"grad_norm": 2.9129664750453923,
"learning_rate": 1.3790177208747982e-06,
"loss": 0.8214,
"step": 2778
},
{
"epoch": 1.13,
"grad_norm": 2.9205132343754987,
"learning_rate": 1.378621069897621e-06,
"loss": 0.8416,
"step": 2779
},
{
"epoch": 1.13,
"grad_norm": 2.5940495775854155,
"learning_rate": 1.3782243493705347e-06,
"loss": 0.8489,
"step": 2780
},
{
"epoch": 1.13,
"grad_norm": 2.821794921653016,
"learning_rate": 1.377827559366414e-06,
"loss": 0.8389,
"step": 2781
},
{
"epoch": 1.14,
"grad_norm": 4.638259631617055,
"learning_rate": 1.3774306999581457e-06,
"loss": 0.8438,
"step": 2782
},
{
"epoch": 1.14,
"grad_norm": 3.8253584647601975,
"learning_rate": 1.3770337712186305e-06,
"loss": 0.8719,
"step": 2783
},
{
"epoch": 1.14,
"grad_norm": 2.46420137179421,
"learning_rate": 1.3766367732207812e-06,
"loss": 0.7933,
"step": 2784
},
{
"epoch": 1.14,
"grad_norm": 3.945166058215337,
"learning_rate": 1.3762397060375234e-06,
"loss": 0.8016,
"step": 2785
},
{
"epoch": 1.14,
"grad_norm": 3.9437900792591707,
"learning_rate": 1.3758425697417952e-06,
"loss": 0.8644,
"step": 2786
},
{
"epoch": 1.14,
"grad_norm": 2.8259968046042547,
"learning_rate": 1.375445364406548e-06,
"loss": 0.8096,
"step": 2787
},
{
"epoch": 1.14,
"grad_norm": 3.3048620578528713,
"learning_rate": 1.375048090104745e-06,
"loss": 0.8151,
"step": 2788
},
{
"epoch": 1.14,
"grad_norm": 4.898634898425581,
"learning_rate": 1.3746507469093631e-06,
"loss": 0.7776,
"step": 2789
},
{
"epoch": 1.14,
"grad_norm": 2.812843572690211,
"learning_rate": 1.3742533348933909e-06,
"loss": 0.8124,
"step": 2790
},
{
"epoch": 1.14,
"grad_norm": 2.5760591289589243,
"learning_rate": 1.3738558541298303e-06,
"loss": 0.8575,
"step": 2791
},
{
"epoch": 1.14,
"grad_norm": 3.031388109149295,
"learning_rate": 1.3734583046916956e-06,
"loss": 0.8408,
"step": 2792
},
{
"epoch": 1.14,
"grad_norm": 2.891196647330614,
"learning_rate": 1.3730606866520135e-06,
"loss": 0.846,
"step": 2793
},
{
"epoch": 1.14,
"grad_norm": 3.142702851999142,
"learning_rate": 1.3726630000838241e-06,
"loss": 0.7977,
"step": 2794
},
{
"epoch": 1.14,
"grad_norm": 2.9299255502160233,
"learning_rate": 1.3722652450601785e-06,
"loss": 0.8335,
"step": 2795
},
{
"epoch": 1.14,
"grad_norm": 2.6174405111203596,
"learning_rate": 1.3718674216541426e-06,
"loss": 0.7924,
"step": 2796
},
{
"epoch": 1.14,
"grad_norm": 2.4382238567201457,
"learning_rate": 1.3714695299387927e-06,
"loss": 0.8203,
"step": 2797
},
{
"epoch": 1.14,
"grad_norm": 2.816040578736067,
"learning_rate": 1.3710715699872188e-06,
"loss": 0.8044,
"step": 2798
},
{
"epoch": 1.14,
"grad_norm": 3.34506263920839,
"learning_rate": 1.3706735418725235e-06,
"loss": 0.7602,
"step": 2799
},
{
"epoch": 1.14,
"grad_norm": 3.3986759562482862,
"learning_rate": 1.3702754456678211e-06,
"loss": 0.7948,
"step": 2800
},
{
"epoch": 1.14,
"eval_loss": 0.8856059908866882,
"eval_runtime": 466.7688,
"eval_samples_per_second": 74.667,
"eval_steps_per_second": 4.668,
"step": 2800
},
{
"epoch": 1.14,
"grad_norm": 2.6962823075554203,
"learning_rate": 1.3698772814462397e-06,
"loss": 0.8364,
"step": 2801
},
{
"epoch": 1.14,
"grad_norm": 3.3266628921365498,
"learning_rate": 1.3694790492809184e-06,
"loss": 0.8222,
"step": 2802
},
{
"epoch": 1.14,
"grad_norm": 3.3966933963079136,
"learning_rate": 1.3690807492450099e-06,
"loss": 0.7922,
"step": 2803
},
{
"epoch": 1.14,
"grad_norm": 3.0423327890220233,
"learning_rate": 1.368682381411679e-06,
"loss": 0.8739,
"step": 2804
},
{
"epoch": 1.14,
"grad_norm": 2.6938934278714046,
"learning_rate": 1.368283945854103e-06,
"loss": 0.7969,
"step": 2805
},
{
"epoch": 1.15,
"grad_norm": 2.5103559612712996,
"learning_rate": 1.367885442645471e-06,
"loss": 0.8355,
"step": 2806
},
{
"epoch": 1.15,
"grad_norm": 3.666286250591398,
"learning_rate": 1.3674868718589864e-06,
"loss": 0.8065,
"step": 2807
},
{
"epoch": 1.15,
"grad_norm": 3.037156079897424,
"learning_rate": 1.367088233567862e-06,
"loss": 0.8748,
"step": 2808
},
{
"epoch": 1.15,
"grad_norm": 2.411197743673201,
"learning_rate": 1.3666895278453266e-06,
"loss": 0.8034,
"step": 2809
},
{
"epoch": 1.15,
"grad_norm": 3.049131117368969,
"learning_rate": 1.366290754764618e-06,
"loss": 0.8244,
"step": 2810
},
{
"epoch": 1.15,
"grad_norm": 3.465510595106703,
"learning_rate": 1.3658919143989885e-06,
"loss": 0.8572,
"step": 2811
},
{
"epoch": 1.15,
"grad_norm": 3.2439638427321533,
"learning_rate": 1.3654930068217021e-06,
"loss": 0.8002,
"step": 2812
},
{
"epoch": 1.15,
"grad_norm": 2.86372255453748,
"learning_rate": 1.3650940321060352e-06,
"loss": 0.8305,
"step": 2813
},
{
"epoch": 1.15,
"grad_norm": 3.276483516598674,
"learning_rate": 1.3646949903252764e-06,
"loss": 0.7893,
"step": 2814
},
{
"epoch": 1.15,
"grad_norm": 3.4390099002137826,
"learning_rate": 1.3642958815527271e-06,
"loss": 0.8607,
"step": 2815
},
{
"epoch": 1.15,
"grad_norm": 2.716307628940663,
"learning_rate": 1.3638967058617e-06,
"loss": 0.8285,
"step": 2816
},
{
"epoch": 1.15,
"grad_norm": 3.867865880587452,
"learning_rate": 1.3634974633255213e-06,
"loss": 0.8167,
"step": 2817
},
{
"epoch": 1.15,
"grad_norm": 2.9684691782588835,
"learning_rate": 1.3630981540175287e-06,
"loss": 0.8448,
"step": 2818
},
{
"epoch": 1.15,
"grad_norm": 2.90047063673236,
"learning_rate": 1.3626987780110723e-06,
"loss": 0.8478,
"step": 2819
},
{
"epoch": 1.15,
"grad_norm": 3.6003263092298456,
"learning_rate": 1.3622993353795153e-06,
"loss": 0.8232,
"step": 2820
},
{
"epoch": 1.15,
"grad_norm": 2.6482798105622494,
"learning_rate": 1.3618998261962311e-06,
"loss": 0.8379,
"step": 2821
},
{
"epoch": 1.15,
"grad_norm": 3.0107792305523873,
"learning_rate": 1.3615002505346077e-06,
"loss": 0.8374,
"step": 2822
},
{
"epoch": 1.15,
"grad_norm": 2.9001032137574123,
"learning_rate": 1.3611006084680436e-06,
"loss": 0.8165,
"step": 2823
},
{
"epoch": 1.15,
"grad_norm": 2.572531962286574,
"learning_rate": 1.3607009000699503e-06,
"loss": 0.8821,
"step": 2824
},
{
"epoch": 1.15,
"grad_norm": 2.876044749711604,
"learning_rate": 1.3603011254137511e-06,
"loss": 0.8128,
"step": 2825
},
{
"epoch": 1.15,
"grad_norm": 3.1199880045818618,
"learning_rate": 1.3599012845728822e-06,
"loss": 0.8513,
"step": 2826
},
{
"epoch": 1.15,
"grad_norm": 2.716779339609765,
"learning_rate": 1.359501377620791e-06,
"loss": 0.824,
"step": 2827
},
{
"epoch": 1.15,
"grad_norm": 2.4668659731419447,
"learning_rate": 1.359101404630937e-06,
"loss": 0.8324,
"step": 2828
},
{
"epoch": 1.15,
"grad_norm": 2.8351086703577226,
"learning_rate": 1.3587013656767933e-06,
"loss": 0.8097,
"step": 2829
},
{
"epoch": 1.15,
"grad_norm": 2.840904387589142,
"learning_rate": 1.358301260831844e-06,
"loss": 0.8203,
"step": 2830
},
{
"epoch": 1.16,
"grad_norm": 2.694000448102514,
"learning_rate": 1.3579010901695843e-06,
"loss": 0.8775,
"step": 2831
},
{
"epoch": 1.16,
"grad_norm": 3.196832398638553,
"learning_rate": 1.357500853763524e-06,
"loss": 0.8829,
"step": 2832
},
{
"epoch": 1.16,
"grad_norm": 3.055489108171588,
"learning_rate": 1.3571005516871829e-06,
"loss": 0.7733,
"step": 2833
},
{
"epoch": 1.16,
"grad_norm": 2.3547449762694086,
"learning_rate": 1.3567001840140932e-06,
"loss": 0.8332,
"step": 2834
},
{
"epoch": 1.16,
"grad_norm": 3.251937101678169,
"learning_rate": 1.3562997508178008e-06,
"loss": 0.8593,
"step": 2835
},
{
"epoch": 1.16,
"grad_norm": 2.862229495082667,
"learning_rate": 1.3558992521718607e-06,
"loss": 0.892,
"step": 2836
},
{
"epoch": 1.16,
"grad_norm": 2.7487407965738564,
"learning_rate": 1.3554986881498426e-06,
"loss": 0.8257,
"step": 2837
},
{
"epoch": 1.16,
"grad_norm": 2.946415945964073,
"learning_rate": 1.3550980588253268e-06,
"loss": 0.8167,
"step": 2838
},
{
"epoch": 1.16,
"grad_norm": 2.523893368821902,
"learning_rate": 1.3546973642719061e-06,
"loss": 0.8544,
"step": 2839
},
{
"epoch": 1.16,
"grad_norm": 2.8218349935870877,
"learning_rate": 1.3542966045631849e-06,
"loss": 0.8475,
"step": 2840
},
{
"epoch": 1.16,
"grad_norm": 3.635485086844598,
"learning_rate": 1.3538957797727805e-06,
"loss": 0.8395,
"step": 2841
},
{
"epoch": 1.16,
"grad_norm": 2.599740580888454,
"learning_rate": 1.3534948899743206e-06,
"loss": 0.8843,
"step": 2842
},
{
"epoch": 1.16,
"grad_norm": 3.0965913190352894,
"learning_rate": 1.3530939352414462e-06,
"loss": 0.7869,
"step": 2843
},
{
"epoch": 1.16,
"grad_norm": 3.095184993992893,
"learning_rate": 1.3526929156478098e-06,
"loss": 0.8344,
"step": 2844
},
{
"epoch": 1.16,
"grad_norm": 3.392977399717502,
"learning_rate": 1.3522918312670757e-06,
"loss": 0.7697,
"step": 2845
},
{
"epoch": 1.16,
"grad_norm": 3.196936442158172,
"learning_rate": 1.3518906821729197e-06,
"loss": 0.8475,
"step": 2846
},
{
"epoch": 1.16,
"grad_norm": 2.632109316875173,
"learning_rate": 1.3514894684390301e-06,
"loss": 0.8626,
"step": 2847
},
{
"epoch": 1.16,
"grad_norm": 3.200431376583602,
"learning_rate": 1.3510881901391078e-06,
"loss": 0.8573,
"step": 2848
},
{
"epoch": 1.16,
"grad_norm": 2.4998871909764917,
"learning_rate": 1.3506868473468635e-06,
"loss": 0.8321,
"step": 2849
},
{
"epoch": 1.16,
"grad_norm": 3.7081568862554457,
"learning_rate": 1.3502854401360216e-06,
"loss": 0.8207,
"step": 2850
},
{
"epoch": 1.16,
"grad_norm": 3.0304110035128473,
"learning_rate": 1.3498839685803172e-06,
"loss": 0.8383,
"step": 2851
},
{
"epoch": 1.16,
"grad_norm": 3.1082583587571313,
"learning_rate": 1.3494824327534982e-06,
"loss": 0.7988,
"step": 2852
},
{
"epoch": 1.16,
"grad_norm": 2.6704144780084533,
"learning_rate": 1.3490808327293236e-06,
"loss": 0.8255,
"step": 2853
},
{
"epoch": 1.16,
"grad_norm": 3.348289825172023,
"learning_rate": 1.3486791685815639e-06,
"loss": 0.7995,
"step": 2854
},
{
"epoch": 1.17,
"grad_norm": 3.047600948717146,
"learning_rate": 1.3482774403840026e-06,
"loss": 0.8395,
"step": 2855
},
{
"epoch": 1.17,
"grad_norm": 2.7552949237114404,
"learning_rate": 1.347875648210434e-06,
"loss": 0.8644,
"step": 2856
},
{
"epoch": 1.17,
"grad_norm": 2.4566524959224028,
"learning_rate": 1.3474737921346634e-06,
"loss": 0.8412,
"step": 2857
},
{
"epoch": 1.17,
"grad_norm": 4.203604945403258,
"learning_rate": 1.3470718722305105e-06,
"loss": 0.7496,
"step": 2858
},
{
"epoch": 1.17,
"grad_norm": 5.129537799360939,
"learning_rate": 1.346669888571804e-06,
"loss": 0.8104,
"step": 2859
},
{
"epoch": 1.17,
"grad_norm": 2.932918557366475,
"learning_rate": 1.3462678412323854e-06,
"loss": 0.8383,
"step": 2860
},
{
"epoch": 1.17,
"grad_norm": 3.141385951406596,
"learning_rate": 1.345865730286108e-06,
"loss": 0.8854,
"step": 2861
},
{
"epoch": 1.17,
"grad_norm": 4.282090646864782,
"learning_rate": 1.3454635558068365e-06,
"loss": 0.8578,
"step": 2862
},
{
"epoch": 1.17,
"grad_norm": 2.7861598013919937,
"learning_rate": 1.3450613178684476e-06,
"loss": 0.8932,
"step": 2863
},
{
"epoch": 1.17,
"grad_norm": 3.023611300697987,
"learning_rate": 1.3446590165448292e-06,
"loss": 0.7625,
"step": 2864
},
{
"epoch": 1.17,
"grad_norm": 2.768160981163771,
"learning_rate": 1.3442566519098815e-06,
"loss": 0.8583,
"step": 2865
},
{
"epoch": 1.17,
"grad_norm": 2.5586660287783043,
"learning_rate": 1.3438542240375154e-06,
"loss": 0.8044,
"step": 2866
},
{
"epoch": 1.17,
"grad_norm": 2.7620441106978935,
"learning_rate": 1.3434517330016543e-06,
"loss": 0.7645,
"step": 2867
},
{
"epoch": 1.17,
"grad_norm": 2.759933857192536,
"learning_rate": 1.3430491788762331e-06,
"loss": 0.7893,
"step": 2868
},
{
"epoch": 1.17,
"grad_norm": 3.3083039860658174,
"learning_rate": 1.3426465617351976e-06,
"loss": 0.9048,
"step": 2869
},
{
"epoch": 1.17,
"grad_norm": 2.8587452340953075,
"learning_rate": 1.3422438816525051e-06,
"loss": 0.8463,
"step": 2870
},
{
"epoch": 1.17,
"grad_norm": 3.231118249498831,
"learning_rate": 1.3418411387021265e-06,
"loss": 0.81,
"step": 2871
},
{
"epoch": 1.17,
"grad_norm": 2.6581444135274985,
"learning_rate": 1.3414383329580412e-06,
"loss": 0.8106,
"step": 2872
},
{
"epoch": 1.17,
"grad_norm": 2.613529931762444,
"learning_rate": 1.3410354644942423e-06,
"loss": 0.8877,
"step": 2873
},
{
"epoch": 1.17,
"grad_norm": 2.874073601851782,
"learning_rate": 1.3406325333847339e-06,
"loss": 0.9318,
"step": 2874
},
{
"epoch": 1.17,
"grad_norm": 3.4361791402151596,
"learning_rate": 1.3402295397035312e-06,
"loss": 0.9196,
"step": 2875
},
{
"epoch": 1.17,
"grad_norm": 2.83850961335454,
"learning_rate": 1.3398264835246616e-06,
"loss": 0.793,
"step": 2876
},
{
"epoch": 1.17,
"grad_norm": 2.9639141883229234,
"learning_rate": 1.339423364922163e-06,
"loss": 0.8414,
"step": 2877
},
{
"epoch": 1.17,
"grad_norm": 2.484444200223558,
"learning_rate": 1.3390201839700855e-06,
"loss": 0.8411,
"step": 2878
},
{
"epoch": 1.17,
"grad_norm": 2.709790211115155,
"learning_rate": 1.3386169407424905e-06,
"loss": 0.8572,
"step": 2879
},
{
"epoch": 1.18,
"grad_norm": 2.476022892493722,
"learning_rate": 1.3382136353134513e-06,
"loss": 0.8103,
"step": 2880
},
{
"epoch": 1.18,
"grad_norm": 2.723998714281562,
"learning_rate": 1.3378102677570514e-06,
"loss": 0.7682,
"step": 2881
},
{
"epoch": 1.18,
"grad_norm": 3.2995985859903385,
"learning_rate": 1.337406838147387e-06,
"loss": 0.7821,
"step": 2882
},
{
"epoch": 1.18,
"grad_norm": 3.287822424035108,
"learning_rate": 1.3370033465585646e-06,
"loss": 0.8327,
"step": 2883
},
{
"epoch": 1.18,
"grad_norm": 2.6920354181095734,
"learning_rate": 1.3365997930647034e-06,
"loss": 0.8093,
"step": 2884
},
{
"epoch": 1.18,
"grad_norm": 2.95217277881394,
"learning_rate": 1.3361961777399326e-06,
"loss": 0.8519,
"step": 2885
},
{
"epoch": 1.18,
"grad_norm": 2.976323085439001,
"learning_rate": 1.3357925006583937e-06,
"loss": 0.8257,
"step": 2886
},
{
"epoch": 1.18,
"grad_norm": 4.174169537789278,
"learning_rate": 1.3353887618942389e-06,
"loss": 0.7974,
"step": 2887
},
{
"epoch": 1.18,
"grad_norm": 4.614749536740232,
"learning_rate": 1.3349849615216324e-06,
"loss": 0.782,
"step": 2888
},
{
"epoch": 1.18,
"grad_norm": 2.7166871896695417,
"learning_rate": 1.334581099614749e-06,
"loss": 0.8423,
"step": 2889
},
{
"epoch": 1.18,
"grad_norm": 3.6194862003887462,
"learning_rate": 1.3341771762477754e-06,
"loss": 0.8609,
"step": 2890
},
{
"epoch": 1.18,
"grad_norm": 3.31127941952909,
"learning_rate": 1.3337731914949096e-06,
"loss": 0.8044,
"step": 2891
},
{
"epoch": 1.18,
"grad_norm": 3.2353604781580194,
"learning_rate": 1.3333691454303599e-06,
"loss": 0.8268,
"step": 2892
},
{
"epoch": 1.18,
"grad_norm": 4.400927675926693,
"learning_rate": 1.3329650381283473e-06,
"loss": 0.7424,
"step": 2893
},
{
"epoch": 1.18,
"grad_norm": 5.001781827953298,
"learning_rate": 1.332560869663103e-06,
"loss": 0.7775,
"step": 2894
},
{
"epoch": 1.18,
"grad_norm": 3.2014155834722695,
"learning_rate": 1.33215664010887e-06,
"loss": 0.8489,
"step": 2895
},
{
"epoch": 1.18,
"grad_norm": 3.9846329648021848,
"learning_rate": 1.3317523495399017e-06,
"loss": 0.8215,
"step": 2896
},
{
"epoch": 1.18,
"grad_norm": 4.026458020278095,
"learning_rate": 1.3313479980304645e-06,
"loss": 0.929,
"step": 2897
},
{
"epoch": 1.18,
"grad_norm": 3.55295550825035,
"learning_rate": 1.3309435856548334e-06,
"loss": 0.7941,
"step": 2898
},
{
"epoch": 1.18,
"grad_norm": 3.14653205091117,
"learning_rate": 1.330539112487297e-06,
"loss": 0.8189,
"step": 2899
},
{
"epoch": 1.18,
"grad_norm": 3.29294861346024,
"learning_rate": 1.3301345786021537e-06,
"loss": 0.8492,
"step": 2900
},
{
"epoch": 1.18,
"eval_loss": 0.8848859071731567,
"eval_runtime": 466.7289,
"eval_samples_per_second": 74.673,
"eval_steps_per_second": 4.669,
"step": 2900
},
{
"epoch": 1.18,
"grad_norm": 2.5876254403952497,
"learning_rate": 1.3297299840737133e-06,
"loss": 0.7939,
"step": 2901
},
{
"epoch": 1.18,
"grad_norm": 2.3138780687591516,
"learning_rate": 1.3293253289762968e-06,
"loss": 0.8725,
"step": 2902
},
{
"epoch": 1.18,
"grad_norm": 2.631172291430015,
"learning_rate": 1.3289206133842368e-06,
"loss": 0.7964,
"step": 2903
},
{
"epoch": 1.19,
"grad_norm": 2.7916361808621706,
"learning_rate": 1.328515837371876e-06,
"loss": 0.8903,
"step": 2904
},
{
"epoch": 1.19,
"grad_norm": 2.6237470294348544,
"learning_rate": 1.3281110010135695e-06,
"loss": 0.8399,
"step": 2905
},
{
"epoch": 1.19,
"grad_norm": 3.7520766309021276,
"learning_rate": 1.327706104383682e-06,
"loss": 0.8001,
"step": 2906
},
{
"epoch": 1.19,
"grad_norm": 2.94617386074089,
"learning_rate": 1.3273011475565902e-06,
"loss": 0.8396,
"step": 2907
},
{
"epoch": 1.19,
"grad_norm": 3.029724189046823,
"learning_rate": 1.3268961306066824e-06,
"loss": 0.8347,
"step": 2908
},
{
"epoch": 1.19,
"grad_norm": 3.4627874558441163,
"learning_rate": 1.3264910536083562e-06,
"loss": 0.8044,
"step": 2909
},
{
"epoch": 1.19,
"grad_norm": 3.0853762940174634,
"learning_rate": 1.3260859166360223e-06,
"loss": 0.7937,
"step": 2910
},
{
"epoch": 1.19,
"grad_norm": 3.2099894762394765,
"learning_rate": 1.3256807197641007e-06,
"loss": 0.8303,
"step": 2911
},
{
"epoch": 1.19,
"grad_norm": 3.319062998527478,
"learning_rate": 1.325275463067023e-06,
"loss": 0.8504,
"step": 2912
},
{
"epoch": 1.19,
"grad_norm": 3.0451783578396308,
"learning_rate": 1.3248701466192323e-06,
"loss": 0.7894,
"step": 2913
},
{
"epoch": 1.19,
"grad_norm": 2.513335723726552,
"learning_rate": 1.3244647704951821e-06,
"loss": 0.8931,
"step": 2914
},
{
"epoch": 1.19,
"grad_norm": 3.2779081190026713,
"learning_rate": 1.324059334769337e-06,
"loss": 0.8226,
"step": 2915
},
{
"epoch": 1.19,
"grad_norm": 3.0440753719531917,
"learning_rate": 1.3236538395161724e-06,
"loss": 0.9096,
"step": 2916
},
{
"epoch": 1.19,
"grad_norm": 3.450422002922723,
"learning_rate": 1.323248284810175e-06,
"loss": 0.7993,
"step": 2917
},
{
"epoch": 1.19,
"grad_norm": 4.650546050262956,
"learning_rate": 1.3228426707258427e-06,
"loss": 0.7516,
"step": 2918
},
{
"epoch": 1.19,
"grad_norm": 2.9876860766757307,
"learning_rate": 1.3224369973376825e-06,
"loss": 0.8853,
"step": 2919
},
{
"epoch": 1.19,
"grad_norm": 3.350641963312487,
"learning_rate": 1.3220312647202149e-06,
"loss": 0.8304,
"step": 2920
},
{
"epoch": 1.19,
"grad_norm": 3.29780110501639,
"learning_rate": 1.3216254729479697e-06,
"loss": 0.8945,
"step": 2921
},
{
"epoch": 1.19,
"grad_norm": 2.6844479733598927,
"learning_rate": 1.3212196220954873e-06,
"loss": 0.8622,
"step": 2922
},
{
"epoch": 1.19,
"grad_norm": 3.2962308946876893,
"learning_rate": 1.32081371223732e-06,
"loss": 0.9113,
"step": 2923
},
{
"epoch": 1.19,
"grad_norm": 3.2926538254422932,
"learning_rate": 1.3204077434480308e-06,
"loss": 0.8458,
"step": 2924
},
{
"epoch": 1.19,
"grad_norm": 2.564414035538575,
"learning_rate": 1.3200017158021923e-06,
"loss": 0.8321,
"step": 2925
},
{
"epoch": 1.19,
"grad_norm": 3.6322626015756674,
"learning_rate": 1.3195956293743896e-06,
"loss": 0.842,
"step": 2926
},
{
"epoch": 1.19,
"grad_norm": 3.1972250429295674,
"learning_rate": 1.3191894842392173e-06,
"loss": 0.8321,
"step": 2927
},
{
"epoch": 1.19,
"grad_norm": 2.860303840442916,
"learning_rate": 1.3187832804712812e-06,
"loss": 0.7998,
"step": 2928
},
{
"epoch": 1.2,
"grad_norm": 2.734612758167986,
"learning_rate": 1.3183770181451986e-06,
"loss": 0.8268,
"step": 2929
},
{
"epoch": 1.2,
"grad_norm": 2.540271476694354,
"learning_rate": 1.3179706973355962e-06,
"loss": 0.8682,
"step": 2930
},
{
"epoch": 1.2,
"grad_norm": 3.7697504348139477,
"learning_rate": 1.3175643181171125e-06,
"loss": 0.8547,
"step": 2931
},
{
"epoch": 1.2,
"grad_norm": 2.7743123238240246,
"learning_rate": 1.3171578805643964e-06,
"loss": 0.8255,
"step": 2932
},
{
"epoch": 1.2,
"grad_norm": 2.725938276919847,
"learning_rate": 1.3167513847521075e-06,
"loss": 0.8269,
"step": 2933
},
{
"epoch": 1.2,
"grad_norm": 2.712113491439464,
"learning_rate": 1.3163448307549158e-06,
"loss": 0.7915,
"step": 2934
},
{
"epoch": 1.2,
"grad_norm": 3.595995253624046,
"learning_rate": 1.3159382186475024e-06,
"loss": 0.8365,
"step": 2935
},
{
"epoch": 1.2,
"grad_norm": 3.2551513187809284,
"learning_rate": 1.3155315485045593e-06,
"loss": 0.7958,
"step": 2936
},
{
"epoch": 1.2,
"grad_norm": 2.7384178401718406,
"learning_rate": 1.3151248204007884e-06,
"loss": 0.827,
"step": 2937
},
{
"epoch": 1.2,
"grad_norm": 3.2045601936265236,
"learning_rate": 1.314718034410903e-06,
"loss": 0.8018,
"step": 2938
},
{
"epoch": 1.2,
"grad_norm": 2.609536885380519,
"learning_rate": 1.3143111906096263e-06,
"loss": 0.8547,
"step": 2939
},
{
"epoch": 1.2,
"grad_norm": 3.000529340151092,
"learning_rate": 1.3139042890716927e-06,
"loss": 0.7896,
"step": 2940
},
{
"epoch": 1.2,
"grad_norm": 2.886649996180552,
"learning_rate": 1.3134973298718476e-06,
"loss": 0.8341,
"step": 2941
},
{
"epoch": 1.2,
"grad_norm": 2.5391435643854305,
"learning_rate": 1.3130903130848453e-06,
"loss": 0.8608,
"step": 2942
},
{
"epoch": 1.2,
"grad_norm": 2.6317442089063223,
"learning_rate": 1.3126832387854525e-06,
"loss": 0.8727,
"step": 2943
},
{
"epoch": 1.2,
"grad_norm": 3.366546971712009,
"learning_rate": 1.3122761070484458e-06,
"loss": 0.8286,
"step": 2944
},
{
"epoch": 1.2,
"grad_norm": 2.7209224242573686,
"learning_rate": 1.3118689179486118e-06,
"loss": 0.8415,
"step": 2945
},
{
"epoch": 1.2,
"grad_norm": 3.395227468597177,
"learning_rate": 1.311461671560749e-06,
"loss": 0.8293,
"step": 2946
},
{
"epoch": 1.2,
"grad_norm": 2.685494840887225,
"learning_rate": 1.3110543679596648e-06,
"loss": 0.8527,
"step": 2947
},
{
"epoch": 1.2,
"grad_norm": 2.6911558346228843,
"learning_rate": 1.3106470072201781e-06,
"loss": 0.8014,
"step": 2948
},
{
"epoch": 1.2,
"grad_norm": 3.249822819277431,
"learning_rate": 1.310239589417118e-06,
"loss": 0.8556,
"step": 2949
},
{
"epoch": 1.2,
"grad_norm": 2.560844465031747,
"learning_rate": 1.3098321146253244e-06,
"loss": 0.7973,
"step": 2950
},
{
"epoch": 1.2,
"grad_norm": 2.4343101990728635,
"learning_rate": 1.3094245829196475e-06,
"loss": 0.8555,
"step": 2951
},
{
"epoch": 1.2,
"grad_norm": 3.196300406583056,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.7391,
"step": 2952
},
{
"epoch": 1.21,
"grad_norm": 2.974949263777133,
"learning_rate": 1.3086093490660954e-06,
"loss": 0.8951,
"step": 2953
},
{
"epoch": 1.21,
"grad_norm": 2.9243304569417083,
"learning_rate": 1.308201647067973e-06,
"loss": 0.8759,
"step": 2954
},
{
"epoch": 1.21,
"grad_norm": 3.202311352889672,
"learning_rate": 1.3077938884554721e-06,
"loss": 0.8423,
"step": 2955
},
{
"epoch": 1.21,
"grad_norm": 3.1294614969409316,
"learning_rate": 1.307386073303495e-06,
"loss": 0.8013,
"step": 2956
},
{
"epoch": 1.21,
"grad_norm": 3.3764830380125708,
"learning_rate": 1.3069782016869543e-06,
"loss": 0.838,
"step": 2957
},
{
"epoch": 1.21,
"grad_norm": 3.50702215982617,
"learning_rate": 1.3065702736807722e-06,
"loss": 0.8472,
"step": 2958
},
{
"epoch": 1.21,
"grad_norm": 3.884704985624516,
"learning_rate": 1.3061622893598835e-06,
"loss": 0.7892,
"step": 2959
},
{
"epoch": 1.21,
"grad_norm": 2.443159463861826,
"learning_rate": 1.3057542487992312e-06,
"loss": 0.8611,
"step": 2960
},
{
"epoch": 1.21,
"grad_norm": 2.84658550654606,
"learning_rate": 1.305346152073769e-06,
"loss": 0.8046,
"step": 2961
},
{
"epoch": 1.21,
"grad_norm": 2.9298534061260226,
"learning_rate": 1.304937999258462e-06,
"loss": 0.7647,
"step": 2962
},
{
"epoch": 1.21,
"grad_norm": 3.1696172941609975,
"learning_rate": 1.3045297904282847e-06,
"loss": 0.822,
"step": 2963
},
{
"epoch": 1.21,
"grad_norm": 3.0577077332277907,
"learning_rate": 1.3041215256582214e-06,
"loss": 0.8017,
"step": 2964
},
{
"epoch": 1.21,
"grad_norm": 3.840756014204138,
"learning_rate": 1.3037132050232683e-06,
"loss": 0.8249,
"step": 2965
},
{
"epoch": 1.21,
"grad_norm": 3.4590459342840467,
"learning_rate": 1.30330482859843e-06,
"loss": 0.7773,
"step": 2966
},
{
"epoch": 1.21,
"grad_norm": 3.405624994546982,
"learning_rate": 1.3028963964587229e-06,
"loss": 0.7874,
"step": 2967
},
{
"epoch": 1.21,
"grad_norm": 2.4891107194948203,
"learning_rate": 1.3024879086791728e-06,
"loss": 0.8952,
"step": 2968
},
{
"epoch": 1.21,
"grad_norm": 3.3339261956436603,
"learning_rate": 1.3020793653348158e-06,
"loss": 0.8035,
"step": 2969
},
{
"epoch": 1.21,
"grad_norm": 2.816935357733003,
"learning_rate": 1.3016707665006988e-06,
"loss": 0.8505,
"step": 2970
},
{
"epoch": 1.21,
"grad_norm": 2.8311556860553933,
"learning_rate": 1.3012621122518775e-06,
"loss": 0.8509,
"step": 2971
},
{
"epoch": 1.21,
"grad_norm": 3.9148609638075773,
"learning_rate": 1.3008534026634196e-06,
"loss": 0.827,
"step": 2972
},
{
"epoch": 1.21,
"grad_norm": 4.09893042755632,
"learning_rate": 1.3004446378104012e-06,
"loss": 0.8675,
"step": 2973
},
{
"epoch": 1.21,
"grad_norm": 3.161961784762332,
"learning_rate": 1.30003581776791e-06,
"loss": 0.8381,
"step": 2974
},
{
"epoch": 1.21,
"grad_norm": 3.4325104860458175,
"learning_rate": 1.2996269426110431e-06,
"loss": 0.7755,
"step": 2975
},
{
"epoch": 1.21,
"grad_norm": 2.3686506401666363,
"learning_rate": 1.299218012414908e-06,
"loss": 0.8253,
"step": 2976
},
{
"epoch": 1.21,
"grad_norm": 3.5569081921548102,
"learning_rate": 1.2988090272546217e-06,
"loss": 0.8353,
"step": 2977
},
{
"epoch": 1.22,
"grad_norm": 4.597423964009306,
"learning_rate": 1.298399987205312e-06,
"loss": 0.8,
"step": 2978
},
{
"epoch": 1.22,
"grad_norm": 3.6883313704868597,
"learning_rate": 1.297990892342117e-06,
"loss": 0.7722,
"step": 2979
},
{
"epoch": 1.22,
"grad_norm": 2.561840095076519,
"learning_rate": 1.2975817427401839e-06,
"loss": 0.8777,
"step": 2980
},
{
"epoch": 1.22,
"grad_norm": 2.9664518956965398,
"learning_rate": 1.2971725384746707e-06,
"loss": 0.8584,
"step": 2981
},
{
"epoch": 1.22,
"grad_norm": 3.57879375092246,
"learning_rate": 1.2967632796207448e-06,
"loss": 0.8096,
"step": 2982
},
{
"epoch": 1.22,
"grad_norm": 3.753161318165699,
"learning_rate": 1.296353966253585e-06,
"loss": 0.7675,
"step": 2983
},
{
"epoch": 1.22,
"grad_norm": 3.3780941193990532,
"learning_rate": 1.295944598448378e-06,
"loss": 0.8228,
"step": 2984
},
{
"epoch": 1.22,
"grad_norm": 3.0369207058956236,
"learning_rate": 1.2955351762803224e-06,
"loss": 0.797,
"step": 2985
},
{
"epoch": 1.22,
"grad_norm": 2.6012371297085464,
"learning_rate": 1.2951256998246256e-06,
"loss": 0.8823,
"step": 2986
},
{
"epoch": 1.22,
"grad_norm": 2.7428851397917815,
"learning_rate": 1.294716169156506e-06,
"loss": 0.8306,
"step": 2987
},
{
"epoch": 1.22,
"grad_norm": 3.2289090458798717,
"learning_rate": 1.294306584351191e-06,
"loss": 0.8617,
"step": 2988
},
{
"epoch": 1.22,
"grad_norm": 3.023182407211325,
"learning_rate": 1.2938969454839184e-06,
"loss": 0.8132,
"step": 2989
},
{
"epoch": 1.22,
"grad_norm": 2.3927432536939426,
"learning_rate": 1.2934872526299356e-06,
"loss": 0.8392,
"step": 2990
},
{
"epoch": 1.22,
"grad_norm": 3.287946288054918,
"learning_rate": 1.2930775058645004e-06,
"loss": 0.8371,
"step": 2991
},
{
"epoch": 1.22,
"grad_norm": 2.488021550179535,
"learning_rate": 1.2926677052628805e-06,
"loss": 0.8628,
"step": 2992
},
{
"epoch": 1.22,
"grad_norm": 2.9814252528817633,
"learning_rate": 1.292257850900353e-06,
"loss": 0.812,
"step": 2993
},
{
"epoch": 1.22,
"grad_norm": 3.158298659002879,
"learning_rate": 1.2918479428522048e-06,
"loss": 0.7786,
"step": 2994
},
{
"epoch": 1.22,
"grad_norm": 3.6567876571328255,
"learning_rate": 1.2914379811937334e-06,
"loss": 0.7992,
"step": 2995
},
{
"epoch": 1.22,
"grad_norm": 3.2831679974644494,
"learning_rate": 1.291027966000246e-06,
"loss": 0.7897,
"step": 2996
},
{
"epoch": 1.22,
"grad_norm": 3.6124445395813614,
"learning_rate": 1.290617897347059e-06,
"loss": 0.803,
"step": 2997
},
{
"epoch": 1.22,
"grad_norm": 2.9589360512907463,
"learning_rate": 1.2902077753094992e-06,
"loss": 0.8176,
"step": 2998
},
{
"epoch": 1.22,
"grad_norm": 4.594945026788219,
"learning_rate": 1.2897975999629022e-06,
"loss": 0.8462,
"step": 2999
},
{
"epoch": 1.22,
"grad_norm": 2.6225655560023595,
"learning_rate": 1.2893873713826155e-06,
"loss": 0.8132,
"step": 3000
},
{
"epoch": 1.22,
"eval_loss": 0.8857477903366089,
"eval_runtime": 466.7819,
"eval_samples_per_second": 74.664,
"eval_steps_per_second": 4.668,
"step": 3000
},
{
"epoch": 1.22,
"grad_norm": 3.9944356124687297,
"learning_rate": 1.2889770896439944e-06,
"loss": 0.7779,
"step": 3001
},
{
"epoch": 1.23,
"grad_norm": 2.87886513875547,
"learning_rate": 1.2885667548224043e-06,
"loss": 0.7572,
"step": 3002
},
{
"epoch": 1.23,
"grad_norm": 2.6809935728554715,
"learning_rate": 1.2881563669932216e-06,
"loss": 0.8769,
"step": 3003
},
{
"epoch": 1.23,
"grad_norm": 2.3691540065914363,
"learning_rate": 1.2877459262318307e-06,
"loss": 0.796,
"step": 3004
},
{
"epoch": 1.23,
"grad_norm": 4.65002213628321,
"learning_rate": 1.287335432613627e-06,
"loss": 0.7675,
"step": 3005
},
{
"epoch": 1.23,
"grad_norm": 3.042002015301754,
"learning_rate": 1.286924886214015e-06,
"loss": 0.8541,
"step": 3006
},
{
"epoch": 1.23,
"grad_norm": 3.3096988421442086,
"learning_rate": 1.286514287108409e-06,
"loss": 0.854,
"step": 3007
},
{
"epoch": 1.23,
"grad_norm": 2.3489435752634797,
"learning_rate": 1.286103635372233e-06,
"loss": 0.8566,
"step": 3008
},
{
"epoch": 1.23,
"grad_norm": 4.030564695508961,
"learning_rate": 1.2856929310809212e-06,
"loss": 0.8262,
"step": 3009
},
{
"epoch": 1.23,
"grad_norm": 3.439848521178289,
"learning_rate": 1.2852821743099162e-06,
"loss": 0.7946,
"step": 3010
},
{
"epoch": 1.23,
"grad_norm": 3.024939395761215,
"learning_rate": 1.2848713651346719e-06,
"loss": 0.7781,
"step": 3011
},
{
"epoch": 1.23,
"grad_norm": 3.200108358391015,
"learning_rate": 1.2844605036306498e-06,
"loss": 0.8057,
"step": 3012
},
{
"epoch": 1.23,
"grad_norm": 3.1016590038695653,
"learning_rate": 1.2840495898733233e-06,
"loss": 0.8049,
"step": 3013
},
{
"epoch": 1.23,
"grad_norm": 2.420053027374433,
"learning_rate": 1.2836386239381734e-06,
"loss": 0.845,
"step": 3014
},
{
"epoch": 1.23,
"grad_norm": 4.53986195325091,
"learning_rate": 1.283227605900692e-06,
"loss": 0.8734,
"step": 3015
},
{
"epoch": 1.23,
"grad_norm": 2.7036674140672274,
"learning_rate": 1.2828165358363798e-06,
"loss": 0.8822,
"step": 3016
},
{
"epoch": 1.23,
"grad_norm": 3.6161138245991093,
"learning_rate": 1.2824054138207474e-06,
"loss": 0.8296,
"step": 3017
},
{
"epoch": 1.23,
"grad_norm": 3.076714864914665,
"learning_rate": 1.2819942399293151e-06,
"loss": 0.8373,
"step": 3018
},
{
"epoch": 1.23,
"grad_norm": 4.53130328841018,
"learning_rate": 1.2815830142376128e-06,
"loss": 0.8423,
"step": 3019
},
{
"epoch": 1.23,
"grad_norm": 3.628889410958076,
"learning_rate": 1.2811717368211782e-06,
"loss": 0.807,
"step": 3020
},
{
"epoch": 1.23,
"grad_norm": 3.353180019890441,
"learning_rate": 1.2807604077555618e-06,
"loss": 0.84,
"step": 3021
},
{
"epoch": 1.23,
"grad_norm": 2.98449127578144,
"learning_rate": 1.2803490271163208e-06,
"loss": 0.8863,
"step": 3022
},
{
"epoch": 1.23,
"grad_norm": 2.581861315488735,
"learning_rate": 1.2799375949790223e-06,
"loss": 0.8396,
"step": 3023
},
{
"epoch": 1.23,
"grad_norm": 2.73821587451845,
"learning_rate": 1.2795261114192448e-06,
"loss": 0.808,
"step": 3024
},
{
"epoch": 1.23,
"grad_norm": 2.8260290128594776,
"learning_rate": 1.2791145765125732e-06,
"loss": 0.8072,
"step": 3025
},
{
"epoch": 1.23,
"grad_norm": 2.605441581610369,
"learning_rate": 1.2787029903346049e-06,
"loss": 0.8914,
"step": 3026
},
{
"epoch": 1.24,
"grad_norm": 4.027542884793127,
"learning_rate": 1.2782913529609443e-06,
"loss": 0.8026,
"step": 3027
},
{
"epoch": 1.24,
"grad_norm": 2.298116649200731,
"learning_rate": 1.2778796644672062e-06,
"loss": 0.8031,
"step": 3028
},
{
"epoch": 1.24,
"grad_norm": 2.9960646025434934,
"learning_rate": 1.2774679249290153e-06,
"loss": 0.8844,
"step": 3029
},
{
"epoch": 1.24,
"grad_norm": 2.587324817657491,
"learning_rate": 1.2770561344220044e-06,
"loss": 0.8716,
"step": 3030
},
{
"epoch": 1.24,
"grad_norm": 3.512336402260329,
"learning_rate": 1.276644293021817e-06,
"loss": 0.8331,
"step": 3031
},
{
"epoch": 1.24,
"grad_norm": 2.7036697297426713,
"learning_rate": 1.2762324008041055e-06,
"loss": 0.779,
"step": 3032
},
{
"epoch": 1.24,
"grad_norm": 2.9903762091538826,
"learning_rate": 1.2758204578445307e-06,
"loss": 0.8288,
"step": 3033
},
{
"epoch": 1.24,
"grad_norm": 2.965570181327629,
"learning_rate": 1.2754084642187641e-06,
"loss": 0.8628,
"step": 3034
},
{
"epoch": 1.24,
"grad_norm": 3.333401470712817,
"learning_rate": 1.2749964200024856e-06,
"loss": 0.8435,
"step": 3035
},
{
"epoch": 1.24,
"grad_norm": 2.362615899688823,
"learning_rate": 1.2745843252713847e-06,
"loss": 0.8816,
"step": 3036
},
{
"epoch": 1.24,
"grad_norm": 2.979396978997175,
"learning_rate": 1.2741721801011604e-06,
"loss": 0.8277,
"step": 3037
},
{
"epoch": 1.24,
"grad_norm": 2.976068439540155,
"learning_rate": 1.2737599845675204e-06,
"loss": 0.8595,
"step": 3038
},
{
"epoch": 1.24,
"grad_norm": 4.458450508108239,
"learning_rate": 1.2733477387461826e-06,
"loss": 0.8751,
"step": 3039
},
{
"epoch": 1.24,
"grad_norm": 3.8009394849937306,
"learning_rate": 1.2729354427128728e-06,
"loss": 0.7518,
"step": 3040
},
{
"epoch": 1.24,
"grad_norm": 3.9013796983612155,
"learning_rate": 1.2725230965433276e-06,
"loss": 0.8248,
"step": 3041
},
{
"epoch": 1.24,
"grad_norm": 3.370279335319776,
"learning_rate": 1.272110700313291e-06,
"loss": 0.8221,
"step": 3042
},
{
"epoch": 1.24,
"grad_norm": 2.84946147223308,
"learning_rate": 1.2716982540985178e-06,
"loss": 0.8779,
"step": 3043
},
{
"epoch": 1.24,
"grad_norm": 2.7117592829649886,
"learning_rate": 1.271285757974771e-06,
"loss": 0.8722,
"step": 3044
},
{
"epoch": 1.24,
"grad_norm": 3.217226642319152,
"learning_rate": 1.2708732120178235e-06,
"loss": 0.8128,
"step": 3045
},
{
"epoch": 1.24,
"grad_norm": 2.818884455910834,
"learning_rate": 1.2704606163034563e-06,
"loss": 0.8371,
"step": 3046
},
{
"epoch": 1.24,
"grad_norm": 3.0405954512808124,
"learning_rate": 1.270047970907461e-06,
"loss": 0.8449,
"step": 3047
},
{
"epoch": 1.24,
"grad_norm": 3.1940549535707774,
"learning_rate": 1.269635275905637e-06,
"loss": 0.9076,
"step": 3048
},
{
"epoch": 1.24,
"grad_norm": 2.703883610095769,
"learning_rate": 1.2692225313737937e-06,
"loss": 0.859,
"step": 3049
},
{
"epoch": 1.24,
"grad_norm": 2.69530078115598,
"learning_rate": 1.268809737387749e-06,
"loss": 0.8564,
"step": 3050
},
{
"epoch": 1.25,
"grad_norm": 3.6641371544020354,
"learning_rate": 1.2683968940233298e-06,
"loss": 0.8506,
"step": 3051
},
{
"epoch": 1.25,
"grad_norm": 2.8671104998178074,
"learning_rate": 1.2679840013563729e-06,
"loss": 0.8736,
"step": 3052
},
{
"epoch": 1.25,
"grad_norm": 2.8962270876123544,
"learning_rate": 1.2675710594627238e-06,
"loss": 0.8154,
"step": 3053
},
{
"epoch": 1.25,
"grad_norm": 2.9864139189334287,
"learning_rate": 1.2671580684182361e-06,
"loss": 0.8156,
"step": 3054
},
{
"epoch": 1.25,
"grad_norm": 3.211711608230754,
"learning_rate": 1.266745028298774e-06,
"loss": 0.8306,
"step": 3055
},
{
"epoch": 1.25,
"grad_norm": 2.989199870242515,
"learning_rate": 1.2663319391802096e-06,
"loss": 0.7978,
"step": 3056
},
{
"epoch": 1.25,
"grad_norm": 2.716347799445181,
"learning_rate": 1.2659188011384246e-06,
"loss": 0.843,
"step": 3057
},
{
"epoch": 1.25,
"grad_norm": 2.6021425983206585,
"learning_rate": 1.2655056142493091e-06,
"loss": 0.8652,
"step": 3058
},
{
"epoch": 1.25,
"grad_norm": 3.1875456314634163,
"learning_rate": 1.2650923785887623e-06,
"loss": 0.7631,
"step": 3059
},
{
"epoch": 1.25,
"grad_norm": 2.710581838738762,
"learning_rate": 1.2646790942326932e-06,
"loss": 0.7554,
"step": 3060
},
{
"epoch": 1.25,
"grad_norm": 4.05963825071148,
"learning_rate": 1.2642657612570187e-06,
"loss": 0.8804,
"step": 3061
},
{
"epoch": 1.25,
"grad_norm": 2.7917203079518282,
"learning_rate": 1.2638523797376651e-06,
"loss": 0.7697,
"step": 3062
},
{
"epoch": 1.25,
"grad_norm": 2.900950937649179,
"learning_rate": 1.2634389497505675e-06,
"loss": 0.9226,
"step": 3063
},
{
"epoch": 1.25,
"grad_norm": 2.8418325401621254,
"learning_rate": 1.26302547137167e-06,
"loss": 0.7637,
"step": 3064
},
{
"epoch": 1.25,
"grad_norm": 3.1288682048861105,
"learning_rate": 1.2626119446769253e-06,
"loss": 0.815,
"step": 3065
},
{
"epoch": 1.25,
"grad_norm": 2.385339373203998,
"learning_rate": 1.2621983697422957e-06,
"loss": 0.8349,
"step": 3066
},
{
"epoch": 1.25,
"grad_norm": 3.0225853993931215,
"learning_rate": 1.2617847466437517e-06,
"loss": 0.9148,
"step": 3067
},
{
"epoch": 1.25,
"grad_norm": 3.0899023933529786,
"learning_rate": 1.2613710754572725e-06,
"loss": 0.8665,
"step": 3068
},
{
"epoch": 1.25,
"grad_norm": 2.948398648968442,
"learning_rate": 1.2609573562588469e-06,
"loss": 0.815,
"step": 3069
},
{
"epoch": 1.25,
"grad_norm": 2.9768362743311805,
"learning_rate": 1.2605435891244718e-06,
"loss": 0.7914,
"step": 3070
},
{
"epoch": 1.25,
"grad_norm": 3.5854884676976693,
"learning_rate": 1.2601297741301536e-06,
"loss": 0.752,
"step": 3071
},
{
"epoch": 1.25,
"grad_norm": 2.9552477790714247,
"learning_rate": 1.2597159113519062e-06,
"loss": 0.8164,
"step": 3072
},
{
"epoch": 1.25,
"grad_norm": 3.233543388595383,
"learning_rate": 1.259302000865754e-06,
"loss": 0.8843,
"step": 3073
},
{
"epoch": 1.25,
"grad_norm": 3.4056360562894263,
"learning_rate": 1.2588880427477289e-06,
"loss": 0.7764,
"step": 3074
},
{
"epoch": 1.25,
"grad_norm": 4.357661149445776,
"learning_rate": 1.2584740370738723e-06,
"loss": 0.8733,
"step": 3075
},
{
"epoch": 1.26,
"grad_norm": 2.4432133984690356,
"learning_rate": 1.2580599839202334e-06,
"loss": 0.7772,
"step": 3076
},
{
"epoch": 1.26,
"grad_norm": 2.9241556159993243,
"learning_rate": 1.2576458833628714e-06,
"loss": 0.8177,
"step": 3077
},
{
"epoch": 1.26,
"grad_norm": 2.7266857456224916,
"learning_rate": 1.2572317354778532e-06,
"loss": 0.7365,
"step": 3078
},
{
"epoch": 1.26,
"grad_norm": 3.032916777750237,
"learning_rate": 1.2568175403412546e-06,
"loss": 0.8333,
"step": 3079
},
{
"epoch": 1.26,
"grad_norm": 2.452379163360149,
"learning_rate": 1.2564032980291606e-06,
"loss": 0.8221,
"step": 3080
},
{
"epoch": 1.26,
"grad_norm": 2.9133587864311346,
"learning_rate": 1.2559890086176643e-06,
"loss": 0.8321,
"step": 3081
},
{
"epoch": 1.26,
"grad_norm": 2.6798791136016322,
"learning_rate": 1.2555746721828671e-06,
"loss": 0.7959,
"step": 3082
},
{
"epoch": 1.26,
"grad_norm": 2.539685306836812,
"learning_rate": 1.25516028880088e-06,
"loss": 0.845,
"step": 3083
},
{
"epoch": 1.26,
"grad_norm": 2.857886023388425,
"learning_rate": 1.2547458585478227e-06,
"loss": 0.7841,
"step": 3084
},
{
"epoch": 1.26,
"grad_norm": 3.616468961090001,
"learning_rate": 1.2543313814998223e-06,
"loss": 0.798,
"step": 3085
},
{
"epoch": 1.26,
"grad_norm": 3.50745866227185,
"learning_rate": 1.2539168577330156e-06,
"loss": 0.8835,
"step": 3086
},
{
"epoch": 1.26,
"grad_norm": 2.742868954533521,
"learning_rate": 1.2535022873235468e-06,
"loss": 0.8299,
"step": 3087
},
{
"epoch": 1.26,
"grad_norm": 3.044223437579288,
"learning_rate": 1.2530876703475703e-06,
"loss": 0.7972,
"step": 3088
},
{
"epoch": 1.26,
"grad_norm": 3.7900237867252455,
"learning_rate": 1.2526730068812478e-06,
"loss": 0.8408,
"step": 3089
},
{
"epoch": 1.26,
"grad_norm": 2.843903754470036,
"learning_rate": 1.2522582970007501e-06,
"loss": 0.8187,
"step": 3090
},
{
"epoch": 1.26,
"grad_norm": 3.0256450736079095,
"learning_rate": 1.2518435407822563e-06,
"loss": 0.8646,
"step": 3091
},
{
"epoch": 1.26,
"grad_norm": 3.5170849691194905,
"learning_rate": 1.251428738301954e-06,
"loss": 0.8291,
"step": 3092
},
{
"epoch": 1.26,
"grad_norm": 3.260032332715373,
"learning_rate": 1.2510138896360392e-06,
"loss": 0.8489,
"step": 3093
},
{
"epoch": 1.26,
"grad_norm": 2.8259769577757794,
"learning_rate": 1.2505989948607173e-06,
"loss": 0.8917,
"step": 3094
},
{
"epoch": 1.26,
"grad_norm": 2.4475245421813328,
"learning_rate": 1.2501840540522002e-06,
"loss": 0.866,
"step": 3095
},
{
"epoch": 1.26,
"grad_norm": 3.0981921360993296,
"learning_rate": 1.2497690672867107e-06,
"loss": 0.8464,
"step": 3096
},
{
"epoch": 1.26,
"grad_norm": 3.1677118090038765,
"learning_rate": 1.2493540346404778e-06,
"loss": 0.8681,
"step": 3097
},
{
"epoch": 1.26,
"grad_norm": 4.095920061121832,
"learning_rate": 1.2489389561897408e-06,
"loss": 0.865,
"step": 3098
},
{
"epoch": 1.26,
"grad_norm": 2.857562999878785,
"learning_rate": 1.2485238320107463e-06,
"loss": 0.8411,
"step": 3099
},
{
"epoch": 1.27,
"grad_norm": 2.6901257147868027,
"learning_rate": 1.248108662179749e-06,
"loss": 0.8018,
"step": 3100
},
{
"epoch": 1.27,
"eval_loss": 0.8830435872077942,
"eval_runtime": 465.6834,
"eval_samples_per_second": 74.841,
"eval_steps_per_second": 4.679,
"step": 3100
},
{
"epoch": 1.27,
"grad_norm": 3.4368180045820766,
"learning_rate": 1.2476934467730134e-06,
"loss": 0.7844,
"step": 3101
},
{
"epoch": 1.27,
"grad_norm": 3.0438300944106778,
"learning_rate": 1.247278185866811e-06,
"loss": 0.8069,
"step": 3102
},
{
"epoch": 1.27,
"grad_norm": 2.791548271609807,
"learning_rate": 1.2468628795374223e-06,
"loss": 0.8448,
"step": 3103
},
{
"epoch": 1.27,
"grad_norm": 2.5271665843728686,
"learning_rate": 1.246447527861136e-06,
"loss": 0.8709,
"step": 3104
},
{
"epoch": 1.27,
"grad_norm": 3.7346007266413506,
"learning_rate": 1.2460321309142491e-06,
"loss": 0.8549,
"step": 3105
},
{
"epoch": 1.27,
"grad_norm": 2.8212467058758346,
"learning_rate": 1.2456166887730672e-06,
"loss": 0.8462,
"step": 3106
},
{
"epoch": 1.27,
"grad_norm": 3.4778847713207,
"learning_rate": 1.245201201513904e-06,
"loss": 0.8514,
"step": 3107
},
{
"epoch": 1.27,
"grad_norm": 2.8225514938310847,
"learning_rate": 1.244785669213081e-06,
"loss": 0.8151,
"step": 3108
},
{
"epoch": 1.27,
"grad_norm": 3.401409116790121,
"learning_rate": 1.2443700919469288e-06,
"loss": 0.7738,
"step": 3109
},
{
"epoch": 1.27,
"grad_norm": 3.2754795425123864,
"learning_rate": 1.2439544697917854e-06,
"loss": 0.8061,
"step": 3110
},
{
"epoch": 1.27,
"grad_norm": 3.4722550144703215,
"learning_rate": 1.2435388028239983e-06,
"loss": 0.8164,
"step": 3111
},
{
"epoch": 1.27,
"grad_norm": 3.3693891575144495,
"learning_rate": 1.2431230911199225e-06,
"loss": 0.7764,
"step": 3112
},
{
"epoch": 1.27,
"grad_norm": 2.7032158271619213,
"learning_rate": 1.24270733475592e-06,
"loss": 0.7891,
"step": 3113
},
{
"epoch": 1.27,
"grad_norm": 3.2907993977527235,
"learning_rate": 1.2422915338083634e-06,
"loss": 0.8362,
"step": 3114
},
{
"epoch": 1.27,
"grad_norm": 2.444869082595358,
"learning_rate": 1.2418756883536318e-06,
"loss": 0.806,
"step": 3115
},
{
"epoch": 1.27,
"grad_norm": 3.245105735474775,
"learning_rate": 1.241459798468113e-06,
"loss": 0.8745,
"step": 3116
},
{
"epoch": 1.27,
"grad_norm": 2.574590609824504,
"learning_rate": 1.241043864228203e-06,
"loss": 0.8518,
"step": 3117
},
{
"epoch": 1.27,
"grad_norm": 3.6340971029621167,
"learning_rate": 1.2406278857103056e-06,
"loss": 0.7517,
"step": 3118
},
{
"epoch": 1.27,
"grad_norm": 3.023482292979013,
"learning_rate": 1.2402118629908334e-06,
"loss": 0.9023,
"step": 3119
},
{
"epoch": 1.27,
"grad_norm": 2.66294478732651,
"learning_rate": 1.2397957961462067e-06,
"loss": 0.8389,
"step": 3120
},
{
"epoch": 1.27,
"grad_norm": 2.927579077610967,
"learning_rate": 1.2393796852528531e-06,
"loss": 0.8187,
"step": 3121
},
{
"epoch": 1.27,
"grad_norm": 2.9477873606248157,
"learning_rate": 1.2389635303872103e-06,
"loss": 0.7958,
"step": 3122
},
{
"epoch": 1.27,
"grad_norm": 2.791810762920315,
"learning_rate": 1.238547331625722e-06,
"loss": 0.8347,
"step": 3123
},
{
"epoch": 1.27,
"grad_norm": 3.21556040702046,
"learning_rate": 1.2381310890448416e-06,
"loss": 0.8694,
"step": 3124
},
{
"epoch": 1.28,
"grad_norm": 2.416280900353184,
"learning_rate": 1.237714802721029e-06,
"loss": 0.8785,
"step": 3125
},
{
"epoch": 1.28,
"grad_norm": 2.644477017309431,
"learning_rate": 1.237298472730753e-06,
"loss": 0.8926,
"step": 3126
},
{
"epoch": 1.28,
"grad_norm": 2.951068557992701,
"learning_rate": 1.2368820991504916e-06,
"loss": 0.8173,
"step": 3127
},
{
"epoch": 1.28,
"grad_norm": 2.432989026169331,
"learning_rate": 1.2364656820567283e-06,
"loss": 0.8411,
"step": 3128
},
{
"epoch": 1.28,
"grad_norm": 2.877366579134279,
"learning_rate": 1.236049221525956e-06,
"loss": 0.7396,
"step": 3129
},
{
"epoch": 1.28,
"grad_norm": 2.405859998626659,
"learning_rate": 1.235632717634676e-06,
"loss": 0.892,
"step": 3130
},
{
"epoch": 1.28,
"grad_norm": 3.9272889085273155,
"learning_rate": 1.2352161704593964e-06,
"loss": 0.8355,
"step": 3131
},
{
"epoch": 1.28,
"grad_norm": 4.8002549660421305,
"learning_rate": 1.2347995800766342e-06,
"loss": 0.8896,
"step": 3132
},
{
"epoch": 1.28,
"grad_norm": 3.430124128402978,
"learning_rate": 1.2343829465629143e-06,
"loss": 0.7846,
"step": 3133
},
{
"epoch": 1.28,
"grad_norm": 3.893835850347665,
"learning_rate": 1.233966269994768e-06,
"loss": 0.808,
"step": 3134
},
{
"epoch": 1.28,
"grad_norm": 3.3718719395786185,
"learning_rate": 1.2335495504487374e-06,
"loss": 0.8302,
"step": 3135
},
{
"epoch": 1.28,
"grad_norm": 3.7222656087667993,
"learning_rate": 1.2331327880013695e-06,
"loss": 0.7465,
"step": 3136
},
{
"epoch": 1.28,
"grad_norm": 2.8990020171107127,
"learning_rate": 1.2327159827292213e-06,
"loss": 0.8387,
"step": 3137
},
{
"epoch": 1.28,
"grad_norm": 3.981284003082686,
"learning_rate": 1.2322991347088563e-06,
"loss": 0.8591,
"step": 3138
},
{
"epoch": 1.28,
"grad_norm": 3.122691915487492,
"learning_rate": 1.2318822440168468e-06,
"loss": 0.8483,
"step": 3139
},
{
"epoch": 1.28,
"grad_norm": 3.0271287573584686,
"learning_rate": 1.2314653107297724e-06,
"loss": 0.8073,
"step": 3140
},
{
"epoch": 1.28,
"grad_norm": 2.507072617343269,
"learning_rate": 1.2310483349242207e-06,
"loss": 0.7952,
"step": 3141
},
{
"epoch": 1.28,
"grad_norm": 3.520749093456947,
"learning_rate": 1.230631316676787e-06,
"loss": 0.8211,
"step": 3142
},
{
"epoch": 1.28,
"grad_norm": 3.2235574005512557,
"learning_rate": 1.2302142560640748e-06,
"loss": 0.8411,
"step": 3143
},
{
"epoch": 1.28,
"grad_norm": 2.8776282590613382,
"learning_rate": 1.2297971531626947e-06,
"loss": 0.8191,
"step": 3144
},
{
"epoch": 1.28,
"grad_norm": 2.670972461248129,
"learning_rate": 1.2293800080492655e-06,
"loss": 0.8014,
"step": 3145
},
{
"epoch": 1.28,
"grad_norm": 3.385134205803857,
"learning_rate": 1.228962820800414e-06,
"loss": 0.7897,
"step": 3146
},
{
"epoch": 1.28,
"grad_norm": 2.8211787773971717,
"learning_rate": 1.228545591492774e-06,
"loss": 0.8716,
"step": 3147
},
{
"epoch": 1.28,
"grad_norm": 3.2306318058482835,
"learning_rate": 1.2281283202029882e-06,
"loss": 0.7718,
"step": 3148
},
{
"epoch": 1.29,
"grad_norm": 2.544585309059122,
"learning_rate": 1.2277110070077052e-06,
"loss": 0.8459,
"step": 3149
},
{
"epoch": 1.29,
"grad_norm": 2.7541725564349546,
"learning_rate": 1.2272936519835832e-06,
"loss": 0.9209,
"step": 3150
},
{
"epoch": 1.29,
"grad_norm": 3.0783018089256062,
"learning_rate": 1.226876255207287e-06,
"loss": 0.7762,
"step": 3151
},
{
"epoch": 1.29,
"grad_norm": 3.0333204384397257,
"learning_rate": 1.2264588167554892e-06,
"loss": 0.7855,
"step": 3152
},
{
"epoch": 1.29,
"grad_norm": 3.0421198187662375,
"learning_rate": 1.2260413367048703e-06,
"loss": 0.7999,
"step": 3153
},
{
"epoch": 1.29,
"grad_norm": 3.314964382747852,
"learning_rate": 1.2256238151321181e-06,
"loss": 0.8652,
"step": 3154
},
{
"epoch": 1.29,
"grad_norm": 3.431793140934256,
"learning_rate": 1.2252062521139286e-06,
"loss": 0.8716,
"step": 3155
},
{
"epoch": 1.29,
"grad_norm": 3.249311085073076,
"learning_rate": 1.2247886477270051e-06,
"loss": 0.8517,
"step": 3156
},
{
"epoch": 1.29,
"grad_norm": 3.7471884109337896,
"learning_rate": 1.224371002048058e-06,
"loss": 0.8173,
"step": 3157
},
{
"epoch": 1.29,
"grad_norm": 2.8142582959917304,
"learning_rate": 1.223953315153806e-06,
"loss": 0.7806,
"step": 3158
},
{
"epoch": 1.29,
"grad_norm": 3.4039032363160495,
"learning_rate": 1.2235355871209752e-06,
"loss": 0.8318,
"step": 3159
},
{
"epoch": 1.29,
"grad_norm": 2.8176868443068814,
"learning_rate": 1.2231178180262991e-06,
"loss": 0.8562,
"step": 3160
},
{
"epoch": 1.29,
"grad_norm": 3.0010603597189576,
"learning_rate": 1.2227000079465187e-06,
"loss": 0.8432,
"step": 3161
},
{
"epoch": 1.29,
"grad_norm": 3.227444625652311,
"learning_rate": 1.2222821569583828e-06,
"loss": 0.8191,
"step": 3162
},
{
"epoch": 1.29,
"grad_norm": 3.4013344901557505,
"learning_rate": 1.2218642651386476e-06,
"loss": 0.7772,
"step": 3163
},
{
"epoch": 1.29,
"grad_norm": 4.015012866736958,
"learning_rate": 1.2214463325640767e-06,
"loss": 0.825,
"step": 3164
},
{
"epoch": 1.29,
"grad_norm": 3.0423740603678397,
"learning_rate": 1.221028359311441e-06,
"loss": 0.8747,
"step": 3165
},
{
"epoch": 1.29,
"grad_norm": 2.992193520323122,
"learning_rate": 1.2206103454575193e-06,
"loss": 0.8088,
"step": 3166
},
{
"epoch": 1.29,
"grad_norm": 3.0743912764645382,
"learning_rate": 1.220192291079098e-06,
"loss": 0.7882,
"step": 3167
},
{
"epoch": 1.29,
"grad_norm": 3.058005511916807,
"learning_rate": 1.21977419625297e-06,
"loss": 0.822,
"step": 3168
},
{
"epoch": 1.29,
"grad_norm": 3.5616426634843035,
"learning_rate": 1.2193560610559369e-06,
"loss": 0.7923,
"step": 3169
},
{
"epoch": 1.29,
"grad_norm": 3.308347980064281,
"learning_rate": 1.218937885564806e-06,
"loss": 0.7623,
"step": 3170
},
{
"epoch": 1.29,
"grad_norm": 2.9779109412911433,
"learning_rate": 1.2185196698563945e-06,
"loss": 0.823,
"step": 3171
},
{
"epoch": 1.29,
"grad_norm": 3.2077614473756517,
"learning_rate": 1.2181014140075245e-06,
"loss": 0.8662,
"step": 3172
},
{
"epoch": 1.29,
"grad_norm": 3.353783508127858,
"learning_rate": 1.217683118095027e-06,
"loss": 0.8206,
"step": 3173
},
{
"epoch": 1.3,
"grad_norm": 3.0150611118925017,
"learning_rate": 1.21726478219574e-06,
"loss": 0.9224,
"step": 3174
},
{
"epoch": 1.3,
"grad_norm": 4.452835791868146,
"learning_rate": 1.2168464063865078e-06,
"loss": 0.8506,
"step": 3175
},
{
"epoch": 1.3,
"grad_norm": 2.7368308421644603,
"learning_rate": 1.2164279907441843e-06,
"loss": 0.8713,
"step": 3176
},
{
"epoch": 1.3,
"grad_norm": 3.1981904333452764,
"learning_rate": 1.2160095353456285e-06,
"loss": 0.7696,
"step": 3177
},
{
"epoch": 1.3,
"grad_norm": 2.8585978004448243,
"learning_rate": 1.215591040267708e-06,
"loss": 0.7902,
"step": 3178
},
{
"epoch": 1.3,
"grad_norm": 3.309768698894028,
"learning_rate": 1.2151725055872968e-06,
"loss": 0.8596,
"step": 3179
},
{
"epoch": 1.3,
"grad_norm": 3.612464102644876,
"learning_rate": 1.214753931381277e-06,
"loss": 0.769,
"step": 3180
},
{
"epoch": 1.3,
"grad_norm": 2.9920583746596803,
"learning_rate": 1.2143353177265375e-06,
"loss": 0.8528,
"step": 3181
},
{
"epoch": 1.3,
"grad_norm": 2.725733092338756,
"learning_rate": 1.2139166646999747e-06,
"loss": 0.7859,
"step": 3182
},
{
"epoch": 1.3,
"grad_norm": 2.8232593093954934,
"learning_rate": 1.2134979723784918e-06,
"loss": 0.8978,
"step": 3183
},
{
"epoch": 1.3,
"grad_norm": 3.1685638172442823,
"learning_rate": 1.213079240839e-06,
"loss": 0.7437,
"step": 3184
},
{
"epoch": 1.3,
"grad_norm": 3.042388024908869,
"learning_rate": 1.2126604701584164e-06,
"loss": 0.8683,
"step": 3185
},
{
"epoch": 1.3,
"grad_norm": 2.714918902023536,
"learning_rate": 1.212241660413667e-06,
"loss": 0.8519,
"step": 3186
},
{
"epoch": 1.3,
"grad_norm": 3.0346104109757683,
"learning_rate": 1.2118228116816837e-06,
"loss": 0.8341,
"step": 3187
},
{
"epoch": 1.3,
"grad_norm": 2.438371173414357,
"learning_rate": 1.2114039240394055e-06,
"loss": 0.8045,
"step": 3188
},
{
"epoch": 1.3,
"grad_norm": 2.8062966691901,
"learning_rate": 1.2109849975637799e-06,
"loss": 0.8038,
"step": 3189
},
{
"epoch": 1.3,
"grad_norm": 3.0682556929089553,
"learning_rate": 1.2105660323317596e-06,
"loss": 0.8174,
"step": 3190
},
{
"epoch": 1.3,
"grad_norm": 3.10076519559461,
"learning_rate": 1.2101470284203062e-06,
"loss": 0.7907,
"step": 3191
},
{
"epoch": 1.3,
"grad_norm": 2.805077388546574,
"learning_rate": 1.2097279859063873e-06,
"loss": 0.802,
"step": 3192
},
{
"epoch": 1.3,
"grad_norm": 3.5907491801652847,
"learning_rate": 1.209308904866978e-06,
"loss": 0.7995,
"step": 3193
},
{
"epoch": 1.3,
"grad_norm": 2.8750759622275277,
"learning_rate": 1.2088897853790605e-06,
"loss": 0.8288,
"step": 3194
},
{
"epoch": 1.3,
"grad_norm": 4.062076493159162,
"learning_rate": 1.2084706275196238e-06,
"loss": 0.8097,
"step": 3195
},
{
"epoch": 1.3,
"grad_norm": 2.509112432200954,
"learning_rate": 1.2080514313656641e-06,
"loss": 0.8483,
"step": 3196
},
{
"epoch": 1.3,
"grad_norm": 2.585118412241551,
"learning_rate": 1.2076321969941852e-06,
"loss": 0.8524,
"step": 3197
},
{
"epoch": 1.31,
"grad_norm": 3.2962888065795926,
"learning_rate": 1.2072129244821963e-06,
"loss": 0.8118,
"step": 3198
},
{
"epoch": 1.31,
"grad_norm": 3.248620542573956,
"learning_rate": 1.2067936139067158e-06,
"loss": 0.893,
"step": 3199
},
{
"epoch": 1.31,
"grad_norm": 2.7825456890581464,
"learning_rate": 1.2063742653447673e-06,
"loss": 0.8016,
"step": 3200
},
{
"epoch": 1.31,
"eval_loss": 0.8814263939857483,
"eval_runtime": 465.0862,
"eval_samples_per_second": 74.937,
"eval_steps_per_second": 4.685,
"step": 3200
},
{
"epoch": 1.31,
"grad_norm": 2.6531020996304524,
"learning_rate": 1.205954878873382e-06,
"loss": 0.8823,
"step": 3201
},
{
"epoch": 1.31,
"grad_norm": 3.5512399624665334,
"learning_rate": 1.2055354545695985e-06,
"loss": 0.8205,
"step": 3202
},
{
"epoch": 1.31,
"grad_norm": 3.1066310402356763,
"learning_rate": 1.205115992510462e-06,
"loss": 0.8497,
"step": 3203
},
{
"epoch": 1.31,
"grad_norm": 4.1645605843479006,
"learning_rate": 1.2046964927730244e-06,
"loss": 0.8215,
"step": 3204
},
{
"epoch": 1.31,
"grad_norm": 3.2388157317629203,
"learning_rate": 1.2042769554343445e-06,
"loss": 0.863,
"step": 3205
},
{
"epoch": 1.31,
"grad_norm": 2.6609488654932245,
"learning_rate": 1.2038573805714887e-06,
"loss": 0.7941,
"step": 3206
},
{
"epoch": 1.31,
"grad_norm": 3.228981680395927,
"learning_rate": 1.2034377682615292e-06,
"loss": 0.7974,
"step": 3207
},
{
"epoch": 1.31,
"grad_norm": 3.003204232028471,
"learning_rate": 1.2030181185815464e-06,
"loss": 0.8596,
"step": 3208
},
{
"epoch": 1.31,
"grad_norm": 2.7391062134523256,
"learning_rate": 1.2025984316086265e-06,
"loss": 0.843,
"step": 3209
},
{
"epoch": 1.31,
"grad_norm": 2.927121673298461,
"learning_rate": 1.202178707419863e-06,
"loss": 0.7878,
"step": 3210
},
{
"epoch": 1.31,
"grad_norm": 3.169943499635464,
"learning_rate": 1.2017589460923558e-06,
"loss": 0.7817,
"step": 3211
},
{
"epoch": 1.31,
"grad_norm": 2.5664380747381283,
"learning_rate": 1.201339147703213e-06,
"loss": 0.833,
"step": 3212
},
{
"epoch": 1.31,
"grad_norm": 4.570945360182799,
"learning_rate": 1.200919312329547e-06,
"loss": 0.7858,
"step": 3213
},
{
"epoch": 1.31,
"grad_norm": 2.6217866591169887,
"learning_rate": 1.2004994400484794e-06,
"loss": 0.8055,
"step": 3214
},
{
"epoch": 1.31,
"grad_norm": 4.427258183592165,
"learning_rate": 1.2000795309371376e-06,
"loss": 0.9028,
"step": 3215
},
{
"epoch": 1.31,
"grad_norm": 2.7783063411298197,
"learning_rate": 1.199659585072656e-06,
"loss": 0.8358,
"step": 3216
},
{
"epoch": 1.31,
"grad_norm": 3.3734179246376086,
"learning_rate": 1.1992396025321747e-06,
"loss": 0.8158,
"step": 3217
},
{
"epoch": 1.31,
"grad_norm": 3.120696716838637,
"learning_rate": 1.1988195833928423e-06,
"loss": 0.818,
"step": 3218
},
{
"epoch": 1.31,
"grad_norm": 3.3266560965839824,
"learning_rate": 1.1983995277318129e-06,
"loss": 0.8257,
"step": 3219
},
{
"epoch": 1.31,
"grad_norm": 3.7168748351128005,
"learning_rate": 1.1979794356262475e-06,
"loss": 0.7652,
"step": 3220
},
{
"epoch": 1.31,
"grad_norm": 2.648510280762522,
"learning_rate": 1.197559307153314e-06,
"loss": 0.8181,
"step": 3221
},
{
"epoch": 1.31,
"grad_norm": 3.292865561268693,
"learning_rate": 1.1971391423901873e-06,
"loss": 0.8451,
"step": 3222
},
{
"epoch": 1.32,
"grad_norm": 2.551787834129441,
"learning_rate": 1.1967189414140485e-06,
"loss": 0.84,
"step": 3223
},
{
"epoch": 1.32,
"grad_norm": 2.4361107990551853,
"learning_rate": 1.1962987043020846e-06,
"loss": 0.8196,
"step": 3224
},
{
"epoch": 1.32,
"grad_norm": 2.550894007554913,
"learning_rate": 1.1958784311314911e-06,
"loss": 0.9056,
"step": 3225
},
{
"epoch": 1.32,
"grad_norm": 2.517856624743152,
"learning_rate": 1.1954581219794685e-06,
"loss": 0.8484,
"step": 3226
},
{
"epoch": 1.32,
"grad_norm": 3.814584784577071,
"learning_rate": 1.1950377769232248e-06,
"loss": 0.7929,
"step": 3227
},
{
"epoch": 1.32,
"grad_norm": 3.691804882628607,
"learning_rate": 1.1946173960399742e-06,
"loss": 0.8618,
"step": 3228
},
{
"epoch": 1.32,
"grad_norm": 4.1158787340428225,
"learning_rate": 1.1941969794069376e-06,
"loss": 0.8221,
"step": 3229
},
{
"epoch": 1.32,
"grad_norm": 2.8423549307816502,
"learning_rate": 1.193776527101342e-06,
"loss": 0.8184,
"step": 3230
},
{
"epoch": 1.32,
"grad_norm": 2.791490830470931,
"learning_rate": 1.1933560392004221e-06,
"loss": 0.8578,
"step": 3231
},
{
"epoch": 1.32,
"grad_norm": 2.9652361056089815,
"learning_rate": 1.1929355157814182e-06,
"loss": 0.7535,
"step": 3232
},
{
"epoch": 1.32,
"grad_norm": 3.2225600102916223,
"learning_rate": 1.1925149569215774e-06,
"loss": 0.7802,
"step": 3233
},
{
"epoch": 1.32,
"grad_norm": 2.712120740021706,
"learning_rate": 1.1920943626981528e-06,
"loss": 0.8221,
"step": 3234
},
{
"epoch": 1.32,
"grad_norm": 2.7606499035518013,
"learning_rate": 1.1916737331884052e-06,
"loss": 0.8531,
"step": 3235
},
{
"epoch": 1.32,
"grad_norm": 2.543652207872624,
"learning_rate": 1.1912530684696007e-06,
"loss": 0.7708,
"step": 3236
},
{
"epoch": 1.32,
"grad_norm": 4.294921877743959,
"learning_rate": 1.1908323686190123e-06,
"loss": 0.9028,
"step": 3237
},
{
"epoch": 1.32,
"grad_norm": 2.9458880387540427,
"learning_rate": 1.19041163371392e-06,
"loss": 0.8947,
"step": 3238
},
{
"epoch": 1.32,
"grad_norm": 3.183754160244062,
"learning_rate": 1.189990863831609e-06,
"loss": 0.7217,
"step": 3239
},
{
"epoch": 1.32,
"grad_norm": 2.808455190289506,
"learning_rate": 1.1895700590493718e-06,
"loss": 0.811,
"step": 3240
},
{
"epoch": 1.32,
"grad_norm": 2.926082834975042,
"learning_rate": 1.1891492194445073e-06,
"loss": 0.8002,
"step": 3241
},
{
"epoch": 1.32,
"grad_norm": 2.7974459274721704,
"learning_rate": 1.188728345094321e-06,
"loss": 0.8015,
"step": 3242
},
{
"epoch": 1.32,
"grad_norm": 2.847498291112307,
"learning_rate": 1.188307436076124e-06,
"loss": 0.8409,
"step": 3243
},
{
"epoch": 1.32,
"grad_norm": 2.8096288670566727,
"learning_rate": 1.1878864924672338e-06,
"loss": 0.8038,
"step": 3244
},
{
"epoch": 1.32,
"grad_norm": 4.168757345061985,
"learning_rate": 1.1874655143449755e-06,
"loss": 0.8475,
"step": 3245
},
{
"epoch": 1.32,
"grad_norm": 2.7702567332803953,
"learning_rate": 1.1870445017866793e-06,
"loss": 0.8489,
"step": 3246
},
{
"epoch": 1.33,
"grad_norm": 2.684535540713073,
"learning_rate": 1.186623454869682e-06,
"loss": 0.794,
"step": 3247
},
{
"epoch": 1.33,
"grad_norm": 3.3824711469784785,
"learning_rate": 1.186202373671327e-06,
"loss": 0.8646,
"step": 3248
},
{
"epoch": 1.33,
"grad_norm": 3.316670522433086,
"learning_rate": 1.1857812582689636e-06,
"loss": 0.7898,
"step": 3249
},
{
"epoch": 1.33,
"grad_norm": 2.472495951872045,
"learning_rate": 1.185360108739948e-06,
"loss": 0.8369,
"step": 3250
},
{
"epoch": 1.33,
"grad_norm": 4.240874500673627,
"learning_rate": 1.1849389251616418e-06,
"loss": 0.7895,
"step": 3251
},
{
"epoch": 1.33,
"grad_norm": 3.8771190155625903,
"learning_rate": 1.1845177076114136e-06,
"loss": 0.8562,
"step": 3252
},
{
"epoch": 1.33,
"grad_norm": 4.297582294680691,
"learning_rate": 1.184096456166638e-06,
"loss": 0.8244,
"step": 3253
},
{
"epoch": 1.33,
"grad_norm": 2.661532620107432,
"learning_rate": 1.1836751709046957e-06,
"loss": 0.8707,
"step": 3254
},
{
"epoch": 1.33,
"grad_norm": 3.666834239415485,
"learning_rate": 1.1832538519029737e-06,
"loss": 0.795,
"step": 3255
},
{
"epoch": 1.33,
"grad_norm": 2.7088148954043336,
"learning_rate": 1.1828324992388654e-06,
"loss": 0.8288,
"step": 3256
},
{
"epoch": 1.33,
"grad_norm": 2.824022355054101,
"learning_rate": 1.18241111298977e-06,
"loss": 0.8176,
"step": 3257
},
{
"epoch": 1.33,
"grad_norm": 2.9997774348816266,
"learning_rate": 1.1819896932330932e-06,
"loss": 0.7992,
"step": 3258
},
{
"epoch": 1.33,
"grad_norm": 2.8355639017014824,
"learning_rate": 1.1815682400462461e-06,
"loss": 0.8442,
"step": 3259
},
{
"epoch": 1.33,
"grad_norm": 2.77696772482995,
"learning_rate": 1.1811467535066476e-06,
"loss": 0.8431,
"step": 3260
},
{
"epoch": 1.33,
"grad_norm": 3.108347175472377,
"learning_rate": 1.180725233691721e-06,
"loss": 0.7972,
"step": 3261
},
{
"epoch": 1.33,
"grad_norm": 3.5524534662391156,
"learning_rate": 1.1803036806788967e-06,
"loss": 0.8225,
"step": 3262
},
{
"epoch": 1.33,
"grad_norm": 2.581774653852759,
"learning_rate": 1.17988209454561e-06,
"loss": 0.8577,
"step": 3263
},
{
"epoch": 1.33,
"grad_norm": 4.030866955445505,
"learning_rate": 1.1794604753693046e-06,
"loss": 0.8038,
"step": 3264
},
{
"epoch": 1.33,
"grad_norm": 3.929768762869766,
"learning_rate": 1.179038823227428e-06,
"loss": 0.859,
"step": 3265
},
{
"epoch": 1.33,
"grad_norm": 3.7032194391389175,
"learning_rate": 1.1786171381974349e-06,
"loss": 0.8309,
"step": 3266
},
{
"epoch": 1.33,
"grad_norm": 2.7491854523057895,
"learning_rate": 1.1781954203567852e-06,
"loss": 0.821,
"step": 3267
},
{
"epoch": 1.33,
"grad_norm": 2.7833271101765895,
"learning_rate": 1.177773669782946e-06,
"loss": 0.8353,
"step": 3268
},
{
"epoch": 1.33,
"grad_norm": 2.661090325904124,
"learning_rate": 1.1773518865533896e-06,
"loss": 0.8636,
"step": 3269
},
{
"epoch": 1.33,
"grad_norm": 2.3430329596853032,
"learning_rate": 1.1769300707455943e-06,
"loss": 0.8751,
"step": 3270
},
{
"epoch": 1.33,
"grad_norm": 2.624206460740417,
"learning_rate": 1.176508222437045e-06,
"loss": 0.894,
"step": 3271
},
{
"epoch": 1.34,
"grad_norm": 2.8619988029247727,
"learning_rate": 1.1760863417052315e-06,
"loss": 0.8556,
"step": 3272
},
{
"epoch": 1.34,
"grad_norm": 3.228528506710222,
"learning_rate": 1.1756644286276503e-06,
"loss": 0.8347,
"step": 3273
},
{
"epoch": 1.34,
"grad_norm": 3.1928862626285324,
"learning_rate": 1.1752424832818043e-06,
"loss": 0.8099,
"step": 3274
},
{
"epoch": 1.34,
"grad_norm": 2.6782852201825342,
"learning_rate": 1.1748205057452015e-06,
"loss": 0.8061,
"step": 3275
},
{
"epoch": 1.34,
"grad_norm": 2.6292449267765887,
"learning_rate": 1.1743984960953558e-06,
"loss": 0.7818,
"step": 3276
},
{
"epoch": 1.34,
"grad_norm": 2.6703099755272164,
"learning_rate": 1.1739764544097874e-06,
"loss": 0.8631,
"step": 3277
},
{
"epoch": 1.34,
"grad_norm": 3.1890187220239183,
"learning_rate": 1.1735543807660225e-06,
"loss": 0.8223,
"step": 3278
},
{
"epoch": 1.34,
"grad_norm": 2.455438238073938,
"learning_rate": 1.1731322752415926e-06,
"loss": 0.8321,
"step": 3279
},
{
"epoch": 1.34,
"grad_norm": 3.7196289819775017,
"learning_rate": 1.1727101379140356e-06,
"loss": 0.8435,
"step": 3280
},
{
"epoch": 1.34,
"grad_norm": 3.1092965548283913,
"learning_rate": 1.1722879688608948e-06,
"loss": 0.8889,
"step": 3281
},
{
"epoch": 1.34,
"grad_norm": 3.2852634648072603,
"learning_rate": 1.17186576815972e-06,
"loss": 0.7861,
"step": 3282
},
{
"epoch": 1.34,
"grad_norm": 2.6780581937772174,
"learning_rate": 1.171443535888066e-06,
"loss": 0.8339,
"step": 3283
},
{
"epoch": 1.34,
"grad_norm": 2.4427135772084525,
"learning_rate": 1.1710212721234936e-06,
"loss": 0.8185,
"step": 3284
},
{
"epoch": 1.34,
"grad_norm": 3.600889070561899,
"learning_rate": 1.17059897694357e-06,
"loss": 0.8794,
"step": 3285
},
{
"epoch": 1.34,
"grad_norm": 2.964503492526636,
"learning_rate": 1.1701766504258675e-06,
"loss": 0.8457,
"step": 3286
},
{
"epoch": 1.34,
"grad_norm": 2.6535868339651145,
"learning_rate": 1.1697542926479643e-06,
"loss": 0.7773,
"step": 3287
},
{
"epoch": 1.34,
"grad_norm": 2.6909990458615427,
"learning_rate": 1.1693319036874447e-06,
"loss": 0.8636,
"step": 3288
},
{
"epoch": 1.34,
"grad_norm": 3.224908025229928,
"learning_rate": 1.1689094836218979e-06,
"loss": 0.7862,
"step": 3289
},
{
"epoch": 1.34,
"grad_norm": 3.309039823018537,
"learning_rate": 1.1684870325289203e-06,
"loss": 0.8463,
"step": 3290
},
{
"epoch": 1.34,
"grad_norm": 2.705112343239646,
"learning_rate": 1.168064550486112e-06,
"loss": 0.8709,
"step": 3291
},
{
"epoch": 1.34,
"grad_norm": 3.298798458120192,
"learning_rate": 1.1676420375710804e-06,
"loss": 0.8375,
"step": 3292
},
{
"epoch": 1.34,
"grad_norm": 2.5787233326938197,
"learning_rate": 1.1672194938614381e-06,
"loss": 0.9089,
"step": 3293
},
{
"epoch": 1.34,
"grad_norm": 3.460406484971266,
"learning_rate": 1.1667969194348031e-06,
"loss": 0.786,
"step": 3294
},
{
"epoch": 1.34,
"grad_norm": 2.8740886967540216,
"learning_rate": 1.1663743143687992e-06,
"loss": 0.8787,
"step": 3295
},
{
"epoch": 1.35,
"grad_norm": 3.061117941674982,
"learning_rate": 1.165951678741056e-06,
"loss": 0.7957,
"step": 3296
},
{
"epoch": 1.35,
"grad_norm": 2.798509445852098,
"learning_rate": 1.1655290126292084e-06,
"loss": 0.8915,
"step": 3297
},
{
"epoch": 1.35,
"grad_norm": 3.346825714817228,
"learning_rate": 1.165106316110897e-06,
"loss": 0.8108,
"step": 3298
},
{
"epoch": 1.35,
"grad_norm": 3.8561323256897815,
"learning_rate": 1.1646835892637683e-06,
"loss": 0.7813,
"step": 3299
},
{
"epoch": 1.35,
"grad_norm": 3.4907945574434014,
"learning_rate": 1.164260832165474e-06,
"loss": 0.8317,
"step": 3300
},
{
"epoch": 1.35,
"eval_loss": 0.881358802318573,
"eval_runtime": 466.2452,
"eval_samples_per_second": 74.75,
"eval_steps_per_second": 4.674,
"step": 3300
},
{
"epoch": 1.35,
"grad_norm": 2.6243205779917123,
"learning_rate": 1.1638380448936716e-06,
"loss": 0.8283,
"step": 3301
},
{
"epoch": 1.35,
"grad_norm": 2.88128231667405,
"learning_rate": 1.1634152275260236e-06,
"loss": 0.8074,
"step": 3302
},
{
"epoch": 1.35,
"grad_norm": 2.763116991948242,
"learning_rate": 1.1629923801401988e-06,
"loss": 0.8621,
"step": 3303
},
{
"epoch": 1.35,
"grad_norm": 3.138969388532484,
"learning_rate": 1.162569502813871e-06,
"loss": 0.8637,
"step": 3304
},
{
"epoch": 1.35,
"grad_norm": 3.1096712041661845,
"learning_rate": 1.1621465956247198e-06,
"loss": 0.7599,
"step": 3305
},
{
"epoch": 1.35,
"grad_norm": 2.6642670416480843,
"learning_rate": 1.16172365865043e-06,
"loss": 0.8331,
"step": 3306
},
{
"epoch": 1.35,
"grad_norm": 3.289442233464763,
"learning_rate": 1.1613006919686922e-06,
"loss": 0.7706,
"step": 3307
},
{
"epoch": 1.35,
"grad_norm": 3.147885866656986,
"learning_rate": 1.1608776956572021e-06,
"loss": 0.746,
"step": 3308
},
{
"epoch": 1.35,
"grad_norm": 2.93335711496386,
"learning_rate": 1.1604546697936611e-06,
"loss": 0.9006,
"step": 3309
},
{
"epoch": 1.35,
"grad_norm": 3.3562733324757943,
"learning_rate": 1.1600316144557757e-06,
"loss": 0.7849,
"step": 3310
},
{
"epoch": 1.35,
"grad_norm": 3.280480161923998,
"learning_rate": 1.1596085297212587e-06,
"loss": 0.8158,
"step": 3311
},
{
"epoch": 1.35,
"grad_norm": 2.4438792831590206,
"learning_rate": 1.1591854156678268e-06,
"loss": 0.8913,
"step": 3312
},
{
"epoch": 1.35,
"grad_norm": 3.858314318382528,
"learning_rate": 1.158762272373204e-06,
"loss": 0.7596,
"step": 3313
},
{
"epoch": 1.35,
"grad_norm": 2.9768597991379906,
"learning_rate": 1.1583390999151177e-06,
"loss": 0.8136,
"step": 3314
},
{
"epoch": 1.35,
"grad_norm": 2.4383794753236994,
"learning_rate": 1.1579158983713016e-06,
"loss": 0.8572,
"step": 3315
},
{
"epoch": 1.35,
"grad_norm": 3.089065671054522,
"learning_rate": 1.1574926678194952e-06,
"loss": 0.7867,
"step": 3316
},
{
"epoch": 1.35,
"grad_norm": 3.2914759335697927,
"learning_rate": 1.1570694083374426e-06,
"loss": 0.8773,
"step": 3317
},
{
"epoch": 1.35,
"grad_norm": 2.7540467709428844,
"learning_rate": 1.1566461200028933e-06,
"loss": 0.8817,
"step": 3318
},
{
"epoch": 1.35,
"grad_norm": 2.9699493143708904,
"learning_rate": 1.1562228028936028e-06,
"loss": 0.8108,
"step": 3319
},
{
"epoch": 1.35,
"grad_norm": 2.755890670774448,
"learning_rate": 1.1557994570873306e-06,
"loss": 0.8934,
"step": 3320
},
{
"epoch": 1.36,
"grad_norm": 3.43875144381465,
"learning_rate": 1.1553760826618428e-06,
"loss": 0.8217,
"step": 3321
},
{
"epoch": 1.36,
"grad_norm": 3.2870626459123224,
"learning_rate": 1.15495267969491e-06,
"loss": 0.8302,
"step": 3322
},
{
"epoch": 1.36,
"grad_norm": 3.615618744618986,
"learning_rate": 1.1545292482643083e-06,
"loss": 0.9165,
"step": 3323
},
{
"epoch": 1.36,
"grad_norm": 2.6179498585666,
"learning_rate": 1.1541057884478186e-06,
"loss": 0.805,
"step": 3324
},
{
"epoch": 1.36,
"grad_norm": 2.5172615766354314,
"learning_rate": 1.1536823003232276e-06,
"loss": 0.8407,
"step": 3325
},
{
"epoch": 1.36,
"grad_norm": 3.4823483838595366,
"learning_rate": 1.1532587839683271e-06,
"loss": 0.8364,
"step": 3326
},
{
"epoch": 1.36,
"grad_norm": 2.5315047770302557,
"learning_rate": 1.1528352394609139e-06,
"loss": 0.8295,
"step": 3327
},
{
"epoch": 1.36,
"grad_norm": 2.5181981163841742,
"learning_rate": 1.1524116668787896e-06,
"loss": 0.8189,
"step": 3328
},
{
"epoch": 1.36,
"grad_norm": 3.42159916098907,
"learning_rate": 1.1519880662997618e-06,
"loss": 0.7877,
"step": 3329
},
{
"epoch": 1.36,
"grad_norm": 2.426052807524248,
"learning_rate": 1.1515644378016428e-06,
"loss": 0.7762,
"step": 3330
},
{
"epoch": 1.36,
"grad_norm": 3.2409056302548747,
"learning_rate": 1.1511407814622495e-06,
"loss": 0.8598,
"step": 3331
},
{
"epoch": 1.36,
"grad_norm": 2.9470955310280043,
"learning_rate": 1.1507170973594053e-06,
"loss": 0.827,
"step": 3332
},
{
"epoch": 1.36,
"grad_norm": 3.2917775508416156,
"learning_rate": 1.1502933855709373e-06,
"loss": 0.7568,
"step": 3333
},
{
"epoch": 1.36,
"grad_norm": 3.1721369914458393,
"learning_rate": 1.1498696461746783e-06,
"loss": 0.7713,
"step": 3334
},
{
"epoch": 1.36,
"grad_norm": 3.71565366316924,
"learning_rate": 1.1494458792484666e-06,
"loss": 0.8536,
"step": 3335
},
{
"epoch": 1.36,
"grad_norm": 2.5584631939972056,
"learning_rate": 1.1490220848701444e-06,
"loss": 0.8503,
"step": 3336
},
{
"epoch": 1.36,
"grad_norm": 2.929244860671105,
"learning_rate": 1.14859826311756e-06,
"loss": 0.8266,
"step": 3337
},
{
"epoch": 1.36,
"grad_norm": 2.840325629889333,
"learning_rate": 1.1481744140685658e-06,
"loss": 0.8368,
"step": 3338
},
{
"epoch": 1.36,
"grad_norm": 2.7197333701597652,
"learning_rate": 1.1477505378010207e-06,
"loss": 0.8132,
"step": 3339
},
{
"epoch": 1.36,
"grad_norm": 3.1424518798787866,
"learning_rate": 1.1473266343927872e-06,
"loss": 0.8427,
"step": 3340
},
{
"epoch": 1.36,
"grad_norm": 3.1616712378139966,
"learning_rate": 1.146902703921733e-06,
"loss": 0.8461,
"step": 3341
},
{
"epoch": 1.36,
"grad_norm": 2.6523801914345757,
"learning_rate": 1.146478746465731e-06,
"loss": 0.7755,
"step": 3342
},
{
"epoch": 1.36,
"grad_norm": 3.1374251302604232,
"learning_rate": 1.1460547621026595e-06,
"loss": 0.8249,
"step": 3343
},
{
"epoch": 1.36,
"grad_norm": 3.5144110873159136,
"learning_rate": 1.145630750910401e-06,
"loss": 0.7702,
"step": 3344
},
{
"epoch": 1.37,
"grad_norm": 3.387653391344006,
"learning_rate": 1.1452067129668435e-06,
"loss": 0.7798,
"step": 3345
},
{
"epoch": 1.37,
"grad_norm": 2.736515900222986,
"learning_rate": 1.1447826483498792e-06,
"loss": 0.8534,
"step": 3346
},
{
"epoch": 1.37,
"grad_norm": 3.6378730308889775,
"learning_rate": 1.1443585571374062e-06,
"loss": 0.7999,
"step": 3347
},
{
"epoch": 1.37,
"grad_norm": 2.512665357544802,
"learning_rate": 1.1439344394073266e-06,
"loss": 0.8082,
"step": 3348
},
{
"epoch": 1.37,
"grad_norm": 2.58590835792854,
"learning_rate": 1.1435102952375475e-06,
"loss": 0.8688,
"step": 3349
},
{
"epoch": 1.37,
"grad_norm": 3.5485158254730913,
"learning_rate": 1.1430861247059818e-06,
"loss": 0.797,
"step": 3350
},
{
"epoch": 1.37,
"grad_norm": 2.540431267522002,
"learning_rate": 1.1426619278905456e-06,
"loss": 0.802,
"step": 3351
},
{
"epoch": 1.37,
"grad_norm": 3.4024829069058207,
"learning_rate": 1.142237704869162e-06,
"loss": 0.7905,
"step": 3352
},
{
"epoch": 1.37,
"grad_norm": 3.0454030494538196,
"learning_rate": 1.141813455719756e-06,
"loss": 0.8035,
"step": 3353
},
{
"epoch": 1.37,
"grad_norm": 3.483946191719869,
"learning_rate": 1.1413891805202603e-06,
"loss": 0.8785,
"step": 3354
},
{
"epoch": 1.37,
"grad_norm": 2.95445904778257,
"learning_rate": 1.1409648793486105e-06,
"loss": 0.8139,
"step": 3355
},
{
"epoch": 1.37,
"grad_norm": 3.353553197829011,
"learning_rate": 1.1405405522827483e-06,
"loss": 0.8151,
"step": 3356
},
{
"epoch": 1.37,
"grad_norm": 3.771088496773099,
"learning_rate": 1.1401161994006187e-06,
"loss": 0.8157,
"step": 3357
},
{
"epoch": 1.37,
"grad_norm": 3.8774763229866003,
"learning_rate": 1.1396918207801727e-06,
"loss": 0.8068,
"step": 3358
},
{
"epoch": 1.37,
"grad_norm": 2.885937802843294,
"learning_rate": 1.139267416499365e-06,
"loss": 0.8373,
"step": 3359
},
{
"epoch": 1.37,
"grad_norm": 3.810758262526119,
"learning_rate": 1.1388429866361566e-06,
"loss": 0.8143,
"step": 3360
},
{
"epoch": 1.37,
"grad_norm": 2.7677637848175745,
"learning_rate": 1.1384185312685109e-06,
"loss": 0.867,
"step": 3361
},
{
"epoch": 1.37,
"grad_norm": 3.6318127014624784,
"learning_rate": 1.137994050474398e-06,
"loss": 0.8357,
"step": 3362
},
{
"epoch": 1.37,
"grad_norm": 3.1121195544282108,
"learning_rate": 1.1375695443317919e-06,
"loss": 0.7616,
"step": 3363
},
{
"epoch": 1.37,
"grad_norm": 3.7956302457853233,
"learning_rate": 1.1371450129186704e-06,
"loss": 0.804,
"step": 3364
},
{
"epoch": 1.37,
"grad_norm": 3.285253066114744,
"learning_rate": 1.1367204563130182e-06,
"loss": 0.9045,
"step": 3365
},
{
"epoch": 1.37,
"grad_norm": 2.522895991798763,
"learning_rate": 1.136295874592822e-06,
"loss": 0.8126,
"step": 3366
},
{
"epoch": 1.37,
"grad_norm": 2.9227596535947433,
"learning_rate": 1.1358712678360752e-06,
"loss": 0.883,
"step": 3367
},
{
"epoch": 1.37,
"grad_norm": 3.15645287581586,
"learning_rate": 1.1354466361207741e-06,
"loss": 0.8892,
"step": 3368
},
{
"epoch": 1.37,
"grad_norm": 3.80114995163682,
"learning_rate": 1.1350219795249215e-06,
"loss": 0.7266,
"step": 3369
},
{
"epoch": 1.38,
"grad_norm": 2.614447507085457,
"learning_rate": 1.1345972981265226e-06,
"loss": 0.8095,
"step": 3370
},
{
"epoch": 1.38,
"grad_norm": 3.2538119688056923,
"learning_rate": 1.1341725920035889e-06,
"loss": 0.7674,
"step": 3371
},
{
"epoch": 1.38,
"grad_norm": 2.9666960304676424,
"learning_rate": 1.1337478612341357e-06,
"loss": 0.786,
"step": 3372
},
{
"epoch": 1.38,
"grad_norm": 2.7542198117871317,
"learning_rate": 1.1333231058961832e-06,
"loss": 0.8421,
"step": 3373
},
{
"epoch": 1.38,
"grad_norm": 2.9538095679255094,
"learning_rate": 1.1328983260677549e-06,
"loss": 0.802,
"step": 3374
},
{
"epoch": 1.38,
"grad_norm": 2.374806742470904,
"learning_rate": 1.132473521826881e-06,
"loss": 0.8904,
"step": 3375
},
{
"epoch": 1.38,
"grad_norm": 3.4626000868139855,
"learning_rate": 1.1320486932515936e-06,
"loss": 0.8254,
"step": 3376
},
{
"epoch": 1.38,
"grad_norm": 3.352538311431271,
"learning_rate": 1.1316238404199314e-06,
"loss": 0.8149,
"step": 3377
},
{
"epoch": 1.38,
"grad_norm": 3.77397616424856,
"learning_rate": 1.1311989634099369e-06,
"loss": 0.7888,
"step": 3378
},
{
"epoch": 1.38,
"grad_norm": 2.3220194257866633,
"learning_rate": 1.1307740622996563e-06,
"loss": 0.8421,
"step": 3379
},
{
"epoch": 1.38,
"grad_norm": 3.106429064747807,
"learning_rate": 1.1303491371671413e-06,
"loss": 0.8149,
"step": 3380
},
{
"epoch": 1.38,
"grad_norm": 2.572896242330744,
"learning_rate": 1.129924188090447e-06,
"loss": 0.8013,
"step": 3381
},
{
"epoch": 1.38,
"grad_norm": 3.130055229737645,
"learning_rate": 1.1294992151476342e-06,
"loss": 0.8852,
"step": 3382
},
{
"epoch": 1.38,
"grad_norm": 3.058794955436061,
"learning_rate": 1.1290742184167665e-06,
"loss": 0.9009,
"step": 3383
},
{
"epoch": 1.38,
"grad_norm": 3.4084830339097967,
"learning_rate": 1.128649197975913e-06,
"loss": 0.7536,
"step": 3384
},
{
"epoch": 1.38,
"grad_norm": 3.2391372716206708,
"learning_rate": 1.1282241539031469e-06,
"loss": 0.7554,
"step": 3385
},
{
"epoch": 1.38,
"grad_norm": 2.645144389065307,
"learning_rate": 1.127799086276546e-06,
"loss": 0.8115,
"step": 3386
},
{
"epoch": 1.38,
"grad_norm": 3.7017292148355914,
"learning_rate": 1.1273739951741913e-06,
"loss": 0.7985,
"step": 3387
},
{
"epoch": 1.38,
"grad_norm": 3.1578714468315314,
"learning_rate": 1.1269488806741696e-06,
"loss": 0.7953,
"step": 3388
},
{
"epoch": 1.38,
"grad_norm": 2.797331214282399,
"learning_rate": 1.1265237428545708e-06,
"loss": 0.8647,
"step": 3389
},
{
"epoch": 1.38,
"grad_norm": 2.890922934353078,
"learning_rate": 1.12609858179349e-06,
"loss": 0.8249,
"step": 3390
},
{
"epoch": 1.38,
"grad_norm": 2.822971620850644,
"learning_rate": 1.1256733975690262e-06,
"loss": 0.8181,
"step": 3391
},
{
"epoch": 1.38,
"grad_norm": 3.3239420590209763,
"learning_rate": 1.125248190259282e-06,
"loss": 0.8339,
"step": 3392
},
{
"epoch": 1.38,
"grad_norm": 3.2320408461123624,
"learning_rate": 1.1248229599423658e-06,
"loss": 0.8306,
"step": 3393
},
{
"epoch": 1.39,
"grad_norm": 3.0646006487636615,
"learning_rate": 1.1243977066963883e-06,
"loss": 0.8502,
"step": 3394
},
{
"epoch": 1.39,
"grad_norm": 2.7864469284588265,
"learning_rate": 1.1239724305994663e-06,
"loss": 0.8451,
"step": 3395
},
{
"epoch": 1.39,
"grad_norm": 2.502163189005791,
"learning_rate": 1.1235471317297192e-06,
"loss": 0.8368,
"step": 3396
},
{
"epoch": 1.39,
"grad_norm": 2.5160831599216262,
"learning_rate": 1.1231218101652716e-06,
"loss": 0.7928,
"step": 3397
},
{
"epoch": 1.39,
"grad_norm": 2.691000698868667,
"learning_rate": 1.122696465984252e-06,
"loss": 0.7956,
"step": 3398
},
{
"epoch": 1.39,
"grad_norm": 2.6081018619048564,
"learning_rate": 1.1222710992647928e-06,
"loss": 0.8588,
"step": 3399
},
{
"epoch": 1.39,
"grad_norm": 3.1926964402299056,
"learning_rate": 1.1218457100850307e-06,
"loss": 0.8184,
"step": 3400
},
{
"epoch": 1.39,
"eval_loss": 0.8800104260444641,
"eval_runtime": 467.0279,
"eval_samples_per_second": 74.625,
"eval_steps_per_second": 4.666,
"step": 3400
},
{
"epoch": 1.39,
"grad_norm": 3.1616887183154003,
"learning_rate": 1.1214202985231071e-06,
"loss": 0.7782,
"step": 3401
},
{
"epoch": 1.39,
"grad_norm": 5.01134820345729,
"learning_rate": 1.1209948646571662e-06,
"loss": 0.7743,
"step": 3402
},
{
"epoch": 1.39,
"grad_norm": 3.0295591686397785,
"learning_rate": 1.1205694085653575e-06,
"loss": 0.8495,
"step": 3403
},
{
"epoch": 1.39,
"grad_norm": 2.5609393802400713,
"learning_rate": 1.1201439303258345e-06,
"loss": 0.8793,
"step": 3404
},
{
"epoch": 1.39,
"grad_norm": 2.8095706968472283,
"learning_rate": 1.1197184300167537e-06,
"loss": 0.858,
"step": 3405
},
{
"epoch": 1.39,
"grad_norm": 3.144893296305728,
"learning_rate": 1.119292907716277e-06,
"loss": 0.782,
"step": 3406
},
{
"epoch": 1.39,
"grad_norm": 2.913250108526724,
"learning_rate": 1.118867363502569e-06,
"loss": 0.8879,
"step": 3407
},
{
"epoch": 1.39,
"grad_norm": 2.9663699156758825,
"learning_rate": 1.1184417974537999e-06,
"loss": 0.7971,
"step": 3408
},
{
"epoch": 1.39,
"grad_norm": 3.32992250447712,
"learning_rate": 1.1180162096481424e-06,
"loss": 0.7816,
"step": 3409
},
{
"epoch": 1.39,
"grad_norm": 2.5868988461996483,
"learning_rate": 1.1175906001637743e-06,
"loss": 0.8262,
"step": 3410
},
{
"epoch": 1.39,
"grad_norm": 2.5832943480164343,
"learning_rate": 1.1171649690788765e-06,
"loss": 0.8132,
"step": 3411
},
{
"epoch": 1.39,
"grad_norm": 3.70908750952905,
"learning_rate": 1.116739316471635e-06,
"loss": 0.8136,
"step": 3412
},
{
"epoch": 1.39,
"grad_norm": 3.1237867338786596,
"learning_rate": 1.1163136424202382e-06,
"loss": 0.8685,
"step": 3413
},
{
"epoch": 1.39,
"grad_norm": 2.9457725797451744,
"learning_rate": 1.11588794700288e-06,
"loss": 0.7828,
"step": 3414
},
{
"epoch": 1.39,
"grad_norm": 3.491566035041787,
"learning_rate": 1.115462230297757e-06,
"loss": 0.8078,
"step": 3415
},
{
"epoch": 1.39,
"grad_norm": 2.4717416701798394,
"learning_rate": 1.1150364923830704e-06,
"loss": 0.877,
"step": 3416
},
{
"epoch": 1.39,
"grad_norm": 3.0495837083636035,
"learning_rate": 1.114610733337025e-06,
"loss": 0.8239,
"step": 3417
},
{
"epoch": 1.39,
"grad_norm": 2.5636284558867235,
"learning_rate": 1.1141849532378303e-06,
"loss": 0.7994,
"step": 3418
},
{
"epoch": 1.4,
"grad_norm": 2.646530279722254,
"learning_rate": 1.113759152163698e-06,
"loss": 0.841,
"step": 3419
},
{
"epoch": 1.4,
"grad_norm": 3.0760379487680365,
"learning_rate": 1.1133333301928457e-06,
"loss": 0.8963,
"step": 3420
},
{
"epoch": 1.4,
"grad_norm": 2.7655869404387237,
"learning_rate": 1.1129074874034925e-06,
"loss": 0.7484,
"step": 3421
},
{
"epoch": 1.4,
"grad_norm": 2.4575123954973392,
"learning_rate": 1.1124816238738636e-06,
"loss": 0.8187,
"step": 3422
},
{
"epoch": 1.4,
"grad_norm": 2.899364758709025,
"learning_rate": 1.1120557396821865e-06,
"loss": 0.8145,
"step": 3423
},
{
"epoch": 1.4,
"grad_norm": 3.437995662558021,
"learning_rate": 1.1116298349066932e-06,
"loss": 0.8293,
"step": 3424
},
{
"epoch": 1.4,
"grad_norm": 2.9845020359977648,
"learning_rate": 1.1112039096256195e-06,
"loss": 0.8474,
"step": 3425
},
{
"epoch": 1.4,
"grad_norm": 2.974261977576689,
"learning_rate": 1.1107779639172041e-06,
"loss": 0.8449,
"step": 3426
},
{
"epoch": 1.4,
"grad_norm": 2.4012921481890697,
"learning_rate": 1.1103519978596908e-06,
"loss": 0.8236,
"step": 3427
},
{
"epoch": 1.4,
"grad_norm": 2.979491288172014,
"learning_rate": 1.1099260115313257e-06,
"loss": 0.8354,
"step": 3428
},
{
"epoch": 1.4,
"grad_norm": 3.0238540336440294,
"learning_rate": 1.1095000050103605e-06,
"loss": 0.8359,
"step": 3429
},
{
"epoch": 1.4,
"grad_norm": 3.057187645330933,
"learning_rate": 1.109073978375048e-06,
"loss": 0.8783,
"step": 3430
},
{
"epoch": 1.4,
"grad_norm": 3.7851569577829514,
"learning_rate": 1.1086479317036472e-06,
"loss": 0.7793,
"step": 3431
},
{
"epoch": 1.4,
"grad_norm": 2.7614872438427023,
"learning_rate": 1.1082218650744195e-06,
"loss": 0.8782,
"step": 3432
},
{
"epoch": 1.4,
"grad_norm": 2.8679869615246543,
"learning_rate": 1.10779577856563e-06,
"loss": 0.8155,
"step": 3433
},
{
"epoch": 1.4,
"grad_norm": 2.8990114625698125,
"learning_rate": 1.1073696722555478e-06,
"loss": 0.8215,
"step": 3434
},
{
"epoch": 1.4,
"grad_norm": 3.754359015475814,
"learning_rate": 1.1069435462224456e-06,
"loss": 0.8061,
"step": 3435
},
{
"epoch": 1.4,
"grad_norm": 2.923073709533283,
"learning_rate": 1.1065174005445995e-06,
"loss": 0.8284,
"step": 3436
},
{
"epoch": 1.4,
"grad_norm": 3.154210255898892,
"learning_rate": 1.1060912353002897e-06,
"loss": 0.865,
"step": 3437
},
{
"epoch": 1.4,
"grad_norm": 3.5618786981283272,
"learning_rate": 1.1056650505677991e-06,
"loss": 0.8509,
"step": 3438
},
{
"epoch": 1.4,
"grad_norm": 2.69051166048299,
"learning_rate": 1.1052388464254148e-06,
"loss": 0.8518,
"step": 3439
},
{
"epoch": 1.4,
"grad_norm": 2.8124972572134195,
"learning_rate": 1.1048126229514277e-06,
"loss": 0.7839,
"step": 3440
},
{
"epoch": 1.4,
"grad_norm": 2.7136269826815087,
"learning_rate": 1.1043863802241312e-06,
"loss": 0.813,
"step": 3441
},
{
"epoch": 1.4,
"grad_norm": 3.566337627873776,
"learning_rate": 1.103960118321824e-06,
"loss": 0.7586,
"step": 3442
},
{
"epoch": 1.41,
"grad_norm": 3.092225246696403,
"learning_rate": 1.1035338373228065e-06,
"loss": 0.7889,
"step": 3443
},
{
"epoch": 1.41,
"grad_norm": 3.2549326756467747,
"learning_rate": 1.1031075373053835e-06,
"loss": 0.8812,
"step": 3444
},
{
"epoch": 1.41,
"grad_norm": 2.4632250433147096,
"learning_rate": 1.1026812183478634e-06,
"loss": 0.7481,
"step": 3445
},
{
"epoch": 1.41,
"grad_norm": 2.6793562732279907,
"learning_rate": 1.1022548805285577e-06,
"loss": 0.8266,
"step": 3446
},
{
"epoch": 1.41,
"grad_norm": 3.3939279219722347,
"learning_rate": 1.1018285239257816e-06,
"loss": 0.7735,
"step": 3447
},
{
"epoch": 1.41,
"grad_norm": 2.8876332443752086,
"learning_rate": 1.101402148617854e-06,
"loss": 0.8068,
"step": 3448
},
{
"epoch": 1.41,
"grad_norm": 3.425334061729834,
"learning_rate": 1.100975754683096e-06,
"loss": 0.7967,
"step": 3449
},
{
"epoch": 1.41,
"grad_norm": 3.150225505448449,
"learning_rate": 1.100549342199834e-06,
"loss": 0.8863,
"step": 3450
},
{
"epoch": 1.41,
"grad_norm": 4.441028698074484,
"learning_rate": 1.1001229112463966e-06,
"loss": 0.7921,
"step": 3451
},
{
"epoch": 1.41,
"grad_norm": 3.875206425218492,
"learning_rate": 1.0996964619011154e-06,
"loss": 0.8165,
"step": 3452
},
{
"epoch": 1.41,
"grad_norm": 2.640659267535285,
"learning_rate": 1.0992699942423268e-06,
"loss": 0.8517,
"step": 3453
},
{
"epoch": 1.41,
"grad_norm": 4.7234380697689184,
"learning_rate": 1.0988435083483692e-06,
"loss": 0.8266,
"step": 3454
},
{
"epoch": 1.41,
"grad_norm": 3.8616111644389113,
"learning_rate": 1.0984170042975855e-06,
"loss": 0.8031,
"step": 3455
},
{
"epoch": 1.41,
"grad_norm": 2.886875417978041,
"learning_rate": 1.097990482168321e-06,
"loss": 0.8307,
"step": 3456
},
{
"epoch": 1.41,
"grad_norm": 3.063917014527807,
"learning_rate": 1.0975639420389247e-06,
"loss": 0.7831,
"step": 3457
},
{
"epoch": 1.41,
"grad_norm": 2.9189417107933893,
"learning_rate": 1.0971373839877489e-06,
"loss": 0.8503,
"step": 3458
},
{
"epoch": 1.41,
"grad_norm": 2.9603780114076748,
"learning_rate": 1.0967108080931493e-06,
"loss": 0.8536,
"step": 3459
},
{
"epoch": 1.41,
"grad_norm": 3.7591139262360462,
"learning_rate": 1.0962842144334845e-06,
"loss": 0.8007,
"step": 3460
},
{
"epoch": 1.41,
"grad_norm": 2.3469249617915615,
"learning_rate": 1.0958576030871171e-06,
"loss": 0.85,
"step": 3461
},
{
"epoch": 1.41,
"grad_norm": 3.362470189562642,
"learning_rate": 1.0954309741324116e-06,
"loss": 0.8158,
"step": 3462
},
{
"epoch": 1.41,
"grad_norm": 2.968548153883882,
"learning_rate": 1.0950043276477378e-06,
"loss": 0.7687,
"step": 3463
},
{
"epoch": 1.41,
"grad_norm": 2.7197367112142725,
"learning_rate": 1.0945776637114669e-06,
"loss": 0.8048,
"step": 3464
},
{
"epoch": 1.41,
"grad_norm": 2.7979220735330266,
"learning_rate": 1.0941509824019731e-06,
"loss": 0.8039,
"step": 3465
},
{
"epoch": 1.41,
"grad_norm": 2.789776618870798,
"learning_rate": 1.0937242837976364e-06,
"loss": 0.8696,
"step": 3466
},
{
"epoch": 1.41,
"grad_norm": 2.7719844306742227,
"learning_rate": 1.0932975679768364e-06,
"loss": 0.84,
"step": 3467
},
{
"epoch": 1.42,
"grad_norm": 3.0721397734503855,
"learning_rate": 1.0928708350179591e-06,
"loss": 0.7653,
"step": 3468
},
{
"epoch": 1.42,
"grad_norm": 5.009160435693752,
"learning_rate": 1.0924440849993915e-06,
"loss": 0.7931,
"step": 3469
},
{
"epoch": 1.42,
"grad_norm": 2.656249782101977,
"learning_rate": 1.0920173179995244e-06,
"loss": 0.8163,
"step": 3470
},
{
"epoch": 1.42,
"grad_norm": 2.9376843350505992,
"learning_rate": 1.0915905340967522e-06,
"loss": 0.8839,
"step": 3471
},
{
"epoch": 1.42,
"grad_norm": 2.6867680270258165,
"learning_rate": 1.0911637333694714e-06,
"loss": 0.9023,
"step": 3472
},
{
"epoch": 1.42,
"grad_norm": 4.165176202725972,
"learning_rate": 1.0907369158960827e-06,
"loss": 0.7995,
"step": 3473
},
{
"epoch": 1.42,
"grad_norm": 3.0776549357479333,
"learning_rate": 1.090310081754989e-06,
"loss": 0.845,
"step": 3474
},
{
"epoch": 1.42,
"grad_norm": 4.076720109900857,
"learning_rate": 1.0898832310245965e-06,
"loss": 0.8513,
"step": 3475
},
{
"epoch": 1.42,
"grad_norm": 3.20132595031946,
"learning_rate": 1.089456363783315e-06,
"loss": 0.8265,
"step": 3476
},
{
"epoch": 1.42,
"grad_norm": 2.4005506676201542,
"learning_rate": 1.0890294801095562e-06,
"loss": 0.7914,
"step": 3477
},
{
"epoch": 1.42,
"grad_norm": 4.074197238424801,
"learning_rate": 1.0886025800817362e-06,
"loss": 0.844,
"step": 3478
},
{
"epoch": 1.42,
"grad_norm": 2.7661465770818743,
"learning_rate": 1.088175663778273e-06,
"loss": 0.8007,
"step": 3479
},
{
"epoch": 1.42,
"grad_norm": 2.90909895958672,
"learning_rate": 1.087748731277588e-06,
"loss": 0.7568,
"step": 3480
},
{
"epoch": 1.42,
"grad_norm": 2.76111372719799,
"learning_rate": 1.087321782658106e-06,
"loss": 0.8353,
"step": 3481
},
{
"epoch": 1.42,
"grad_norm": 2.6966820993876213,
"learning_rate": 1.0868948179982536e-06,
"loss": 0.8234,
"step": 3482
},
{
"epoch": 1.42,
"grad_norm": 3.0963246110034564,
"learning_rate": 1.0864678373764615e-06,
"loss": 0.8277,
"step": 3483
},
{
"epoch": 1.42,
"grad_norm": 2.613986408178252,
"learning_rate": 1.086040840871163e-06,
"loss": 0.8578,
"step": 3484
},
{
"epoch": 1.42,
"grad_norm": 2.7462006593566466,
"learning_rate": 1.085613828560794e-06,
"loss": 0.819,
"step": 3485
},
{
"epoch": 1.42,
"grad_norm": 2.6067399506003865,
"learning_rate": 1.0851868005237937e-06,
"loss": 0.7846,
"step": 3486
},
{
"epoch": 1.42,
"grad_norm": 3.0213983400463507,
"learning_rate": 1.0847597568386038e-06,
"loss": 0.8265,
"step": 3487
},
{
"epoch": 1.42,
"grad_norm": 2.9616644118620092,
"learning_rate": 1.084332697583669e-06,
"loss": 0.8788,
"step": 3488
},
{
"epoch": 1.42,
"grad_norm": 3.1918302608961193,
"learning_rate": 1.0839056228374375e-06,
"loss": 0.8082,
"step": 3489
},
{
"epoch": 1.42,
"grad_norm": 2.534468172269994,
"learning_rate": 1.0834785326783592e-06,
"loss": 0.8714,
"step": 3490
},
{
"epoch": 1.42,
"grad_norm": 2.9509724837501348,
"learning_rate": 1.0830514271848877e-06,
"loss": 0.7774,
"step": 3491
},
{
"epoch": 1.43,
"grad_norm": 4.5393718519409925,
"learning_rate": 1.0826243064354792e-06,
"loss": 0.8296,
"step": 3492
},
{
"epoch": 1.43,
"grad_norm": 3.0105929244410237,
"learning_rate": 1.0821971705085924e-06,
"loss": 0.8346,
"step": 3493
},
{
"epoch": 1.43,
"grad_norm": 2.9207464592944516,
"learning_rate": 1.0817700194826893e-06,
"loss": 0.8513,
"step": 3494
},
{
"epoch": 1.43,
"grad_norm": 3.2624704399884275,
"learning_rate": 1.081342853436234e-06,
"loss": 0.8722,
"step": 3495
},
{
"epoch": 1.43,
"grad_norm": 3.028976052390905,
"learning_rate": 1.0809156724476944e-06,
"loss": 0.8754,
"step": 3496
},
{
"epoch": 1.43,
"grad_norm": 2.853176655918474,
"learning_rate": 1.08048847659554e-06,
"loss": 0.85,
"step": 3497
},
{
"epoch": 1.43,
"grad_norm": 2.675888289855771,
"learning_rate": 1.0800612659582437e-06,
"loss": 0.8398,
"step": 3498
},
{
"epoch": 1.43,
"grad_norm": 2.936829320908903,
"learning_rate": 1.079634040614281e-06,
"loss": 0.8447,
"step": 3499
},
{
"epoch": 1.43,
"grad_norm": 3.1810937714994156,
"learning_rate": 1.0792068006421303e-06,
"loss": 0.8789,
"step": 3500
},
{
"epoch": 1.43,
"eval_loss": 0.8793097138404846,
"eval_runtime": 466.3321,
"eval_samples_per_second": 74.736,
"eval_steps_per_second": 4.673,
"step": 3500
},
{
"epoch": 1.43,
"grad_norm": 2.972110550523249,
"learning_rate": 1.0787795461202716e-06,
"loss": 0.8093,
"step": 3501
},
{
"epoch": 1.43,
"grad_norm": 3.517794613866309,
"learning_rate": 1.0783522771271893e-06,
"loss": 0.7762,
"step": 3502
},
{
"epoch": 1.43,
"grad_norm": 3.358817759521912,
"learning_rate": 1.077924993741369e-06,
"loss": 0.834,
"step": 3503
},
{
"epoch": 1.43,
"grad_norm": 3.082539482620655,
"learning_rate": 1.0774976960413002e-06,
"loss": 0.7842,
"step": 3504
},
{
"epoch": 1.43,
"grad_norm": 2.822203036256108,
"learning_rate": 1.0770703841054736e-06,
"loss": 0.7959,
"step": 3505
},
{
"epoch": 1.43,
"grad_norm": 2.744148348506507,
"learning_rate": 1.0766430580123837e-06,
"loss": 0.816,
"step": 3506
},
{
"epoch": 1.43,
"grad_norm": 3.351945310638318,
"learning_rate": 1.0762157178405268e-06,
"loss": 0.8524,
"step": 3507
},
{
"epoch": 1.43,
"grad_norm": 3.1246527334375216,
"learning_rate": 1.0757883636684026e-06,
"loss": 0.8197,
"step": 3508
},
{
"epoch": 1.43,
"grad_norm": 2.3226634950267346,
"learning_rate": 1.0753609955745128e-06,
"loss": 0.8918,
"step": 3509
},
{
"epoch": 1.43,
"grad_norm": 3.8644513665466853,
"learning_rate": 1.0749336136373617e-06,
"loss": 0.7564,
"step": 3510
},
{
"epoch": 1.43,
"grad_norm": 2.430450100726175,
"learning_rate": 1.074506217935456e-06,
"loss": 0.8125,
"step": 3511
},
{
"epoch": 1.43,
"grad_norm": 2.9577496369539342,
"learning_rate": 1.0740788085473057e-06,
"loss": 0.7605,
"step": 3512
},
{
"epoch": 1.43,
"grad_norm": 3.8308390814265856,
"learning_rate": 1.0736513855514222e-06,
"loss": 0.8984,
"step": 3513
},
{
"epoch": 1.43,
"grad_norm": 2.2071088363884943,
"learning_rate": 1.07322394902632e-06,
"loss": 0.8371,
"step": 3514
},
{
"epoch": 1.43,
"grad_norm": 2.4972348455401585,
"learning_rate": 1.0727964990505167e-06,
"loss": 0.8157,
"step": 3515
},
{
"epoch": 1.43,
"grad_norm": 3.473968808773429,
"learning_rate": 1.0723690357025307e-06,
"loss": 0.7793,
"step": 3516
},
{
"epoch": 1.44,
"grad_norm": 2.4716961092964524,
"learning_rate": 1.071941559060885e-06,
"loss": 0.833,
"step": 3517
},
{
"epoch": 1.44,
"grad_norm": 2.425626505936839,
"learning_rate": 1.0715140692041033e-06,
"loss": 0.8254,
"step": 3518
},
{
"epoch": 1.44,
"grad_norm": 3.283554062650639,
"learning_rate": 1.0710865662107123e-06,
"loss": 0.7625,
"step": 3519
},
{
"epoch": 1.44,
"grad_norm": 3.298677179580582,
"learning_rate": 1.0706590501592412e-06,
"loss": 0.8628,
"step": 3520
},
{
"epoch": 1.44,
"grad_norm": 2.4361258102078205,
"learning_rate": 1.0702315211282216e-06,
"loss": 0.8612,
"step": 3521
},
{
"epoch": 1.44,
"grad_norm": 2.7024719299172126,
"learning_rate": 1.0698039791961877e-06,
"loss": 0.7955,
"step": 3522
},
{
"epoch": 1.44,
"grad_norm": 3.2018618660987928,
"learning_rate": 1.0693764244416757e-06,
"loss": 0.8158,
"step": 3523
},
{
"epoch": 1.44,
"grad_norm": 2.985352164063182,
"learning_rate": 1.0689488569432235e-06,
"loss": 0.8574,
"step": 3524
},
{
"epoch": 1.44,
"grad_norm": 3.6849933700387925,
"learning_rate": 1.0685212767793731e-06,
"loss": 0.8569,
"step": 3525
},
{
"epoch": 1.44,
"grad_norm": 2.671248879756324,
"learning_rate": 1.0680936840286676e-06,
"loss": 0.8296,
"step": 3526
},
{
"epoch": 1.44,
"grad_norm": 2.66564971693463,
"learning_rate": 1.0676660787696526e-06,
"loss": 0.7902,
"step": 3527
},
{
"epoch": 1.44,
"grad_norm": 2.9701124298190487,
"learning_rate": 1.067238461080876e-06,
"loss": 0.8319,
"step": 3528
},
{
"epoch": 1.44,
"grad_norm": 3.0031012375078197,
"learning_rate": 1.0668108310408877e-06,
"loss": 0.8247,
"step": 3529
},
{
"epoch": 1.44,
"grad_norm": 2.7716551088588126,
"learning_rate": 1.0663831887282409e-06,
"loss": 0.8325,
"step": 3530
},
{
"epoch": 1.44,
"grad_norm": 2.8612627987400456,
"learning_rate": 1.0659555342214897e-06,
"loss": 0.8269,
"step": 3531
},
{
"epoch": 1.44,
"grad_norm": 3.002791454890724,
"learning_rate": 1.0655278675991915e-06,
"loss": 0.8109,
"step": 3532
},
{
"epoch": 1.44,
"grad_norm": 5.050035743653485,
"learning_rate": 1.0651001889399053e-06,
"loss": 0.8722,
"step": 3533
},
{
"epoch": 1.44,
"grad_norm": 2.8724224131308667,
"learning_rate": 1.0646724983221926e-06,
"loss": 0.7697,
"step": 3534
},
{
"epoch": 1.44,
"grad_norm": 2.9753312757479766,
"learning_rate": 1.064244795824617e-06,
"loss": 0.7822,
"step": 3535
},
{
"epoch": 1.44,
"grad_norm": 3.1532631047041977,
"learning_rate": 1.0638170815257446e-06,
"loss": 0.7698,
"step": 3536
},
{
"epoch": 1.44,
"grad_norm": 2.687346808622626,
"learning_rate": 1.0633893555041427e-06,
"loss": 0.8442,
"step": 3537
},
{
"epoch": 1.44,
"grad_norm": 2.57274697685896,
"learning_rate": 1.062961617838382e-06,
"loss": 0.8614,
"step": 3538
},
{
"epoch": 1.44,
"grad_norm": 2.53873078799517,
"learning_rate": 1.0625338686070347e-06,
"loss": 0.7833,
"step": 3539
},
{
"epoch": 1.44,
"grad_norm": 2.3777286089014997,
"learning_rate": 1.0621061078886748e-06,
"loss": 0.8507,
"step": 3540
},
{
"epoch": 1.45,
"grad_norm": 2.570827710570036,
"learning_rate": 1.0616783357618797e-06,
"loss": 0.8524,
"step": 3541
},
{
"epoch": 1.45,
"grad_norm": 3.465447946486636,
"learning_rate": 1.0612505523052267e-06,
"loss": 0.8331,
"step": 3542
},
{
"epoch": 1.45,
"grad_norm": 4.838366292526872,
"learning_rate": 1.0608227575972977e-06,
"loss": 0.8094,
"step": 3543
},
{
"epoch": 1.45,
"grad_norm": 2.9338967416437676,
"learning_rate": 1.0603949517166748e-06,
"loss": 0.8254,
"step": 3544
},
{
"epoch": 1.45,
"grad_norm": 3.2558544701128325,
"learning_rate": 1.0599671347419429e-06,
"loss": 0.8818,
"step": 3545
},
{
"epoch": 1.45,
"grad_norm": 2.6034678298343614,
"learning_rate": 1.0595393067516886e-06,
"loss": 0.8149,
"step": 3546
},
{
"epoch": 1.45,
"grad_norm": 3.197284955917636,
"learning_rate": 1.0591114678245012e-06,
"loss": 0.7855,
"step": 3547
},
{
"epoch": 1.45,
"grad_norm": 3.18914916141454,
"learning_rate": 1.0586836180389713e-06,
"loss": 0.8296,
"step": 3548
},
{
"epoch": 1.45,
"grad_norm": 2.6806736278501817,
"learning_rate": 1.0582557574736922e-06,
"loss": 0.8067,
"step": 3549
},
{
"epoch": 1.45,
"grad_norm": 2.7098686126788274,
"learning_rate": 1.0578278862072577e-06,
"loss": 0.812,
"step": 3550
},
{
"epoch": 1.45,
"grad_norm": 3.185216301586237,
"learning_rate": 1.0574000043182661e-06,
"loss": 0.7694,
"step": 3551
},
{
"epoch": 1.45,
"grad_norm": 2.8520916763540702,
"learning_rate": 1.0569721118853146e-06,
"loss": 0.9236,
"step": 3552
},
{
"epoch": 1.45,
"grad_norm": 2.9855356540920712,
"learning_rate": 1.0565442089870052e-06,
"loss": 0.7691,
"step": 3553
},
{
"epoch": 1.45,
"grad_norm": 3.1004572621021347,
"learning_rate": 1.0561162957019399e-06,
"loss": 0.8156,
"step": 3554
},
{
"epoch": 1.45,
"grad_norm": 2.999733125950064,
"learning_rate": 1.055688372108723e-06,
"loss": 0.7952,
"step": 3555
},
{
"epoch": 1.45,
"grad_norm": 2.9559580175667404,
"learning_rate": 1.0552604382859614e-06,
"loss": 0.7917,
"step": 3556
},
{
"epoch": 1.45,
"grad_norm": 2.772205140136618,
"learning_rate": 1.0548324943122633e-06,
"loss": 0.9027,
"step": 3557
},
{
"epoch": 1.45,
"grad_norm": 3.094101410991152,
"learning_rate": 1.0544045402662385e-06,
"loss": 0.8317,
"step": 3558
},
{
"epoch": 1.45,
"grad_norm": 3.4417800046470823,
"learning_rate": 1.0539765762264995e-06,
"loss": 0.8104,
"step": 3559
},
{
"epoch": 1.45,
"grad_norm": 2.546672976394522,
"learning_rate": 1.05354860227166e-06,
"loss": 0.7994,
"step": 3560
},
{
"epoch": 1.45,
"grad_norm": 3.735271251375197,
"learning_rate": 1.0531206184803354e-06,
"loss": 0.7873,
"step": 3561
},
{
"epoch": 1.45,
"grad_norm": 3.808648054823178,
"learning_rate": 1.0526926249311438e-06,
"loss": 0.839,
"step": 3562
},
{
"epoch": 1.45,
"grad_norm": 2.587811966570033,
"learning_rate": 1.0522646217027036e-06,
"loss": 0.8061,
"step": 3563
},
{
"epoch": 1.45,
"grad_norm": 3.583652762211378,
"learning_rate": 1.0518366088736368e-06,
"loss": 0.8242,
"step": 3564
},
{
"epoch": 1.45,
"grad_norm": 3.3330143448338108,
"learning_rate": 1.0514085865225654e-06,
"loss": 0.8775,
"step": 3565
},
{
"epoch": 1.46,
"grad_norm": 2.396137300270619,
"learning_rate": 1.0509805547281143e-06,
"loss": 0.8152,
"step": 3566
},
{
"epoch": 1.46,
"grad_norm": 2.5458186043340176,
"learning_rate": 1.0505525135689096e-06,
"loss": 0.7973,
"step": 3567
},
{
"epoch": 1.46,
"grad_norm": 2.5803323814614765,
"learning_rate": 1.0501244631235798e-06,
"loss": 0.8289,
"step": 3568
},
{
"epoch": 1.46,
"grad_norm": 2.8473219460510037,
"learning_rate": 1.0496964034707545e-06,
"loss": 0.7909,
"step": 3569
},
{
"epoch": 1.46,
"grad_norm": 3.4311202080468832,
"learning_rate": 1.0492683346890644e-06,
"loss": 0.8223,
"step": 3570
},
{
"epoch": 1.46,
"grad_norm": 3.113534324238185,
"learning_rate": 1.0488402568571437e-06,
"loss": 0.8414,
"step": 3571
},
{
"epoch": 1.46,
"grad_norm": 2.71617535721356,
"learning_rate": 1.0484121700536262e-06,
"loss": 0.8441,
"step": 3572
},
{
"epoch": 1.46,
"grad_norm": 3.45732804058731,
"learning_rate": 1.047984074357149e-06,
"loss": 0.8113,
"step": 3573
},
{
"epoch": 1.46,
"grad_norm": 3.166138221460659,
"learning_rate": 1.0475559698463494e-06,
"loss": 0.8061,
"step": 3574
},
{
"epoch": 1.46,
"grad_norm": 4.023148784516555,
"learning_rate": 1.047127856599868e-06,
"loss": 0.7595,
"step": 3575
},
{
"epoch": 1.46,
"grad_norm": 3.1820813260289738,
"learning_rate": 1.046699734696345e-06,
"loss": 0.8875,
"step": 3576
},
{
"epoch": 1.46,
"grad_norm": 3.3383922491744116,
"learning_rate": 1.0462716042144242e-06,
"loss": 0.7719,
"step": 3577
},
{
"epoch": 1.46,
"grad_norm": 3.026731144696328,
"learning_rate": 1.0458434652327492e-06,
"loss": 0.7922,
"step": 3578
},
{
"epoch": 1.46,
"grad_norm": 2.9441064999033077,
"learning_rate": 1.0454153178299667e-06,
"loss": 0.8072,
"step": 3579
},
{
"epoch": 1.46,
"grad_norm": 3.2407568937122955,
"learning_rate": 1.044987162084724e-06,
"loss": 0.8479,
"step": 3580
},
{
"epoch": 1.46,
"grad_norm": 2.809346599277932,
"learning_rate": 1.04455899807567e-06,
"loss": 0.809,
"step": 3581
},
{
"epoch": 1.46,
"grad_norm": 3.0844798826597746,
"learning_rate": 1.044130825881455e-06,
"loss": 0.7874,
"step": 3582
},
{
"epoch": 1.46,
"grad_norm": 2.777629549895933,
"learning_rate": 1.0437026455807317e-06,
"loss": 0.8687,
"step": 3583
},
{
"epoch": 1.46,
"grad_norm": 2.6605699191520813,
"learning_rate": 1.0432744572521537e-06,
"loss": 0.8485,
"step": 3584
},
{
"epoch": 1.46,
"grad_norm": 2.888974872318917,
"learning_rate": 1.0428462609743755e-06,
"loss": 0.8251,
"step": 3585
},
{
"epoch": 1.46,
"grad_norm": 2.7113487418005624,
"learning_rate": 1.042418056826054e-06,
"loss": 0.8371,
"step": 3586
},
{
"epoch": 1.46,
"grad_norm": 2.9397287947036,
"learning_rate": 1.0419898448858471e-06,
"loss": 0.8336,
"step": 3587
},
{
"epoch": 1.46,
"grad_norm": 2.9729239430397687,
"learning_rate": 1.041561625232414e-06,
"loss": 0.8347,
"step": 3588
},
{
"epoch": 1.46,
"grad_norm": 2.59671292339549,
"learning_rate": 1.0411333979444157e-06,
"loss": 0.8536,
"step": 3589
},
{
"epoch": 1.47,
"grad_norm": 4.570335230513413,
"learning_rate": 1.0407051631005147e-06,
"loss": 0.8469,
"step": 3590
},
{
"epoch": 1.47,
"grad_norm": 2.8559103343891983,
"learning_rate": 1.0402769207793736e-06,
"loss": 0.8777,
"step": 3591
},
{
"epoch": 1.47,
"grad_norm": 2.602986429658809,
"learning_rate": 1.0398486710596585e-06,
"loss": 0.7794,
"step": 3592
},
{
"epoch": 1.47,
"grad_norm": 2.710117676122032,
"learning_rate": 1.0394204140200352e-06,
"loss": 0.8564,
"step": 3593
},
{
"epoch": 1.47,
"grad_norm": 2.648400355555663,
"learning_rate": 1.038992149739171e-06,
"loss": 0.8665,
"step": 3594
},
{
"epoch": 1.47,
"grad_norm": 2.763435721242656,
"learning_rate": 1.0385638782957358e-06,
"loss": 0.8441,
"step": 3595
},
{
"epoch": 1.47,
"grad_norm": 3.2946018411706532,
"learning_rate": 1.0381355997683994e-06,
"loss": 0.7519,
"step": 3596
},
{
"epoch": 1.47,
"grad_norm": 3.11921885602358,
"learning_rate": 1.0377073142358333e-06,
"loss": 0.7893,
"step": 3597
},
{
"epoch": 1.47,
"grad_norm": 3.5843575504233676,
"learning_rate": 1.0372790217767104e-06,
"loss": 0.822,
"step": 3598
},
{
"epoch": 1.47,
"grad_norm": 3.6428210923696738,
"learning_rate": 1.0368507224697053e-06,
"loss": 0.857,
"step": 3599
},
{
"epoch": 1.47,
"grad_norm": 3.025343816011617,
"learning_rate": 1.0364224163934931e-06,
"loss": 0.7726,
"step": 3600
},
{
"epoch": 1.47,
"eval_loss": 0.8781746625900269,
"eval_runtime": 466.8205,
"eval_samples_per_second": 74.658,
"eval_steps_per_second": 4.668,
"step": 3600
},
{
"epoch": 1.47,
"grad_norm": 3.970338428262772,
"learning_rate": 1.0359941036267506e-06,
"loss": 0.7674,
"step": 3601
},
{
"epoch": 1.47,
"grad_norm": 2.4040646821838365,
"learning_rate": 1.0355657842481555e-06,
"loss": 0.8904,
"step": 3602
},
{
"epoch": 1.47,
"grad_norm": 3.5665754825946707,
"learning_rate": 1.0351374583363875e-06,
"loss": 0.8148,
"step": 3603
},
{
"epoch": 1.47,
"grad_norm": 5.930237271169677,
"learning_rate": 1.034709125970126e-06,
"loss": 0.8362,
"step": 3604
},
{
"epoch": 1.47,
"grad_norm": 2.9653809902260204,
"learning_rate": 1.0342807872280535e-06,
"loss": 0.8034,
"step": 3605
},
{
"epoch": 1.47,
"grad_norm": 2.5266968192109704,
"learning_rate": 1.033852442188852e-06,
"loss": 0.8418,
"step": 3606
},
{
"epoch": 1.47,
"grad_norm": 4.9241089416359465,
"learning_rate": 1.0334240909312054e-06,
"loss": 0.7988,
"step": 3607
},
{
"epoch": 1.47,
"grad_norm": 2.6142619538550274,
"learning_rate": 1.032995733533799e-06,
"loss": 0.8316,
"step": 3608
},
{
"epoch": 1.47,
"grad_norm": 3.4668650696547103,
"learning_rate": 1.0325673700753187e-06,
"loss": 0.8585,
"step": 3609
},
{
"epoch": 1.47,
"grad_norm": 2.72549995709747,
"learning_rate": 1.0321390006344517e-06,
"loss": 0.7992,
"step": 3610
},
{
"epoch": 1.47,
"grad_norm": 2.5937094925512945,
"learning_rate": 1.0317106252898866e-06,
"loss": 0.8407,
"step": 3611
},
{
"epoch": 1.47,
"grad_norm": 2.945679330632761,
"learning_rate": 1.031282244120312e-06,
"loss": 0.8064,
"step": 3612
},
{
"epoch": 1.47,
"grad_norm": 2.976386710011163,
"learning_rate": 1.0308538572044195e-06,
"loss": 0.8531,
"step": 3613
},
{
"epoch": 1.47,
"grad_norm": 2.921393429640964,
"learning_rate": 1.0304254646208999e-06,
"loss": 0.7988,
"step": 3614
},
{
"epoch": 1.48,
"grad_norm": 4.002142796794127,
"learning_rate": 1.029997066448446e-06,
"loss": 0.8167,
"step": 3615
},
{
"epoch": 1.48,
"grad_norm": 3.483264848451701,
"learning_rate": 1.0295686627657513e-06,
"loss": 0.8204,
"step": 3616
},
{
"epoch": 1.48,
"grad_norm": 3.144646906181894,
"learning_rate": 1.02914025365151e-06,
"loss": 0.7773,
"step": 3617
},
{
"epoch": 1.48,
"grad_norm": 3.301615648905499,
"learning_rate": 1.028711839184419e-06,
"loss": 0.9073,
"step": 3618
},
{
"epoch": 1.48,
"grad_norm": 3.250163930587703,
"learning_rate": 1.0282834194431736e-06,
"loss": 0.8768,
"step": 3619
},
{
"epoch": 1.48,
"grad_norm": 2.5790097125174682,
"learning_rate": 1.0278549945064719e-06,
"loss": 0.7849,
"step": 3620
},
{
"epoch": 1.48,
"grad_norm": 3.1429813232332977,
"learning_rate": 1.0274265644530122e-06,
"loss": 0.83,
"step": 3621
},
{
"epoch": 1.48,
"grad_norm": 3.2540068367209085,
"learning_rate": 1.0269981293614941e-06,
"loss": 0.7722,
"step": 3622
},
{
"epoch": 1.48,
"grad_norm": 2.7460765311067505,
"learning_rate": 1.026569689310618e-06,
"loss": 0.8403,
"step": 3623
},
{
"epoch": 1.48,
"grad_norm": 3.3715338678168445,
"learning_rate": 1.0261412443790855e-06,
"loss": 0.8873,
"step": 3624
},
{
"epoch": 1.48,
"grad_norm": 2.5537404487704958,
"learning_rate": 1.025712794645598e-06,
"loss": 0.8166,
"step": 3625
},
{
"epoch": 1.48,
"grad_norm": 2.57756775132154,
"learning_rate": 1.025284340188859e-06,
"loss": 0.8503,
"step": 3626
},
{
"epoch": 1.48,
"grad_norm": 2.503732050874767,
"learning_rate": 1.0248558810875725e-06,
"loss": 0.7902,
"step": 3627
},
{
"epoch": 1.48,
"grad_norm": 3.078288441638959,
"learning_rate": 1.0244274174204434e-06,
"loss": 0.7484,
"step": 3628
},
{
"epoch": 1.48,
"grad_norm": 3.9059377295927127,
"learning_rate": 1.023998949266177e-06,
"loss": 0.8265,
"step": 3629
},
{
"epoch": 1.48,
"grad_norm": 3.2315942625399003,
"learning_rate": 1.0235704767034798e-06,
"loss": 0.8204,
"step": 3630
},
{
"epoch": 1.48,
"grad_norm": 2.786604308339088,
"learning_rate": 1.0231419998110594e-06,
"loss": 0.8552,
"step": 3631
},
{
"epoch": 1.48,
"grad_norm": 3.109351792592465,
"learning_rate": 1.0227135186676236e-06,
"loss": 0.7986,
"step": 3632
},
{
"epoch": 1.48,
"grad_norm": 3.0480104415860363,
"learning_rate": 1.0222850333518814e-06,
"loss": 0.8007,
"step": 3633
},
{
"epoch": 1.48,
"grad_norm": 3.157963166190811,
"learning_rate": 1.021856543942542e-06,
"loss": 0.7437,
"step": 3634
},
{
"epoch": 1.48,
"grad_norm": 2.771211861011189,
"learning_rate": 1.021428050518316e-06,
"loss": 0.8552,
"step": 3635
},
{
"epoch": 1.48,
"grad_norm": 2.8055383354685874,
"learning_rate": 1.0209995531579147e-06,
"loss": 0.8322,
"step": 3636
},
{
"epoch": 1.48,
"grad_norm": 2.9114308177101274,
"learning_rate": 1.0205710519400496e-06,
"loss": 0.8032,
"step": 3637
},
{
"epoch": 1.48,
"grad_norm": 2.6716835449334613,
"learning_rate": 1.0201425469434335e-06,
"loss": 0.8188,
"step": 3638
},
{
"epoch": 1.49,
"grad_norm": 2.7522645522962015,
"learning_rate": 1.0197140382467797e-06,
"loss": 0.8832,
"step": 3639
},
{
"epoch": 1.49,
"grad_norm": 2.7750772623924433,
"learning_rate": 1.0192855259288013e-06,
"loss": 0.8391,
"step": 3640
},
{
"epoch": 1.49,
"grad_norm": 3.9776085708971562,
"learning_rate": 1.018857010068214e-06,
"loss": 0.825,
"step": 3641
},
{
"epoch": 1.49,
"grad_norm": 2.9692124540855445,
"learning_rate": 1.0184284907437325e-06,
"loss": 0.7993,
"step": 3642
},
{
"epoch": 1.49,
"grad_norm": 2.745225971172974,
"learning_rate": 1.0179999680340718e-06,
"loss": 0.8905,
"step": 3643
},
{
"epoch": 1.49,
"grad_norm": 2.486355501022104,
"learning_rate": 1.0175714420179503e-06,
"loss": 0.7632,
"step": 3644
},
{
"epoch": 1.49,
"grad_norm": 2.8600633626742544,
"learning_rate": 1.0171429127740833e-06,
"loss": 0.8412,
"step": 3645
},
{
"epoch": 1.49,
"grad_norm": 2.735141257982849,
"learning_rate": 1.0167143803811892e-06,
"loss": 0.8091,
"step": 3646
},
{
"epoch": 1.49,
"grad_norm": 3.2368858676879984,
"learning_rate": 1.0162858449179864e-06,
"loss": 0.8183,
"step": 3647
},
{
"epoch": 1.49,
"grad_norm": 3.1382375731701226,
"learning_rate": 1.0158573064631934e-06,
"loss": 0.8005,
"step": 3648
},
{
"epoch": 1.49,
"grad_norm": 2.7526478386285005,
"learning_rate": 1.0154287650955298e-06,
"loss": 0.8423,
"step": 3649
},
{
"epoch": 1.49,
"grad_norm": 2.9961385186482143,
"learning_rate": 1.0150002208937152e-06,
"loss": 0.9353,
"step": 3650
},
{
"epoch": 1.49,
"grad_norm": 3.8044294906379776,
"learning_rate": 1.0145716739364704e-06,
"loss": 0.7716,
"step": 3651
},
{
"epoch": 1.49,
"grad_norm": 3.9042353256051148,
"learning_rate": 1.0141431243025162e-06,
"loss": 0.7949,
"step": 3652
},
{
"epoch": 1.49,
"grad_norm": 3.3887465245021837,
"learning_rate": 1.0137145720705736e-06,
"loss": 0.8243,
"step": 3653
},
{
"epoch": 1.49,
"grad_norm": 2.4866744163089676,
"learning_rate": 1.0132860173193652e-06,
"loss": 0.8568,
"step": 3654
},
{
"epoch": 1.49,
"grad_norm": 3.549954280058447,
"learning_rate": 1.012857460127613e-06,
"loss": 0.8224,
"step": 3655
},
{
"epoch": 1.49,
"grad_norm": 2.5077921312115192,
"learning_rate": 1.0124289005740394e-06,
"loss": 0.8485,
"step": 3656
},
{
"epoch": 1.49,
"grad_norm": 2.818784257841039,
"learning_rate": 1.0120003387373686e-06,
"loss": 0.8409,
"step": 3657
},
{
"epoch": 1.49,
"grad_norm": 2.9835454383073254,
"learning_rate": 1.0115717746963234e-06,
"loss": 0.8231,
"step": 3658
},
{
"epoch": 1.49,
"grad_norm": 3.034358258743486,
"learning_rate": 1.0111432085296283e-06,
"loss": 0.7945,
"step": 3659
},
{
"epoch": 1.49,
"grad_norm": 2.41365135514477,
"learning_rate": 1.0107146403160075e-06,
"loss": 0.8161,
"step": 3660
},
{
"epoch": 1.49,
"grad_norm": 3.0267373060194305,
"learning_rate": 1.0102860701341863e-06,
"loss": 0.8314,
"step": 3661
},
{
"epoch": 1.49,
"grad_norm": 3.504800257244514,
"learning_rate": 1.0098574980628893e-06,
"loss": 0.8511,
"step": 3662
},
{
"epoch": 1.49,
"grad_norm": 2.417496950214851,
"learning_rate": 1.0094289241808423e-06,
"loss": 0.8329,
"step": 3663
},
{
"epoch": 1.5,
"grad_norm": 4.01500337492168,
"learning_rate": 1.0090003485667713e-06,
"loss": 0.8143,
"step": 3664
},
{
"epoch": 1.5,
"grad_norm": 3.0970576532168406,
"learning_rate": 1.0085717712994026e-06,
"loss": 0.8699,
"step": 3665
},
{
"epoch": 1.5,
"grad_norm": 2.528371045263945,
"learning_rate": 1.008143192457462e-06,
"loss": 0.8343,
"step": 3666
},
{
"epoch": 1.5,
"grad_norm": 3.3167399990037247,
"learning_rate": 1.0077146121196772e-06,
"loss": 0.7972,
"step": 3667
},
{
"epoch": 1.5,
"grad_norm": 3.033552363866982,
"learning_rate": 1.0072860303647748e-06,
"loss": 0.8237,
"step": 3668
},
{
"epoch": 1.5,
"grad_norm": 2.9955268046028842,
"learning_rate": 1.0068574472714818e-06,
"loss": 0.8115,
"step": 3669
},
{
"epoch": 1.5,
"grad_norm": 4.013158183988942,
"learning_rate": 1.0064288629185264e-06,
"loss": 0.86,
"step": 3670
},
{
"epoch": 1.5,
"grad_norm": 2.8372867720160024,
"learning_rate": 1.0060002773846358e-06,
"loss": 0.7865,
"step": 3671
},
{
"epoch": 1.5,
"grad_norm": 2.613167825937184,
"learning_rate": 1.0055716907485389e-06,
"loss": 0.9232,
"step": 3672
},
{
"epoch": 1.5,
"grad_norm": 2.806381282024053,
"learning_rate": 1.0051431030889627e-06,
"loss": 0.853,
"step": 3673
},
{
"epoch": 1.5,
"grad_norm": 2.748202467294689,
"learning_rate": 1.0047145144846365e-06,
"loss": 0.8711,
"step": 3674
},
{
"epoch": 1.5,
"grad_norm": 3.2057608879627257,
"learning_rate": 1.0042859250142885e-06,
"loss": 0.8597,
"step": 3675
},
{
"epoch": 1.5,
"grad_norm": 2.4910479769637477,
"learning_rate": 1.0038573347566478e-06,
"loss": 0.8773,
"step": 3676
},
{
"epoch": 1.5,
"grad_norm": 3.6929735686396716,
"learning_rate": 1.0034287437904429e-06,
"loss": 0.7593,
"step": 3677
},
{
"epoch": 1.5,
"grad_norm": 3.272228598051252,
"learning_rate": 1.003000152194403e-06,
"loss": 0.8444,
"step": 3678
},
{
"epoch": 1.5,
"grad_norm": 4.149075910200719,
"learning_rate": 1.0025715600472571e-06,
"loss": 0.8303,
"step": 3679
},
{
"epoch": 1.5,
"grad_norm": 2.3092407653093043,
"learning_rate": 1.002142967427735e-06,
"loss": 0.8403,
"step": 3680
},
{
"epoch": 1.5,
"grad_norm": 3.838630018538625,
"learning_rate": 1.001714374414565e-06,
"loss": 0.7898,
"step": 3681
},
{
"epoch": 1.5,
"grad_norm": 2.5558224511351924,
"learning_rate": 1.001285781086477e-06,
"loss": 0.8224,
"step": 3682
},
{
"epoch": 1.5,
"grad_norm": 2.903039438137959,
"learning_rate": 1.0008571875222005e-06,
"loss": 0.8577,
"step": 3683
},
{
"epoch": 1.5,
"grad_norm": 2.4275950638697075,
"learning_rate": 1.000428593800465e-06,
"loss": 0.8437,
"step": 3684
},
{
"epoch": 1.5,
"grad_norm": 3.0094366089842377,
"learning_rate": 1e-06,
"loss": 0.8381,
"step": 3685
},
{
"epoch": 1.5,
"grad_norm": 3.6011661199090814,
"learning_rate": 9.995714061995349e-07,
"loss": 0.8141,
"step": 3686
},
{
"epoch": 1.5,
"grad_norm": 3.0048225236032855,
"learning_rate": 9.991428124777994e-07,
"loss": 0.8098,
"step": 3687
},
{
"epoch": 1.5,
"grad_norm": 2.400057055301834,
"learning_rate": 9.987142189135233e-07,
"loss": 0.8416,
"step": 3688
},
{
"epoch": 1.51,
"grad_norm": 2.923275937091851,
"learning_rate": 9.982856255854352e-07,
"loss": 0.838,
"step": 3689
},
{
"epoch": 1.51,
"grad_norm": 2.503757878716997,
"learning_rate": 9.978570325722652e-07,
"loss": 0.7973,
"step": 3690
},
{
"epoch": 1.51,
"grad_norm": 3.1040794838001755,
"learning_rate": 9.974284399527426e-07,
"loss": 0.7513,
"step": 3691
},
{
"epoch": 1.51,
"grad_norm": 2.6538880580895015,
"learning_rate": 9.96999847805597e-07,
"loss": 0.7942,
"step": 3692
},
{
"epoch": 1.51,
"grad_norm": 2.267850525254165,
"learning_rate": 9.96571256209557e-07,
"loss": 0.8422,
"step": 3693
},
{
"epoch": 1.51,
"grad_norm": 3.4455419665855356,
"learning_rate": 9.961426652433523e-07,
"loss": 0.8933,
"step": 3694
},
{
"epoch": 1.51,
"grad_norm": 3.9713575552421925,
"learning_rate": 9.957140749857112e-07,
"loss": 0.7872,
"step": 3695
},
{
"epoch": 1.51,
"grad_norm": 2.5013780182191185,
"learning_rate": 9.952854855153636e-07,
"loss": 0.8376,
"step": 3696
},
{
"epoch": 1.51,
"grad_norm": 2.7777453967222194,
"learning_rate": 9.948568969110372e-07,
"loss": 0.8131,
"step": 3697
},
{
"epoch": 1.51,
"grad_norm": 2.7871125006249,
"learning_rate": 9.944283092514615e-07,
"loss": 0.8507,
"step": 3698
},
{
"epoch": 1.51,
"grad_norm": 4.032829209943345,
"learning_rate": 9.939997226153641e-07,
"loss": 0.8014,
"step": 3699
},
{
"epoch": 1.51,
"grad_norm": 3.9713360423247903,
"learning_rate": 9.935711370814735e-07,
"loss": 0.8234,
"step": 3700
},
{
"epoch": 1.51,
"eval_loss": 0.8774491548538208,
"eval_runtime": 466.3792,
"eval_samples_per_second": 74.729,
"eval_steps_per_second": 4.672,
"step": 3700
},
{
"epoch": 1.51,
"grad_norm": 3.0325405912685137,
"learning_rate": 9.931425527285183e-07,
"loss": 0.8328,
"step": 3701
},
{
"epoch": 1.51,
"grad_norm": 3.150958114602766,
"learning_rate": 9.927139696352254e-07,
"loss": 0.7749,
"step": 3702
},
{
"epoch": 1.51,
"grad_norm": 3.1346600662539177,
"learning_rate": 9.922853878803229e-07,
"loss": 0.8463,
"step": 3703
},
{
"epoch": 1.51,
"grad_norm": 2.658717132942148,
"learning_rate": 9.918568075425378e-07,
"loss": 0.8126,
"step": 3704
},
{
"epoch": 1.51,
"grad_norm": 2.546903464281303,
"learning_rate": 9.914282287005975e-07,
"loss": 0.8002,
"step": 3705
},
{
"epoch": 1.51,
"grad_norm": 3.808789496500799,
"learning_rate": 9.909996514332288e-07,
"loss": 0.7781,
"step": 3706
},
{
"epoch": 1.51,
"grad_norm": 3.3493718260434897,
"learning_rate": 9.905710758191576e-07,
"loss": 0.8079,
"step": 3707
},
{
"epoch": 1.51,
"grad_norm": 2.2753910523864045,
"learning_rate": 9.901425019371106e-07,
"loss": 0.8521,
"step": 3708
},
{
"epoch": 1.51,
"grad_norm": 2.711380294990765,
"learning_rate": 9.89713929865814e-07,
"loss": 0.8422,
"step": 3709
},
{
"epoch": 1.51,
"grad_norm": 3.041238297651093,
"learning_rate": 9.892853596839924e-07,
"loss": 0.885,
"step": 3710
},
{
"epoch": 1.51,
"grad_norm": 4.150765411098226,
"learning_rate": 9.888567914703718e-07,
"loss": 0.8244,
"step": 3711
},
{
"epoch": 1.51,
"grad_norm": 2.5246748759496365,
"learning_rate": 9.884282253036767e-07,
"loss": 0.819,
"step": 3712
},
{
"epoch": 1.52,
"grad_norm": 2.7328350182998813,
"learning_rate": 9.879996612626317e-07,
"loss": 0.8931,
"step": 3713
},
{
"epoch": 1.52,
"grad_norm": 3.025465623689717,
"learning_rate": 9.875710994259608e-07,
"loss": 0.8138,
"step": 3714
},
{
"epoch": 1.52,
"grad_norm": 4.361884365604717,
"learning_rate": 9.87142539872387e-07,
"loss": 0.8157,
"step": 3715
},
{
"epoch": 1.52,
"grad_norm": 2.6521681138788846,
"learning_rate": 9.86713982680635e-07,
"loss": 0.8293,
"step": 3716
},
{
"epoch": 1.52,
"grad_norm": 3.125075250974279,
"learning_rate": 9.862854279294263e-07,
"loss": 0.8257,
"step": 3717
},
{
"epoch": 1.52,
"grad_norm": 3.1753264514292128,
"learning_rate": 9.85856875697484e-07,
"loss": 0.7259,
"step": 3718
},
{
"epoch": 1.52,
"grad_norm": 2.8478238637183515,
"learning_rate": 9.854283260635298e-07,
"loss": 0.8458,
"step": 3719
},
{
"epoch": 1.52,
"grad_norm": 2.513731457349589,
"learning_rate": 9.849997791062849e-07,
"loss": 0.7952,
"step": 3720
},
{
"epoch": 1.52,
"grad_norm": 2.6702548669248807,
"learning_rate": 9.845712349044702e-07,
"loss": 0.7909,
"step": 3721
},
{
"epoch": 1.52,
"grad_norm": 3.329730791763044,
"learning_rate": 9.84142693536807e-07,
"loss": 0.8712,
"step": 3722
},
{
"epoch": 1.52,
"grad_norm": 2.5422324979951787,
"learning_rate": 9.837141550820137e-07,
"loss": 0.7874,
"step": 3723
},
{
"epoch": 1.52,
"grad_norm": 3.0194946801818094,
"learning_rate": 9.832856196188107e-07,
"loss": 0.8402,
"step": 3724
},
{
"epoch": 1.52,
"grad_norm": 4.789318385837217,
"learning_rate": 9.828570872259168e-07,
"loss": 0.8113,
"step": 3725
},
{
"epoch": 1.52,
"grad_norm": 4.465909522113727,
"learning_rate": 9.824285579820503e-07,
"loss": 0.8403,
"step": 3726
},
{
"epoch": 1.52,
"grad_norm": 3.8981483972519984,
"learning_rate": 9.82000031965928e-07,
"loss": 0.8287,
"step": 3727
},
{
"epoch": 1.52,
"grad_norm": 3.267507474148383,
"learning_rate": 9.815715092562678e-07,
"loss": 0.7827,
"step": 3728
},
{
"epoch": 1.52,
"grad_norm": 3.905138523837206,
"learning_rate": 9.81142989931786e-07,
"loss": 0.8024,
"step": 3729
},
{
"epoch": 1.52,
"grad_norm": 3.4226874765219266,
"learning_rate": 9.807144740711984e-07,
"loss": 0.8279,
"step": 3730
},
{
"epoch": 1.52,
"grad_norm": 3.352330998455309,
"learning_rate": 9.802859617532206e-07,
"loss": 0.8036,
"step": 3731
},
{
"epoch": 1.52,
"grad_norm": 2.542227495008085,
"learning_rate": 9.798574530565666e-07,
"loss": 0.8942,
"step": 3732
},
{
"epoch": 1.52,
"grad_norm": 4.331947242008415,
"learning_rate": 9.794289480599503e-07,
"loss": 0.796,
"step": 3733
},
{
"epoch": 1.52,
"grad_norm": 2.836994490564262,
"learning_rate": 9.790004468420852e-07,
"loss": 0.8657,
"step": 3734
},
{
"epoch": 1.52,
"grad_norm": 2.6581432052585834,
"learning_rate": 9.785719494816842e-07,
"loss": 0.842,
"step": 3735
},
{
"epoch": 1.52,
"grad_norm": 3.218826475473835,
"learning_rate": 9.781434560574581e-07,
"loss": 0.8602,
"step": 3736
},
{
"epoch": 1.52,
"grad_norm": 2.9231574995218983,
"learning_rate": 9.777149666481187e-07,
"loss": 0.9114,
"step": 3737
},
{
"epoch": 1.53,
"grad_norm": 2.7158140633281187,
"learning_rate": 9.772864813323763e-07,
"loss": 0.8695,
"step": 3738
},
{
"epoch": 1.53,
"grad_norm": 3.017508511078352,
"learning_rate": 9.768580001889407e-07,
"loss": 0.8088,
"step": 3739
},
{
"epoch": 1.53,
"grad_norm": 2.8948196750523403,
"learning_rate": 9.764295232965203e-07,
"loss": 0.8482,
"step": 3740
},
{
"epoch": 1.53,
"grad_norm": 3.0319900959290824,
"learning_rate": 9.76001050733823e-07,
"loss": 0.7793,
"step": 3741
},
{
"epoch": 1.53,
"grad_norm": 3.08297031109412,
"learning_rate": 9.755725825795566e-07,
"loss": 0.857,
"step": 3742
},
{
"epoch": 1.53,
"grad_norm": 2.827882433471178,
"learning_rate": 9.751441189124276e-07,
"loss": 0.8714,
"step": 3743
},
{
"epoch": 1.53,
"grad_norm": 3.0617933180195314,
"learning_rate": 9.74715659811141e-07,
"loss": 0.7917,
"step": 3744
},
{
"epoch": 1.53,
"grad_norm": 2.6592642603093406,
"learning_rate": 9.742872053544022e-07,
"loss": 0.831,
"step": 3745
},
{
"epoch": 1.53,
"grad_norm": 3.379908729173079,
"learning_rate": 9.738587556209146e-07,
"loss": 0.8099,
"step": 3746
},
{
"epoch": 1.53,
"grad_norm": 2.3591345928155425,
"learning_rate": 9.734303106893818e-07,
"loss": 0.84,
"step": 3747
},
{
"epoch": 1.53,
"grad_norm": 2.6426656100118584,
"learning_rate": 9.730018706385058e-07,
"loss": 0.819,
"step": 3748
},
{
"epoch": 1.53,
"grad_norm": 2.3399700909039525,
"learning_rate": 9.725734355469877e-07,
"loss": 0.8368,
"step": 3749
},
{
"epoch": 1.53,
"grad_norm": 2.481497218609477,
"learning_rate": 9.721450054935283e-07,
"loss": 0.8474,
"step": 3750
},
{
"epoch": 1.53,
"grad_norm": 3.228694071830215,
"learning_rate": 9.717165805568263e-07,
"loss": 0.8153,
"step": 3751
},
{
"epoch": 1.53,
"grad_norm": 3.096468522400253,
"learning_rate": 9.712881608155813e-07,
"loss": 0.7796,
"step": 3752
},
{
"epoch": 1.53,
"grad_norm": 3.094291229614008,
"learning_rate": 9.708597463484899e-07,
"loss": 0.8057,
"step": 3753
},
{
"epoch": 1.53,
"grad_norm": 3.835769787898692,
"learning_rate": 9.704313372342488e-07,
"loss": 0.7935,
"step": 3754
},
{
"epoch": 1.53,
"grad_norm": 2.73703914026374,
"learning_rate": 9.70002933551554e-07,
"loss": 0.8965,
"step": 3755
},
{
"epoch": 1.53,
"grad_norm": 2.921219597990998,
"learning_rate": 9.695745353791002e-07,
"loss": 0.8854,
"step": 3756
},
{
"epoch": 1.53,
"grad_norm": 2.7072041111959124,
"learning_rate": 9.691461427955806e-07,
"loss": 0.7945,
"step": 3757
},
{
"epoch": 1.53,
"grad_norm": 2.4419450637815463,
"learning_rate": 9.68717755879688e-07,
"loss": 0.756,
"step": 3758
},
{
"epoch": 1.53,
"grad_norm": 3.04674697428981,
"learning_rate": 9.682893747101134e-07,
"loss": 0.8503,
"step": 3759
},
{
"epoch": 1.53,
"grad_norm": 2.7422219954803575,
"learning_rate": 9.67860999365548e-07,
"loss": 0.856,
"step": 3760
},
{
"epoch": 1.53,
"grad_norm": 2.9952296620614964,
"learning_rate": 9.674326299246812e-07,
"loss": 0.7461,
"step": 3761
},
{
"epoch": 1.54,
"grad_norm": 2.89926633383467,
"learning_rate": 9.67004266466201e-07,
"loss": 0.8044,
"step": 3762
},
{
"epoch": 1.54,
"grad_norm": 3.1858903555457094,
"learning_rate": 9.665759090687945e-07,
"loss": 0.8784,
"step": 3763
},
{
"epoch": 1.54,
"grad_norm": 3.424792120164961,
"learning_rate": 9.66147557811148e-07,
"loss": 0.7741,
"step": 3764
},
{
"epoch": 1.54,
"grad_norm": 2.870482793711958,
"learning_rate": 9.657192127719466e-07,
"loss": 0.8127,
"step": 3765
},
{
"epoch": 1.54,
"grad_norm": 2.7764768698621323,
"learning_rate": 9.65290874029874e-07,
"loss": 0.8506,
"step": 3766
},
{
"epoch": 1.54,
"grad_norm": 2.470491174563407,
"learning_rate": 9.648625416636124e-07,
"loss": 0.8016,
"step": 3767
},
{
"epoch": 1.54,
"grad_norm": 3.066154299517681,
"learning_rate": 9.644342157518442e-07,
"loss": 0.7581,
"step": 3768
},
{
"epoch": 1.54,
"grad_norm": 3.0388333250968165,
"learning_rate": 9.640058963732495e-07,
"loss": 0.808,
"step": 3769
},
{
"epoch": 1.54,
"grad_norm": 3.9500209048280484,
"learning_rate": 9.635775836065068e-07,
"loss": 0.8243,
"step": 3770
},
{
"epoch": 1.54,
"grad_norm": 3.285331259416731,
"learning_rate": 9.631492775302948e-07,
"loss": 0.8656,
"step": 3771
},
{
"epoch": 1.54,
"grad_norm": 3.0537405676618503,
"learning_rate": 9.627209782232893e-07,
"loss": 0.7954,
"step": 3772
},
{
"epoch": 1.54,
"grad_norm": 2.9837191226366966,
"learning_rate": 9.62292685764167e-07,
"loss": 0.775,
"step": 3773
},
{
"epoch": 1.54,
"grad_norm": 2.7997706243350513,
"learning_rate": 9.618644002316007e-07,
"loss": 0.8792,
"step": 3774
},
{
"epoch": 1.54,
"grad_norm": 2.888970709891548,
"learning_rate": 9.614361217042641e-07,
"loss": 0.8237,
"step": 3775
},
{
"epoch": 1.54,
"grad_norm": 2.6284503759419713,
"learning_rate": 9.610078502608289e-07,
"loss": 0.8678,
"step": 3776
},
{
"epoch": 1.54,
"grad_norm": 2.7994509657142896,
"learning_rate": 9.60579585979965e-07,
"loss": 0.7995,
"step": 3777
},
{
"epoch": 1.54,
"grad_norm": 3.248483480914519,
"learning_rate": 9.601513289403417e-07,
"loss": 0.8106,
"step": 3778
},
{
"epoch": 1.54,
"grad_norm": 2.6736632331515606,
"learning_rate": 9.597230792206265e-07,
"loss": 0.7607,
"step": 3779
},
{
"epoch": 1.54,
"grad_norm": 2.4306688909417016,
"learning_rate": 9.592948368994854e-07,
"loss": 0.8381,
"step": 3780
},
{
"epoch": 1.54,
"grad_norm": 2.606533833771822,
"learning_rate": 9.58866602055584e-07,
"loss": 0.8207,
"step": 3781
},
{
"epoch": 1.54,
"grad_norm": 3.325080588933625,
"learning_rate": 9.58438374767586e-07,
"loss": 0.8052,
"step": 3782
},
{
"epoch": 1.54,
"grad_norm": 2.630535000611345,
"learning_rate": 9.58010155114153e-07,
"loss": 0.8758,
"step": 3783
},
{
"epoch": 1.54,
"grad_norm": 2.7780040380851045,
"learning_rate": 9.57581943173946e-07,
"loss": 0.8178,
"step": 3784
},
{
"epoch": 1.54,
"grad_norm": 3.2815204743526882,
"learning_rate": 9.571537390256242e-07,
"loss": 0.8472,
"step": 3785
},
{
"epoch": 1.54,
"grad_norm": 4.013345258625929,
"learning_rate": 9.567255427478467e-07,
"loss": 0.8668,
"step": 3786
},
{
"epoch": 1.55,
"grad_norm": 2.5624801690605143,
"learning_rate": 9.562973544192682e-07,
"loss": 0.8165,
"step": 3787
},
{
"epoch": 1.55,
"grad_norm": 2.6581750071434387,
"learning_rate": 9.55869174118545e-07,
"loss": 0.7737,
"step": 3788
},
{
"epoch": 1.55,
"grad_norm": 2.8234651728853644,
"learning_rate": 9.554410019243302e-07,
"loss": 0.8921,
"step": 3789
},
{
"epoch": 1.55,
"grad_norm": 2.550179996191361,
"learning_rate": 9.550128379152761e-07,
"loss": 0.8123,
"step": 3790
},
{
"epoch": 1.55,
"grad_norm": 3.1776559088563565,
"learning_rate": 9.545846821700335e-07,
"loss": 0.8062,
"step": 3791
},
{
"epoch": 1.55,
"grad_norm": 2.8460979704621647,
"learning_rate": 9.54156534767251e-07,
"loss": 0.8786,
"step": 3792
},
{
"epoch": 1.55,
"grad_norm": 3.3474541790642562,
"learning_rate": 9.53728395785576e-07,
"loss": 0.8556,
"step": 3793
},
{
"epoch": 1.55,
"grad_norm": 3.2873227910774916,
"learning_rate": 9.533002653036548e-07,
"loss": 0.8258,
"step": 3794
},
{
"epoch": 1.55,
"grad_norm": 2.491294971430405,
"learning_rate": 9.528721434001323e-07,
"loss": 0.949,
"step": 3795
},
{
"epoch": 1.55,
"grad_norm": 2.6380580676377448,
"learning_rate": 9.524440301536506e-07,
"loss": 0.7943,
"step": 3796
},
{
"epoch": 1.55,
"grad_norm": 3.186791677010676,
"learning_rate": 9.520159256428511e-07,
"loss": 0.8316,
"step": 3797
},
{
"epoch": 1.55,
"grad_norm": 2.749969139947035,
"learning_rate": 9.515878299463735e-07,
"loss": 0.7905,
"step": 3798
},
{
"epoch": 1.55,
"grad_norm": 2.681774534599489,
"learning_rate": 9.511597431428566e-07,
"loss": 0.8216,
"step": 3799
},
{
"epoch": 1.55,
"grad_norm": 3.0372922179256117,
"learning_rate": 9.507316653109354e-07,
"loss": 0.8123,
"step": 3800
},
{
"epoch": 1.55,
"eval_loss": 0.8767625689506531,
"eval_runtime": 466.1173,
"eval_samples_per_second": 74.771,
"eval_steps_per_second": 4.675,
"step": 3800
},
{
"epoch": 1.55,
"grad_norm": 2.5807637702499067,
"learning_rate": 9.503035965292456e-07,
"loss": 0.8527,
"step": 3801
},
{
"epoch": 1.55,
"grad_norm": 2.6717493285625133,
"learning_rate": 9.498755368764201e-07,
"loss": 0.8697,
"step": 3802
},
{
"epoch": 1.55,
"grad_norm": 2.440651766148024,
"learning_rate": 9.494474864310902e-07,
"loss": 0.8632,
"step": 3803
},
{
"epoch": 1.55,
"grad_norm": 3.904370043582062,
"learning_rate": 9.490194452718859e-07,
"loss": 0.7611,
"step": 3804
},
{
"epoch": 1.55,
"grad_norm": 3.0234381764481326,
"learning_rate": 9.485914134774349e-07,
"loss": 0.8065,
"step": 3805
},
{
"epoch": 1.55,
"grad_norm": 3.27646308122458,
"learning_rate": 9.481633911263633e-07,
"loss": 0.797,
"step": 3806
},
{
"epoch": 1.55,
"grad_norm": 3.4119410688887664,
"learning_rate": 9.477353782972962e-07,
"loss": 0.7608,
"step": 3807
},
{
"epoch": 1.55,
"grad_norm": 3.8812121125220354,
"learning_rate": 9.473073750688564e-07,
"loss": 0.8041,
"step": 3808
},
{
"epoch": 1.55,
"grad_norm": 3.040851208562584,
"learning_rate": 9.468793815196645e-07,
"loss": 0.7589,
"step": 3809
},
{
"epoch": 1.55,
"grad_norm": 2.416076335702764,
"learning_rate": 9.464513977283401e-07,
"loss": 0.8798,
"step": 3810
},
{
"epoch": 1.56,
"grad_norm": 4.187496602437439,
"learning_rate": 9.460234237735003e-07,
"loss": 0.7955,
"step": 3811
},
{
"epoch": 1.56,
"grad_norm": 2.6583544212792454,
"learning_rate": 9.455954597337616e-07,
"loss": 0.7675,
"step": 3812
},
{
"epoch": 1.56,
"grad_norm": 2.443526562381321,
"learning_rate": 9.451675056877368e-07,
"loss": 0.8437,
"step": 3813
},
{
"epoch": 1.56,
"grad_norm": 2.896956191631736,
"learning_rate": 9.447395617140386e-07,
"loss": 0.8589,
"step": 3814
},
{
"epoch": 1.56,
"grad_norm": 3.37298508987514,
"learning_rate": 9.443116278912769e-07,
"loss": 0.8254,
"step": 3815
},
{
"epoch": 1.56,
"grad_norm": 2.8311175656908985,
"learning_rate": 9.438837042980605e-07,
"loss": 0.8194,
"step": 3816
},
{
"epoch": 1.56,
"grad_norm": 3.376416028764514,
"learning_rate": 9.43455791012995e-07,
"loss": 0.7961,
"step": 3817
},
{
"epoch": 1.56,
"grad_norm": 3.5495383536741847,
"learning_rate": 9.430278881146854e-07,
"loss": 0.7969,
"step": 3818
},
{
"epoch": 1.56,
"grad_norm": 2.936150807178658,
"learning_rate": 9.42599995681734e-07,
"loss": 0.7457,
"step": 3819
},
{
"epoch": 1.56,
"grad_norm": 3.7186930084510474,
"learning_rate": 9.42172113792742e-07,
"loss": 0.8561,
"step": 3820
},
{
"epoch": 1.56,
"grad_norm": 2.7506590627338396,
"learning_rate": 9.417442425263082e-07,
"loss": 0.8171,
"step": 3821
},
{
"epoch": 1.56,
"grad_norm": 2.8322935340539375,
"learning_rate": 9.413163819610287e-07,
"loss": 0.7756,
"step": 3822
},
{
"epoch": 1.56,
"grad_norm": 3.3259120378970746,
"learning_rate": 9.40888532175499e-07,
"loss": 0.759,
"step": 3823
},
{
"epoch": 1.56,
"grad_norm": 2.7320631679754945,
"learning_rate": 9.404606932483113e-07,
"loss": 0.8159,
"step": 3824
},
{
"epoch": 1.56,
"grad_norm": 3.333574379329062,
"learning_rate": 9.400328652580575e-07,
"loss": 0.7955,
"step": 3825
},
{
"epoch": 1.56,
"grad_norm": 3.4868649200961057,
"learning_rate": 9.396050482833253e-07,
"loss": 0.8082,
"step": 3826
},
{
"epoch": 1.56,
"grad_norm": 3.6445060196454167,
"learning_rate": 9.391772424027023e-07,
"loss": 0.8748,
"step": 3827
},
{
"epoch": 1.56,
"grad_norm": 3.799641197924141,
"learning_rate": 9.387494476947731e-07,
"loss": 0.7682,
"step": 3828
},
{
"epoch": 1.56,
"grad_norm": 3.0391868960561212,
"learning_rate": 9.383216642381207e-07,
"loss": 0.7809,
"step": 3829
},
{
"epoch": 1.56,
"grad_norm": 2.4456202416083817,
"learning_rate": 9.37893892111325e-07,
"loss": 0.8488,
"step": 3830
},
{
"epoch": 1.56,
"grad_norm": 2.752141593031499,
"learning_rate": 9.374661313929655e-07,
"loss": 0.842,
"step": 3831
},
{
"epoch": 1.56,
"grad_norm": 2.288654835478112,
"learning_rate": 9.370383821616179e-07,
"loss": 0.8423,
"step": 3832
},
{
"epoch": 1.56,
"grad_norm": 3.1861838966628175,
"learning_rate": 9.366106444958571e-07,
"loss": 0.8052,
"step": 3833
},
{
"epoch": 1.56,
"grad_norm": 2.5936648377659193,
"learning_rate": 9.361829184742557e-07,
"loss": 0.7394,
"step": 3834
},
{
"epoch": 1.56,
"grad_norm": 2.677420053219721,
"learning_rate": 9.35755204175383e-07,
"loss": 0.8206,
"step": 3835
},
{
"epoch": 1.57,
"grad_norm": 2.8955651482995908,
"learning_rate": 9.353275016778075e-07,
"loss": 0.7983,
"step": 3836
},
{
"epoch": 1.57,
"grad_norm": 3.5359697216112735,
"learning_rate": 9.348998110600945e-07,
"loss": 0.8982,
"step": 3837
},
{
"epoch": 1.57,
"grad_norm": 3.0894146217154184,
"learning_rate": 9.344721324008088e-07,
"loss": 0.8088,
"step": 3838
},
{
"epoch": 1.57,
"grad_norm": 2.8637299917297714,
"learning_rate": 9.340444657785102e-07,
"loss": 0.8383,
"step": 3839
},
{
"epoch": 1.57,
"grad_norm": 2.7717506887453753,
"learning_rate": 9.336168112717591e-07,
"loss": 0.8903,
"step": 3840
},
{
"epoch": 1.57,
"grad_norm": 3.2295297511291405,
"learning_rate": 9.331891689591121e-07,
"loss": 0.7823,
"step": 3841
},
{
"epoch": 1.57,
"grad_norm": 4.1532264951507365,
"learning_rate": 9.327615389191243e-07,
"loss": 0.8288,
"step": 3842
},
{
"epoch": 1.57,
"grad_norm": 3.274397245804369,
"learning_rate": 9.323339212303476e-07,
"loss": 0.8154,
"step": 3843
},
{
"epoch": 1.57,
"grad_norm": 2.301998246626229,
"learning_rate": 9.319063159713324e-07,
"loss": 0.8551,
"step": 3844
},
{
"epoch": 1.57,
"grad_norm": 3.070564999005072,
"learning_rate": 9.314787232206266e-07,
"loss": 0.7295,
"step": 3845
},
{
"epoch": 1.57,
"grad_norm": 4.812330934875104,
"learning_rate": 9.310511430567767e-07,
"loss": 0.8158,
"step": 3846
},
{
"epoch": 1.57,
"grad_norm": 3.6469131861827724,
"learning_rate": 9.306235755583247e-07,
"loss": 0.7868,
"step": 3847
},
{
"epoch": 1.57,
"grad_norm": 3.4265043448616885,
"learning_rate": 9.301960208038124e-07,
"loss": 0.8425,
"step": 3848
},
{
"epoch": 1.57,
"grad_norm": 2.7778250736290135,
"learning_rate": 9.297684788717784e-07,
"loss": 0.8249,
"step": 3849
},
{
"epoch": 1.57,
"grad_norm": 2.4719178166416587,
"learning_rate": 9.293409498407586e-07,
"loss": 0.9081,
"step": 3850
},
{
"epoch": 1.57,
"grad_norm": 2.3885556692120455,
"learning_rate": 9.28913433789288e-07,
"loss": 0.7982,
"step": 3851
},
{
"epoch": 1.57,
"grad_norm": 2.887656227123775,
"learning_rate": 9.284859307958968e-07,
"loss": 0.8401,
"step": 3852
},
{
"epoch": 1.57,
"grad_norm": 2.485439504814186,
"learning_rate": 9.280584409391149e-07,
"loss": 0.8055,
"step": 3853
},
{
"epoch": 1.57,
"grad_norm": 2.8899172287826054,
"learning_rate": 9.276309642974689e-07,
"loss": 0.8347,
"step": 3854
},
{
"epoch": 1.57,
"grad_norm": 3.326792548057293,
"learning_rate": 9.272035009494835e-07,
"loss": 0.8023,
"step": 3855
},
{
"epoch": 1.57,
"grad_norm": 3.2517691636195405,
"learning_rate": 9.267760509736799e-07,
"loss": 0.8532,
"step": 3856
},
{
"epoch": 1.57,
"grad_norm": 3.060808768820477,
"learning_rate": 9.263486144485778e-07,
"loss": 0.7635,
"step": 3857
},
{
"epoch": 1.57,
"grad_norm": 3.091707278128988,
"learning_rate": 9.259211914526943e-07,
"loss": 0.8672,
"step": 3858
},
{
"epoch": 1.57,
"grad_norm": 3.226177255391831,
"learning_rate": 9.25493782064544e-07,
"loss": 0.8793,
"step": 3859
},
{
"epoch": 1.58,
"grad_norm": 2.539110431958746,
"learning_rate": 9.250663863626385e-07,
"loss": 0.835,
"step": 3860
},
{
"epoch": 1.58,
"grad_norm": 3.2473393670349995,
"learning_rate": 9.246390044254871e-07,
"loss": 0.8477,
"step": 3861
},
{
"epoch": 1.58,
"grad_norm": 3.6231649470263414,
"learning_rate": 9.242116363315972e-07,
"loss": 0.7724,
"step": 3862
},
{
"epoch": 1.58,
"grad_norm": 2.9680538511414327,
"learning_rate": 9.237842821594728e-07,
"loss": 0.8129,
"step": 3863
},
{
"epoch": 1.58,
"grad_norm": 3.1375138614372657,
"learning_rate": 9.233569419876166e-07,
"loss": 0.8717,
"step": 3864
},
{
"epoch": 1.58,
"grad_norm": 2.6520925588243074,
"learning_rate": 9.229296158945263e-07,
"loss": 0.8827,
"step": 3865
},
{
"epoch": 1.58,
"grad_norm": 2.9175290556164626,
"learning_rate": 9.225023039586999e-07,
"loss": 0.9133,
"step": 3866
},
{
"epoch": 1.58,
"grad_norm": 3.473102031744373,
"learning_rate": 9.220750062586308e-07,
"loss": 0.8174,
"step": 3867
},
{
"epoch": 1.58,
"grad_norm": 3.2851034558722843,
"learning_rate": 9.216477228728109e-07,
"loss": 0.773,
"step": 3868
},
{
"epoch": 1.58,
"grad_norm": 4.599986490020172,
"learning_rate": 9.212204538797285e-07,
"loss": 0.8083,
"step": 3869
},
{
"epoch": 1.58,
"grad_norm": 3.079225051378142,
"learning_rate": 9.207931993578698e-07,
"loss": 0.8545,
"step": 3870
},
{
"epoch": 1.58,
"grad_norm": 4.320096808653716,
"learning_rate": 9.203659593857188e-07,
"loss": 0.8421,
"step": 3871
},
{
"epoch": 1.58,
"grad_norm": 3.20483154113751,
"learning_rate": 9.199387340417563e-07,
"loss": 0.7983,
"step": 3872
},
{
"epoch": 1.58,
"grad_norm": 3.391870605922844,
"learning_rate": 9.195115234044601e-07,
"loss": 0.8493,
"step": 3873
},
{
"epoch": 1.58,
"grad_norm": 2.7972074360789243,
"learning_rate": 9.190843275523056e-07,
"loss": 0.7686,
"step": 3874
},
{
"epoch": 1.58,
"grad_norm": 2.85734681619047,
"learning_rate": 9.186571465637659e-07,
"loss": 0.8325,
"step": 3875
},
{
"epoch": 1.58,
"grad_norm": 3.185886125058062,
"learning_rate": 9.18229980517311e-07,
"loss": 0.833,
"step": 3876
},
{
"epoch": 1.58,
"grad_norm": 2.7904681283095387,
"learning_rate": 9.178028294914079e-07,
"loss": 0.8713,
"step": 3877
},
{
"epoch": 1.58,
"grad_norm": 3.4369966346657863,
"learning_rate": 9.173756935645209e-07,
"loss": 0.7548,
"step": 3878
},
{
"epoch": 1.58,
"grad_norm": 2.5461702161308764,
"learning_rate": 9.169485728151123e-07,
"loss": 0.833,
"step": 3879
},
{
"epoch": 1.58,
"grad_norm": 3.2950357833341086,
"learning_rate": 9.165214673216408e-07,
"loss": 0.8735,
"step": 3880
},
{
"epoch": 1.58,
"grad_norm": 2.6659415881523416,
"learning_rate": 9.160943771625628e-07,
"loss": 0.7938,
"step": 3881
},
{
"epoch": 1.58,
"grad_norm": 2.536888594098963,
"learning_rate": 9.15667302416331e-07,
"loss": 0.814,
"step": 3882
},
{
"epoch": 1.58,
"grad_norm": 2.7413868664165517,
"learning_rate": 9.152402431613961e-07,
"loss": 0.8448,
"step": 3883
},
{
"epoch": 1.58,
"grad_norm": 2.7691077282236525,
"learning_rate": 9.148131994762063e-07,
"loss": 0.8351,
"step": 3884
},
{
"epoch": 1.59,
"grad_norm": 2.671711471074151,
"learning_rate": 9.143861714392061e-07,
"loss": 0.8374,
"step": 3885
},
{
"epoch": 1.59,
"grad_norm": 2.8466000636079234,
"learning_rate": 9.13959159128837e-07,
"loss": 0.8094,
"step": 3886
},
{
"epoch": 1.59,
"grad_norm": 3.5231499116364473,
"learning_rate": 9.135321626235383e-07,
"loss": 0.8109,
"step": 3887
},
{
"epoch": 1.59,
"grad_norm": 2.4763620425164916,
"learning_rate": 9.131051820017463e-07,
"loss": 0.8744,
"step": 3888
},
{
"epoch": 1.59,
"grad_norm": 4.472902892104651,
"learning_rate": 9.126782173418943e-07,
"loss": 0.8183,
"step": 3889
},
{
"epoch": 1.59,
"grad_norm": 3.4876651046534968,
"learning_rate": 9.122512687224121e-07,
"loss": 0.785,
"step": 3890
},
{
"epoch": 1.59,
"grad_norm": 3.39805840459713,
"learning_rate": 9.118243362217269e-07,
"loss": 0.8295,
"step": 3891
},
{
"epoch": 1.59,
"grad_norm": 2.650499878112733,
"learning_rate": 9.113974199182637e-07,
"loss": 0.8464,
"step": 3892
},
{
"epoch": 1.59,
"grad_norm": 3.1432075157546584,
"learning_rate": 9.109705198904436e-07,
"loss": 0.7822,
"step": 3893
},
{
"epoch": 1.59,
"grad_norm": 3.827340865656425,
"learning_rate": 9.105436362166853e-07,
"loss": 0.848,
"step": 3894
},
{
"epoch": 1.59,
"grad_norm": 3.412951931582269,
"learning_rate": 9.101167689754037e-07,
"loss": 0.8081,
"step": 3895
},
{
"epoch": 1.59,
"grad_norm": 2.5074203126880734,
"learning_rate": 9.096899182450111e-07,
"loss": 0.865,
"step": 3896
},
{
"epoch": 1.59,
"grad_norm": 2.5358824376645863,
"learning_rate": 9.092630841039172e-07,
"loss": 0.8219,
"step": 3897
},
{
"epoch": 1.59,
"grad_norm": 2.955048933130243,
"learning_rate": 9.088362666305286e-07,
"loss": 0.8323,
"step": 3898
},
{
"epoch": 1.59,
"grad_norm": 2.792125128876032,
"learning_rate": 9.084094659032479e-07,
"loss": 0.884,
"step": 3899
},
{
"epoch": 1.59,
"grad_norm": 3.644174629318515,
"learning_rate": 9.079826820004755e-07,
"loss": 0.8312,
"step": 3900
},
{
"epoch": 1.59,
"eval_loss": 0.8757497668266296,
"eval_runtime": 465.7922,
"eval_samples_per_second": 74.823,
"eval_steps_per_second": 4.678,
"step": 3900
},
{
"epoch": 1.59,
"grad_norm": 2.8709344327172714,
"learning_rate": 9.075559150006084e-07,
"loss": 0.8495,
"step": 3901
},
{
"epoch": 1.59,
"grad_norm": 3.2179001434179,
"learning_rate": 9.071291649820411e-07,
"loss": 0.8175,
"step": 3902
},
{
"epoch": 1.59,
"grad_norm": 4.058207757946577,
"learning_rate": 9.067024320231636e-07,
"loss": 0.786,
"step": 3903
},
{
"epoch": 1.59,
"grad_norm": 4.730580498596109,
"learning_rate": 9.062757162023638e-07,
"loss": 0.8821,
"step": 3904
},
{
"epoch": 1.59,
"grad_norm": 3.0226654298213402,
"learning_rate": 9.058490175980266e-07,
"loss": 0.8277,
"step": 3905
},
{
"epoch": 1.59,
"grad_norm": 2.907380456692561,
"learning_rate": 9.054223362885334e-07,
"loss": 0.8487,
"step": 3906
},
{
"epoch": 1.59,
"grad_norm": 3.1452155295369963,
"learning_rate": 9.049956723522624e-07,
"loss": 0.8183,
"step": 3907
},
{
"epoch": 1.59,
"grad_norm": 2.9610173150921915,
"learning_rate": 9.045690258675882e-07,
"loss": 0.8067,
"step": 3908
},
{
"epoch": 1.6,
"grad_norm": 3.7464440163036934,
"learning_rate": 9.041423969128829e-07,
"loss": 0.7814,
"step": 3909
},
{
"epoch": 1.6,
"grad_norm": 3.7987431034186425,
"learning_rate": 9.037157855665153e-07,
"loss": 0.7627,
"step": 3910
},
{
"epoch": 1.6,
"grad_norm": 3.8111553428948253,
"learning_rate": 9.032891919068508e-07,
"loss": 0.8733,
"step": 3911
},
{
"epoch": 1.6,
"grad_norm": 3.7211375404483626,
"learning_rate": 9.028626160122512e-07,
"loss": 0.775,
"step": 3912
},
{
"epoch": 1.6,
"grad_norm": 3.126539710509044,
"learning_rate": 9.024360579610753e-07,
"loss": 0.7961,
"step": 3913
},
{
"epoch": 1.6,
"grad_norm": 3.3477473277024896,
"learning_rate": 9.02009517831679e-07,
"loss": 0.7673,
"step": 3914
},
{
"epoch": 1.6,
"grad_norm": 3.2436318625846225,
"learning_rate": 9.015829957024147e-07,
"loss": 0.8073,
"step": 3915
},
{
"epoch": 1.6,
"grad_norm": 3.712115065163701,
"learning_rate": 9.011564916516309e-07,
"loss": 0.8023,
"step": 3916
},
{
"epoch": 1.6,
"grad_norm": 2.574099817058779,
"learning_rate": 9.007300057576732e-07,
"loss": 0.8162,
"step": 3917
},
{
"epoch": 1.6,
"grad_norm": 3.0456548727436847,
"learning_rate": 9.003035380988846e-07,
"loss": 0.8293,
"step": 3918
},
{
"epoch": 1.6,
"grad_norm": 2.976305440127457,
"learning_rate": 8.998770887536038e-07,
"loss": 0.6976,
"step": 3919
},
{
"epoch": 1.6,
"grad_norm": 2.6942450684284998,
"learning_rate": 8.994506578001662e-07,
"loss": 0.8403,
"step": 3920
},
{
"epoch": 1.6,
"grad_norm": 2.8694513947598184,
"learning_rate": 8.99024245316904e-07,
"loss": 0.8769,
"step": 3921
},
{
"epoch": 1.6,
"grad_norm": 2.621276019264129,
"learning_rate": 8.98597851382146e-07,
"loss": 0.7853,
"step": 3922
},
{
"epoch": 1.6,
"grad_norm": 3.139274864384303,
"learning_rate": 8.981714760742182e-07,
"loss": 0.839,
"step": 3923
},
{
"epoch": 1.6,
"grad_norm": 2.9451544839824337,
"learning_rate": 8.977451194714423e-07,
"loss": 0.8424,
"step": 3924
},
{
"epoch": 1.6,
"grad_norm": 3.017030616325397,
"learning_rate": 8.973187816521367e-07,
"loss": 0.8637,
"step": 3925
},
{
"epoch": 1.6,
"grad_norm": 4.203120423982303,
"learning_rate": 8.968924626946166e-07,
"loss": 0.8457,
"step": 3926
},
{
"epoch": 1.6,
"grad_norm": 2.76997905948547,
"learning_rate": 8.964661626771936e-07,
"loss": 0.8683,
"step": 3927
},
{
"epoch": 1.6,
"grad_norm": 3.7909652083109187,
"learning_rate": 8.960398816781763e-07,
"loss": 0.8686,
"step": 3928
},
{
"epoch": 1.6,
"grad_norm": 2.587586923860106,
"learning_rate": 8.95613619775869e-07,
"loss": 0.8503,
"step": 3929
},
{
"epoch": 1.6,
"grad_norm": 3.613797856337829,
"learning_rate": 8.951873770485724e-07,
"loss": 0.8212,
"step": 3930
},
{
"epoch": 1.6,
"grad_norm": 3.0773533723679156,
"learning_rate": 8.947611535745851e-07,
"loss": 0.7638,
"step": 3931
},
{
"epoch": 1.6,
"grad_norm": 3.7386607000154393,
"learning_rate": 8.943349494322011e-07,
"loss": 0.7537,
"step": 3932
},
{
"epoch": 1.6,
"grad_norm": 2.5530885494448587,
"learning_rate": 8.939087646997104e-07,
"loss": 0.8808,
"step": 3933
},
{
"epoch": 1.61,
"grad_norm": 2.7898009830086297,
"learning_rate": 8.934825994554003e-07,
"loss": 0.8412,
"step": 3934
},
{
"epoch": 1.61,
"grad_norm": 3.3734587820091697,
"learning_rate": 8.93056453777554e-07,
"loss": 0.8368,
"step": 3935
},
{
"epoch": 1.61,
"grad_norm": 2.5398514676139206,
"learning_rate": 8.926303277444518e-07,
"loss": 0.8362,
"step": 3936
},
{
"epoch": 1.61,
"grad_norm": 3.4118117901554443,
"learning_rate": 8.922042214343699e-07,
"loss": 0.8525,
"step": 3937
},
{
"epoch": 1.61,
"grad_norm": 3.0084491654579764,
"learning_rate": 8.917781349255805e-07,
"loss": 0.8503,
"step": 3938
},
{
"epoch": 1.61,
"grad_norm": 2.9973949588944806,
"learning_rate": 8.913520682963528e-07,
"loss": 0.8295,
"step": 3939
},
{
"epoch": 1.61,
"grad_norm": 3.808595841195203,
"learning_rate": 8.909260216249519e-07,
"loss": 0.7888,
"step": 3940
},
{
"epoch": 1.61,
"grad_norm": 3.942590791908032,
"learning_rate": 8.904999949896399e-07,
"loss": 0.8376,
"step": 3941
},
{
"epoch": 1.61,
"grad_norm": 2.9304043845038135,
"learning_rate": 8.900739884686742e-07,
"loss": 0.8456,
"step": 3942
},
{
"epoch": 1.61,
"grad_norm": 3.3000111839814483,
"learning_rate": 8.896480021403091e-07,
"loss": 0.834,
"step": 3943
},
{
"epoch": 1.61,
"grad_norm": 2.97495253692414,
"learning_rate": 8.892220360827957e-07,
"loss": 0.8266,
"step": 3944
},
{
"epoch": 1.61,
"grad_norm": 3.10078035966075,
"learning_rate": 8.887960903743806e-07,
"loss": 0.7734,
"step": 3945
},
{
"epoch": 1.61,
"grad_norm": 2.5466221838492302,
"learning_rate": 8.883701650933067e-07,
"loss": 0.843,
"step": 3946
},
{
"epoch": 1.61,
"grad_norm": 3.0010992963641674,
"learning_rate": 8.879442603178135e-07,
"loss": 0.785,
"step": 3947
},
{
"epoch": 1.61,
"grad_norm": 2.6538525195677267,
"learning_rate": 8.875183761261363e-07,
"loss": 0.889,
"step": 3948
},
{
"epoch": 1.61,
"grad_norm": 3.7201055718162537,
"learning_rate": 8.870925125965076e-07,
"loss": 0.7165,
"step": 3949
},
{
"epoch": 1.61,
"grad_norm": 3.0572197754132855,
"learning_rate": 8.866666698071546e-07,
"loss": 0.7684,
"step": 3950
},
{
"epoch": 1.61,
"grad_norm": 3.280284920485557,
"learning_rate": 8.862408478363018e-07,
"loss": 0.7501,
"step": 3951
},
{
"epoch": 1.61,
"grad_norm": 2.339966482013776,
"learning_rate": 8.858150467621698e-07,
"loss": 0.7845,
"step": 3952
},
{
"epoch": 1.61,
"grad_norm": 2.876414372661603,
"learning_rate": 8.853892666629748e-07,
"loss": 0.7575,
"step": 3953
},
{
"epoch": 1.61,
"grad_norm": 2.7432130598721964,
"learning_rate": 8.849635076169299e-07,
"loss": 0.8275,
"step": 3954
},
{
"epoch": 1.61,
"grad_norm": 3.1001476242716817,
"learning_rate": 8.845377697022431e-07,
"loss": 0.7762,
"step": 3955
},
{
"epoch": 1.61,
"grad_norm": 3.4490285309006623,
"learning_rate": 8.841120529971201e-07,
"loss": 0.8037,
"step": 3956
},
{
"epoch": 1.61,
"grad_norm": 2.376956069422458,
"learning_rate": 8.836863575797617e-07,
"loss": 0.7815,
"step": 3957
},
{
"epoch": 1.62,
"grad_norm": 2.957409668939954,
"learning_rate": 8.832606835283653e-07,
"loss": 0.891,
"step": 3958
},
{
"epoch": 1.62,
"grad_norm": 2.9800317545116988,
"learning_rate": 8.828350309211234e-07,
"loss": 0.8029,
"step": 3959
},
{
"epoch": 1.62,
"grad_norm": 2.659143389792635,
"learning_rate": 8.824093998362259e-07,
"loss": 0.8089,
"step": 3960
},
{
"epoch": 1.62,
"grad_norm": 3.03071446159711,
"learning_rate": 8.819837903518574e-07,
"loss": 0.911,
"step": 3961
},
{
"epoch": 1.62,
"grad_norm": 3.2920139800210038,
"learning_rate": 8.815582025462004e-07,
"loss": 0.8161,
"step": 3962
},
{
"epoch": 1.62,
"grad_norm": 2.4162222450033575,
"learning_rate": 8.811326364974309e-07,
"loss": 0.8257,
"step": 3963
},
{
"epoch": 1.62,
"grad_norm": 4.330973135930056,
"learning_rate": 8.807070922837232e-07,
"loss": 0.7614,
"step": 3964
},
{
"epoch": 1.62,
"grad_norm": 4.27949052153162,
"learning_rate": 8.802815699832463e-07,
"loss": 0.8501,
"step": 3965
},
{
"epoch": 1.62,
"grad_norm": 4.350177440659666,
"learning_rate": 8.798560696741654e-07,
"loss": 0.7482,
"step": 3966
},
{
"epoch": 1.62,
"grad_norm": 2.8688874384237755,
"learning_rate": 8.794305914346425e-07,
"loss": 0.8483,
"step": 3967
},
{
"epoch": 1.62,
"grad_norm": 2.9405660061513106,
"learning_rate": 8.790051353428337e-07,
"loss": 0.8025,
"step": 3968
},
{
"epoch": 1.62,
"grad_norm": 4.379675118232489,
"learning_rate": 8.78579701476893e-07,
"loss": 0.813,
"step": 3969
},
{
"epoch": 1.62,
"grad_norm": 3.4507460028123917,
"learning_rate": 8.781542899149691e-07,
"loss": 0.8148,
"step": 3970
},
{
"epoch": 1.62,
"grad_norm": 3.6271320790694945,
"learning_rate": 8.777289007352073e-07,
"loss": 0.7832,
"step": 3971
},
{
"epoch": 1.62,
"grad_norm": 5.337952173341659,
"learning_rate": 8.773035340157482e-07,
"loss": 0.8097,
"step": 3972
},
{
"epoch": 1.62,
"grad_norm": 3.5169740532030835,
"learning_rate": 8.768781898347284e-07,
"loss": 0.7884,
"step": 3973
},
{
"epoch": 1.62,
"grad_norm": 3.71565259342169,
"learning_rate": 8.764528682702806e-07,
"loss": 0.875,
"step": 3974
},
{
"epoch": 1.62,
"grad_norm": 2.846252735009237,
"learning_rate": 8.76027569400534e-07,
"loss": 0.8249,
"step": 3975
},
{
"epoch": 1.62,
"grad_norm": 2.953411496706816,
"learning_rate": 8.756022933036115e-07,
"loss": 0.7988,
"step": 3976
},
{
"epoch": 1.62,
"grad_norm": 4.542418887200488,
"learning_rate": 8.751770400576343e-07,
"loss": 0.7882,
"step": 3977
},
{
"epoch": 1.62,
"grad_norm": 3.1063121405906386,
"learning_rate": 8.747518097407178e-07,
"loss": 0.8708,
"step": 3978
},
{
"epoch": 1.62,
"grad_norm": 3.092496521916784,
"learning_rate": 8.743266024309742e-07,
"loss": 0.8404,
"step": 3979
},
{
"epoch": 1.62,
"grad_norm": 2.557228890588799,
"learning_rate": 8.739014182065101e-07,
"loss": 0.8325,
"step": 3980
},
{
"epoch": 1.62,
"grad_norm": 2.929658746004797,
"learning_rate": 8.73476257145429e-07,
"loss": 0.7998,
"step": 3981
},
{
"epoch": 1.62,
"grad_norm": 3.42362161203424,
"learning_rate": 8.730511193258304e-07,
"loss": 0.8114,
"step": 3982
},
{
"epoch": 1.63,
"grad_norm": 3.307680265617682,
"learning_rate": 8.726260048258085e-07,
"loss": 0.7824,
"step": 3983
},
{
"epoch": 1.63,
"grad_norm": 2.8056407409554307,
"learning_rate": 8.722009137234542e-07,
"loss": 0.8351,
"step": 3984
},
{
"epoch": 1.63,
"grad_norm": 3.0686412939778687,
"learning_rate": 8.717758460968529e-07,
"loss": 0.819,
"step": 3985
},
{
"epoch": 1.63,
"grad_norm": 3.065458143504144,
"learning_rate": 8.713508020240869e-07,
"loss": 0.8442,
"step": 3986
},
{
"epoch": 1.63,
"grad_norm": 3.0311001728977987,
"learning_rate": 8.709257815832333e-07,
"loss": 0.7505,
"step": 3987
},
{
"epoch": 1.63,
"grad_norm": 2.309683310602133,
"learning_rate": 8.705007848523662e-07,
"loss": 0.8144,
"step": 3988
},
{
"epoch": 1.63,
"grad_norm": 2.7547804775362317,
"learning_rate": 8.700758119095529e-07,
"loss": 0.8419,
"step": 3989
},
{
"epoch": 1.63,
"grad_norm": 3.3963684063448385,
"learning_rate": 8.696508628328589e-07,
"loss": 0.805,
"step": 3990
},
{
"epoch": 1.63,
"grad_norm": 2.705540205810719,
"learning_rate": 8.692259377003437e-07,
"loss": 0.8128,
"step": 3991
},
{
"epoch": 1.63,
"grad_norm": 2.5520004697980556,
"learning_rate": 8.688010365900634e-07,
"loss": 0.8317,
"step": 3992
},
{
"epoch": 1.63,
"grad_norm": 3.322418308123796,
"learning_rate": 8.683761595800687e-07,
"loss": 0.783,
"step": 3993
},
{
"epoch": 1.63,
"grad_norm": 2.5834628265206687,
"learning_rate": 8.679513067484063e-07,
"loss": 0.7848,
"step": 3994
},
{
"epoch": 1.63,
"grad_norm": 2.643711500227287,
"learning_rate": 8.675264781731193e-07,
"loss": 0.7888,
"step": 3995
},
{
"epoch": 1.63,
"grad_norm": 3.041870238915418,
"learning_rate": 8.67101673932245e-07,
"loss": 0.7886,
"step": 3996
},
{
"epoch": 1.63,
"grad_norm": 2.584292914265296,
"learning_rate": 8.666768941038172e-07,
"loss": 0.8228,
"step": 3997
},
{
"epoch": 1.63,
"grad_norm": 2.6250539003686995,
"learning_rate": 8.662521387658643e-07,
"loss": 0.7734,
"step": 3998
},
{
"epoch": 1.63,
"grad_norm": 3.056407875752555,
"learning_rate": 8.658274079964111e-07,
"loss": 0.8553,
"step": 3999
},
{
"epoch": 1.63,
"grad_norm": 3.294212340176613,
"learning_rate": 8.654027018734772e-07,
"loss": 0.7962,
"step": 4000
},
{
"epoch": 1.63,
"eval_loss": 0.8744407892227173,
"eval_runtime": 465.9112,
"eval_samples_per_second": 74.804,
"eval_steps_per_second": 4.677,
"step": 4000
},
{
"epoch": 1.63,
"grad_norm": 2.6961613543363447,
"learning_rate": 8.649780204750789e-07,
"loss": 0.8308,
"step": 4001
},
{
"epoch": 1.63,
"grad_norm": 3.2197806226224754,
"learning_rate": 8.645533638792258e-07,
"loss": 0.8481,
"step": 4002
},
{
"epoch": 1.63,
"grad_norm": 3.4278181853484133,
"learning_rate": 8.641287321639249e-07,
"loss": 0.8231,
"step": 4003
},
{
"epoch": 1.63,
"grad_norm": 2.7770859435626973,
"learning_rate": 8.63704125407178e-07,
"loss": 0.8213,
"step": 4004
},
{
"epoch": 1.63,
"grad_norm": 3.3044429439682292,
"learning_rate": 8.632795436869821e-07,
"loss": 0.8691,
"step": 4005
},
{
"epoch": 1.63,
"grad_norm": 3.794677615066353,
"learning_rate": 8.628549870813296e-07,
"loss": 0.7576,
"step": 4006
},
{
"epoch": 1.64,
"grad_norm": 3.40443504825831,
"learning_rate": 8.624304556682082e-07,
"loss": 0.8568,
"step": 4007
},
{
"epoch": 1.64,
"grad_norm": 2.6463269985233864,
"learning_rate": 8.620059495256019e-07,
"loss": 0.8453,
"step": 4008
},
{
"epoch": 1.64,
"grad_norm": 2.842335481864828,
"learning_rate": 8.615814687314889e-07,
"loss": 0.7875,
"step": 4009
},
{
"epoch": 1.64,
"grad_norm": 2.807965998545619,
"learning_rate": 8.611570133638436e-07,
"loss": 0.7903,
"step": 4010
},
{
"epoch": 1.64,
"grad_norm": 3.0006044774146168,
"learning_rate": 8.607325835006348e-07,
"loss": 0.8453,
"step": 4011
},
{
"epoch": 1.64,
"grad_norm": 3.5357544011519795,
"learning_rate": 8.603081792198274e-07,
"loss": 0.7793,
"step": 4012
},
{
"epoch": 1.64,
"grad_norm": 3.246731680077103,
"learning_rate": 8.598838005993811e-07,
"loss": 0.7844,
"step": 4013
},
{
"epoch": 1.64,
"grad_norm": 2.562807273556959,
"learning_rate": 8.59459447717252e-07,
"loss": 0.8305,
"step": 4014
},
{
"epoch": 1.64,
"grad_norm": 3.142705014499922,
"learning_rate": 8.590351206513893e-07,
"loss": 0.8333,
"step": 4015
},
{
"epoch": 1.64,
"grad_norm": 2.8411565850261407,
"learning_rate": 8.586108194797397e-07,
"loss": 0.7672,
"step": 4016
},
{
"epoch": 1.64,
"grad_norm": 3.3289289436671092,
"learning_rate": 8.581865442802439e-07,
"loss": 0.8527,
"step": 4017
},
{
"epoch": 1.64,
"grad_norm": 3.4794709096092395,
"learning_rate": 8.577622951308385e-07,
"loss": 0.7671,
"step": 4018
},
{
"epoch": 1.64,
"grad_norm": 3.2527621807415836,
"learning_rate": 8.573380721094543e-07,
"loss": 0.8033,
"step": 4019
},
{
"epoch": 1.64,
"grad_norm": 3.276858888514617,
"learning_rate": 8.569138752940182e-07,
"loss": 0.8347,
"step": 4020
},
{
"epoch": 1.64,
"grad_norm": 3.007142040873471,
"learning_rate": 8.564897047624523e-07,
"loss": 0.8863,
"step": 4021
},
{
"epoch": 1.64,
"grad_norm": 2.7603734281823322,
"learning_rate": 8.560655605926737e-07,
"loss": 0.7577,
"step": 4022
},
{
"epoch": 1.64,
"grad_norm": 2.4832706514172047,
"learning_rate": 8.55641442862594e-07,
"loss": 0.7775,
"step": 4023
},
{
"epoch": 1.64,
"grad_norm": 3.1428682453634096,
"learning_rate": 8.552173516501209e-07,
"loss": 0.76,
"step": 4024
},
{
"epoch": 1.64,
"grad_norm": 3.748219631087575,
"learning_rate": 8.547932870331566e-07,
"loss": 0.8291,
"step": 4025
},
{
"epoch": 1.64,
"grad_norm": 3.4389858264151925,
"learning_rate": 8.543692490895988e-07,
"loss": 0.8286,
"step": 4026
},
{
"epoch": 1.64,
"grad_norm": 3.720219451796561,
"learning_rate": 8.539452378973407e-07,
"loss": 0.7642,
"step": 4027
},
{
"epoch": 1.64,
"grad_norm": 3.2504979487381296,
"learning_rate": 8.53521253534269e-07,
"loss": 0.8171,
"step": 4028
},
{
"epoch": 1.64,
"grad_norm": 2.7367439439545023,
"learning_rate": 8.530972960782671e-07,
"loss": 0.7482,
"step": 4029
},
{
"epoch": 1.64,
"grad_norm": 3.0714803843632437,
"learning_rate": 8.526733656072129e-07,
"loss": 0.7893,
"step": 4030
},
{
"epoch": 1.64,
"grad_norm": 4.363594627025173,
"learning_rate": 8.522494621989793e-07,
"loss": 0.8496,
"step": 4031
},
{
"epoch": 1.65,
"grad_norm": 4.138474509447946,
"learning_rate": 8.518255859314341e-07,
"loss": 0.797,
"step": 4032
},
{
"epoch": 1.65,
"grad_norm": 3.553872856167557,
"learning_rate": 8.514017368824399e-07,
"loss": 0.8408,
"step": 4033
},
{
"epoch": 1.65,
"grad_norm": 3.58989348661617,
"learning_rate": 8.509779151298555e-07,
"loss": 0.8051,
"step": 4034
},
{
"epoch": 1.65,
"grad_norm": 2.8358219947888186,
"learning_rate": 8.505541207515335e-07,
"loss": 0.7525,
"step": 4035
},
{
"epoch": 1.65,
"grad_norm": 3.363526024387236,
"learning_rate": 8.501303538253215e-07,
"loss": 0.8123,
"step": 4036
},
{
"epoch": 1.65,
"grad_norm": 2.5807559832860365,
"learning_rate": 8.497066144290627e-07,
"loss": 0.8207,
"step": 4037
},
{
"epoch": 1.65,
"grad_norm": 3.0651996468199116,
"learning_rate": 8.492829026405947e-07,
"loss": 0.749,
"step": 4038
},
{
"epoch": 1.65,
"grad_norm": 3.240944866306308,
"learning_rate": 8.488592185377501e-07,
"loss": 0.7618,
"step": 4039
},
{
"epoch": 1.65,
"grad_norm": 2.764349324933311,
"learning_rate": 8.484355621983577e-07,
"loss": 0.7948,
"step": 4040
},
{
"epoch": 1.65,
"grad_norm": 3.274294028673199,
"learning_rate": 8.480119337002383e-07,
"loss": 0.7795,
"step": 4041
},
{
"epoch": 1.65,
"grad_norm": 2.5488028388291046,
"learning_rate": 8.475883331212104e-07,
"loss": 0.816,
"step": 4042
},
{
"epoch": 1.65,
"grad_norm": 2.624263787086616,
"learning_rate": 8.471647605390862e-07,
"loss": 0.83,
"step": 4043
},
{
"epoch": 1.65,
"grad_norm": 2.6706268623140677,
"learning_rate": 8.46741216031673e-07,
"loss": 0.8219,
"step": 4044
},
{
"epoch": 1.65,
"grad_norm": 2.711082973516553,
"learning_rate": 8.463176996767724e-07,
"loss": 0.7499,
"step": 4045
},
{
"epoch": 1.65,
"grad_norm": 2.73357689777792,
"learning_rate": 8.458942115521812e-07,
"loss": 0.8257,
"step": 4046
},
{
"epoch": 1.65,
"grad_norm": 2.511003438042444,
"learning_rate": 8.454707517356916e-07,
"loss": 0.8086,
"step": 4047
},
{
"epoch": 1.65,
"grad_norm": 5.368126932749729,
"learning_rate": 8.450473203050899e-07,
"loss": 0.8877,
"step": 4048
},
{
"epoch": 1.65,
"grad_norm": 2.9500132856851815,
"learning_rate": 8.446239173381571e-07,
"loss": 0.8483,
"step": 4049
},
{
"epoch": 1.65,
"grad_norm": 3.594229684504372,
"learning_rate": 8.442005429126693e-07,
"loss": 0.8426,
"step": 4050
},
{
"epoch": 1.65,
"grad_norm": 3.6689686753267883,
"learning_rate": 8.437771971063972e-07,
"loss": 0.7956,
"step": 4051
},
{
"epoch": 1.65,
"grad_norm": 2.9714011484984204,
"learning_rate": 8.433538799971068e-07,
"loss": 0.8115,
"step": 4052
},
{
"epoch": 1.65,
"grad_norm": 3.0531946387839826,
"learning_rate": 8.429305916625578e-07,
"loss": 0.8976,
"step": 4053
},
{
"epoch": 1.65,
"grad_norm": 3.187564802920537,
"learning_rate": 8.425073321805048e-07,
"loss": 0.7919,
"step": 4054
},
{
"epoch": 1.65,
"grad_norm": 3.079626214643183,
"learning_rate": 8.420841016286983e-07,
"loss": 0.8782,
"step": 4055
},
{
"epoch": 1.66,
"grad_norm": 3.038582268196141,
"learning_rate": 8.416609000848824e-07,
"loss": 0.8579,
"step": 4056
},
{
"epoch": 1.66,
"grad_norm": 2.7257448359473813,
"learning_rate": 8.412377276267964e-07,
"loss": 0.8642,
"step": 4057
},
{
"epoch": 1.66,
"grad_norm": 2.891875405915258,
"learning_rate": 8.408145843321731e-07,
"loss": 0.7603,
"step": 4058
},
{
"epoch": 1.66,
"grad_norm": 2.7595938494398444,
"learning_rate": 8.403914702787411e-07,
"loss": 0.8508,
"step": 4059
},
{
"epoch": 1.66,
"grad_norm": 3.418989935873254,
"learning_rate": 8.39968385544224e-07,
"loss": 0.7524,
"step": 4060
},
{
"epoch": 1.66,
"grad_norm": 3.16000061223267,
"learning_rate": 8.39545330206339e-07,
"loss": 0.8144,
"step": 4061
},
{
"epoch": 1.66,
"grad_norm": 2.6087286241915955,
"learning_rate": 8.39122304342798e-07,
"loss": 0.7664,
"step": 4062
},
{
"epoch": 1.66,
"grad_norm": 3.35733616258596,
"learning_rate": 8.386993080313079e-07,
"loss": 0.7893,
"step": 4063
},
{
"epoch": 1.66,
"grad_norm": 2.8493478835396004,
"learning_rate": 8.382763413495698e-07,
"loss": 0.8256,
"step": 4064
},
{
"epoch": 1.66,
"grad_norm": 2.9561861666063685,
"learning_rate": 8.378534043752805e-07,
"loss": 0.8333,
"step": 4065
},
{
"epoch": 1.66,
"grad_norm": 3.089116870623866,
"learning_rate": 8.37430497186129e-07,
"loss": 0.7705,
"step": 4066
},
{
"epoch": 1.66,
"grad_norm": 2.816494695186894,
"learning_rate": 8.370076198598013e-07,
"loss": 0.8262,
"step": 4067
},
{
"epoch": 1.66,
"grad_norm": 2.845798007375517,
"learning_rate": 8.365847724739765e-07,
"loss": 0.7855,
"step": 4068
},
{
"epoch": 1.66,
"grad_norm": 3.187716135163052,
"learning_rate": 8.361619551063286e-07,
"loss": 0.8662,
"step": 4069
},
{
"epoch": 1.66,
"grad_norm": 2.7869355667951763,
"learning_rate": 8.357391678345262e-07,
"loss": 0.8628,
"step": 4070
},
{
"epoch": 1.66,
"grad_norm": 2.9306712730985183,
"learning_rate": 8.353164107362318e-07,
"loss": 0.8475,
"step": 4071
},
{
"epoch": 1.66,
"grad_norm": 3.171641872093218,
"learning_rate": 8.348936838891028e-07,
"loss": 0.795,
"step": 4072
},
{
"epoch": 1.66,
"grad_norm": 2.558815734843899,
"learning_rate": 8.344709873707915e-07,
"loss": 0.8787,
"step": 4073
},
{
"epoch": 1.66,
"grad_norm": 3.2283610793970325,
"learning_rate": 8.34048321258944e-07,
"loss": 0.8216,
"step": 4074
},
{
"epoch": 1.66,
"grad_norm": 2.552969246048517,
"learning_rate": 8.336256856312009e-07,
"loss": 0.8645,
"step": 4075
},
{
"epoch": 1.66,
"grad_norm": 2.8474279678938235,
"learning_rate": 8.33203080565197e-07,
"loss": 0.8642,
"step": 4076
},
{
"epoch": 1.66,
"grad_norm": 3.7370610368902244,
"learning_rate": 8.327805061385618e-07,
"loss": 0.767,
"step": 4077
},
{
"epoch": 1.66,
"grad_norm": 3.1786686162928413,
"learning_rate": 8.323579624289198e-07,
"loss": 0.8417,
"step": 4078
},
{
"epoch": 1.66,
"grad_norm": 3.344453646372888,
"learning_rate": 8.319354495138881e-07,
"loss": 0.8917,
"step": 4079
},
{
"epoch": 1.66,
"grad_norm": 3.3044534458057315,
"learning_rate": 8.315129674710799e-07,
"loss": 0.8395,
"step": 4080
},
{
"epoch": 1.67,
"grad_norm": 2.8403423569510458,
"learning_rate": 8.310905163781019e-07,
"loss": 0.8634,
"step": 4081
},
{
"epoch": 1.67,
"grad_norm": 3.605965587378421,
"learning_rate": 8.306680963125556e-07,
"loss": 0.8101,
"step": 4082
},
{
"epoch": 1.67,
"grad_norm": 2.9080533691675647,
"learning_rate": 8.302457073520359e-07,
"loss": 0.8338,
"step": 4083
},
{
"epoch": 1.67,
"grad_norm": 2.59572212032861,
"learning_rate": 8.298233495741327e-07,
"loss": 0.8859,
"step": 4084
},
{
"epoch": 1.67,
"grad_norm": 3.0737864031498585,
"learning_rate": 8.294010230564299e-07,
"loss": 0.8021,
"step": 4085
},
{
"epoch": 1.67,
"grad_norm": 2.6075955327070948,
"learning_rate": 8.289787278765062e-07,
"loss": 0.8344,
"step": 4086
},
{
"epoch": 1.67,
"grad_norm": 2.8422109610394672,
"learning_rate": 8.285564641119343e-07,
"loss": 0.8779,
"step": 4087
},
{
"epoch": 1.67,
"grad_norm": 3.6772721711715013,
"learning_rate": 8.281342318402801e-07,
"loss": 0.7429,
"step": 4088
},
{
"epoch": 1.67,
"grad_norm": 2.8176062310724284,
"learning_rate": 8.277120311391051e-07,
"loss": 0.8195,
"step": 4089
},
{
"epoch": 1.67,
"grad_norm": 2.981066284447691,
"learning_rate": 8.272898620859643e-07,
"loss": 0.8153,
"step": 4090
},
{
"epoch": 1.67,
"grad_norm": 2.7388016805613105,
"learning_rate": 8.268677247584076e-07,
"loss": 0.7921,
"step": 4091
},
{
"epoch": 1.67,
"grad_norm": 3.0662076321209204,
"learning_rate": 8.264456192339775e-07,
"loss": 0.8168,
"step": 4092
},
{
"epoch": 1.67,
"grad_norm": 2.909521483094279,
"learning_rate": 8.260235455902124e-07,
"loss": 0.8021,
"step": 4093
},
{
"epoch": 1.67,
"grad_norm": 2.8237045803805105,
"learning_rate": 8.256015039046442e-07,
"loss": 0.7804,
"step": 4094
},
{
"epoch": 1.67,
"grad_norm": 3.472397125385917,
"learning_rate": 8.251794942547987e-07,
"loss": 0.7772,
"step": 4095
},
{
"epoch": 1.67,
"grad_norm": 2.6574840071658103,
"learning_rate": 8.247575167181957e-07,
"loss": 0.8349,
"step": 4096
},
{
"epoch": 1.67,
"grad_norm": 3.0825799597349355,
"learning_rate": 8.243355713723497e-07,
"loss": 0.7711,
"step": 4097
},
{
"epoch": 1.67,
"grad_norm": 3.222741295267264,
"learning_rate": 8.239136582947686e-07,
"loss": 0.8042,
"step": 4098
},
{
"epoch": 1.67,
"grad_norm": 3.5546436406427104,
"learning_rate": 8.234917775629551e-07,
"loss": 0.7779,
"step": 4099
},
{
"epoch": 1.67,
"grad_norm": 2.390579168650682,
"learning_rate": 8.230699292544058e-07,
"loss": 0.8292,
"step": 4100
},
{
"epoch": 1.67,
"eval_loss": 0.874110758304596,
"eval_runtime": 466.0724,
"eval_samples_per_second": 74.778,
"eval_steps_per_second": 4.675,
"step": 4100
},
{
"epoch": 1.67,
"grad_norm": 2.860291281041945,
"learning_rate": 8.226481134466104e-07,
"loss": 0.7748,
"step": 4101
},
{
"epoch": 1.67,
"grad_norm": 3.243929960451979,
"learning_rate": 8.22226330217054e-07,
"loss": 0.8485,
"step": 4102
},
{
"epoch": 1.67,
"grad_norm": 3.0020468199029966,
"learning_rate": 8.218045796432145e-07,
"loss": 0.7723,
"step": 4103
},
{
"epoch": 1.67,
"grad_norm": 3.2814038305761177,
"learning_rate": 8.213828618025655e-07,
"loss": 0.8566,
"step": 4104
},
{
"epoch": 1.68,
"grad_norm": 3.416187959357751,
"learning_rate": 8.209611767725719e-07,
"loss": 0.7795,
"step": 4105
},
{
"epoch": 1.68,
"grad_norm": 3.0939152328015216,
"learning_rate": 8.205395246306953e-07,
"loss": 0.7844,
"step": 4106
},
{
"epoch": 1.68,
"grad_norm": 2.9191504572417166,
"learning_rate": 8.201179054543896e-07,
"loss": 0.8042,
"step": 4107
},
{
"epoch": 1.68,
"grad_norm": 3.4131070932177883,
"learning_rate": 8.196963193211037e-07,
"loss": 0.8439,
"step": 4108
},
{
"epoch": 1.68,
"grad_norm": 3.8066899414522886,
"learning_rate": 8.192747663082792e-07,
"loss": 0.788,
"step": 4109
},
{
"epoch": 1.68,
"grad_norm": 4.668966902264546,
"learning_rate": 8.188532464933526e-07,
"loss": 0.7951,
"step": 4110
},
{
"epoch": 1.68,
"grad_norm": 2.899034296623488,
"learning_rate": 8.184317599537536e-07,
"loss": 0.8491,
"step": 4111
},
{
"epoch": 1.68,
"grad_norm": 3.9020074531561115,
"learning_rate": 8.180103067669069e-07,
"loss": 0.7745,
"step": 4112
},
{
"epoch": 1.68,
"grad_norm": 3.175866579529044,
"learning_rate": 8.175888870102301e-07,
"loss": 0.7142,
"step": 4113
},
{
"epoch": 1.68,
"grad_norm": 2.945002057737186,
"learning_rate": 8.171675007611346e-07,
"loss": 0.7734,
"step": 4114
},
{
"epoch": 1.68,
"grad_norm": 2.8882623792495625,
"learning_rate": 8.167461480970262e-07,
"loss": 0.7489,
"step": 4115
},
{
"epoch": 1.68,
"grad_norm": 2.9553504479974646,
"learning_rate": 8.16324829095304e-07,
"loss": 0.7542,
"step": 4116
},
{
"epoch": 1.68,
"grad_norm": 2.9163603449189015,
"learning_rate": 8.159035438333621e-07,
"loss": 0.8263,
"step": 4117
},
{
"epoch": 1.68,
"grad_norm": 2.2971646632543576,
"learning_rate": 8.154822923885863e-07,
"loss": 0.8281,
"step": 4118
},
{
"epoch": 1.68,
"grad_norm": 3.1653986504689815,
"learning_rate": 8.150610748383581e-07,
"loss": 0.8092,
"step": 4119
},
{
"epoch": 1.68,
"grad_norm": 3.0118816291215396,
"learning_rate": 8.146398912600521e-07,
"loss": 0.8383,
"step": 4120
},
{
"epoch": 1.68,
"grad_norm": 3.6506336662973093,
"learning_rate": 8.142187417310365e-07,
"loss": 0.8132,
"step": 4121
},
{
"epoch": 1.68,
"grad_norm": 2.4725550979484017,
"learning_rate": 8.137976263286733e-07,
"loss": 0.9015,
"step": 4122
},
{
"epoch": 1.68,
"grad_norm": 3.6018817351095818,
"learning_rate": 8.133765451303182e-07,
"loss": 0.8124,
"step": 4123
},
{
"epoch": 1.68,
"grad_norm": 2.6619843962295286,
"learning_rate": 8.129554982133206e-07,
"loss": 0.8651,
"step": 4124
},
{
"epoch": 1.68,
"grad_norm": 4.1345424169645435,
"learning_rate": 8.125344856550247e-07,
"loss": 0.8437,
"step": 4125
},
{
"epoch": 1.68,
"grad_norm": 2.7661588176843632,
"learning_rate": 8.12113507532766e-07,
"loss": 0.8599,
"step": 4126
},
{
"epoch": 1.68,
"grad_norm": 3.0100005863975556,
"learning_rate": 8.116925639238762e-07,
"loss": 0.8072,
"step": 4127
},
{
"epoch": 1.68,
"grad_norm": 3.2560834297293604,
"learning_rate": 8.11271654905679e-07,
"loss": 0.8427,
"step": 4128
},
{
"epoch": 1.68,
"grad_norm": 2.97941952232176,
"learning_rate": 8.108507805554923e-07,
"loss": 0.8175,
"step": 4129
},
{
"epoch": 1.69,
"grad_norm": 3.5806052769412253,
"learning_rate": 8.104299409506283e-07,
"loss": 0.7768,
"step": 4130
},
{
"epoch": 1.69,
"grad_norm": 2.558731822743576,
"learning_rate": 8.100091361683911e-07,
"loss": 0.8972,
"step": 4131
},
{
"epoch": 1.69,
"grad_norm": 2.77093868006394,
"learning_rate": 8.095883662860801e-07,
"loss": 0.8317,
"step": 4132
},
{
"epoch": 1.69,
"grad_norm": 3.7232934980299293,
"learning_rate": 8.091676313809874e-07,
"loss": 0.8326,
"step": 4133
},
{
"epoch": 1.69,
"grad_norm": 3.4358246184866035,
"learning_rate": 8.087469315303994e-07,
"loss": 0.7785,
"step": 4134
},
{
"epoch": 1.69,
"grad_norm": 3.342003306941795,
"learning_rate": 8.083262668115948e-07,
"loss": 0.804,
"step": 4135
},
{
"epoch": 1.69,
"grad_norm": 3.404430497934739,
"learning_rate": 8.079056373018471e-07,
"loss": 0.8439,
"step": 4136
},
{
"epoch": 1.69,
"grad_norm": 3.372563123063929,
"learning_rate": 8.074850430784226e-07,
"loss": 0.8121,
"step": 4137
},
{
"epoch": 1.69,
"grad_norm": 3.023195256295235,
"learning_rate": 8.070644842185821e-07,
"loss": 0.8343,
"step": 4138
},
{
"epoch": 1.69,
"grad_norm": 3.106497996781755,
"learning_rate": 8.066439607995779e-07,
"loss": 0.8262,
"step": 4139
},
{
"epoch": 1.69,
"grad_norm": 2.4259706063634443,
"learning_rate": 8.062234728986579e-07,
"loss": 0.8316,
"step": 4140
},
{
"epoch": 1.69,
"grad_norm": 3.5823516477357664,
"learning_rate": 8.058030205930626e-07,
"loss": 0.8018,
"step": 4141
},
{
"epoch": 1.69,
"grad_norm": 3.374953743111472,
"learning_rate": 8.053826039600257e-07,
"loss": 0.8496,
"step": 4142
},
{
"epoch": 1.69,
"grad_norm": 3.4006269822031294,
"learning_rate": 8.049622230767754e-07,
"loss": 0.8019,
"step": 4143
},
{
"epoch": 1.69,
"grad_norm": 2.618616649585613,
"learning_rate": 8.045418780205314e-07,
"loss": 0.8019,
"step": 4144
},
{
"epoch": 1.69,
"grad_norm": 2.7784390548492928,
"learning_rate": 8.041215688685088e-07,
"loss": 0.8614,
"step": 4145
},
{
"epoch": 1.69,
"grad_norm": 3.1356220527887153,
"learning_rate": 8.037012956979152e-07,
"loss": 0.7735,
"step": 4146
},
{
"epoch": 1.69,
"grad_norm": 3.268578096708495,
"learning_rate": 8.032810585859518e-07,
"loss": 0.7305,
"step": 4147
},
{
"epoch": 1.69,
"grad_norm": 2.973074753553033,
"learning_rate": 8.028608576098127e-07,
"loss": 0.7861,
"step": 4148
},
{
"epoch": 1.69,
"grad_norm": 4.178913702069519,
"learning_rate": 8.024406928466858e-07,
"loss": 0.8358,
"step": 4149
},
{
"epoch": 1.69,
"grad_norm": 2.7737828957612005,
"learning_rate": 8.020205643737523e-07,
"loss": 0.8027,
"step": 4150
},
{
"epoch": 1.69,
"grad_norm": 4.166634599123504,
"learning_rate": 8.016004722681875e-07,
"loss": 0.8193,
"step": 4151
},
{
"epoch": 1.69,
"grad_norm": 2.97005416975797,
"learning_rate": 8.011804166071577e-07,
"loss": 0.8275,
"step": 4152
},
{
"epoch": 1.69,
"grad_norm": 2.885307724362976,
"learning_rate": 8.007603974678253e-07,
"loss": 0.8353,
"step": 4153
},
{
"epoch": 1.7,
"grad_norm": 2.8285510300364005,
"learning_rate": 8.003404149273443e-07,
"loss": 0.8056,
"step": 4154
},
{
"epoch": 1.7,
"grad_norm": 3.5746845462613623,
"learning_rate": 7.999204690628624e-07,
"loss": 0.8391,
"step": 4155
},
{
"epoch": 1.7,
"grad_norm": 3.12207949360473,
"learning_rate": 7.995005599515207e-07,
"loss": 0.7998,
"step": 4156
},
{
"epoch": 1.7,
"grad_norm": 2.6681576060186187,
"learning_rate": 7.99080687670453e-07,
"loss": 0.7859,
"step": 4157
},
{
"epoch": 1.7,
"grad_norm": 2.8934789463514097,
"learning_rate": 7.986608522967873e-07,
"loss": 0.771,
"step": 4158
},
{
"epoch": 1.7,
"grad_norm": 3.2122806439105602,
"learning_rate": 7.982410539076439e-07,
"loss": 0.8898,
"step": 4159
},
{
"epoch": 1.7,
"grad_norm": 3.162367987358733,
"learning_rate": 7.978212925801371e-07,
"loss": 0.8159,
"step": 4160
},
{
"epoch": 1.7,
"grad_norm": 2.796425723052705,
"learning_rate": 7.974015683913735e-07,
"loss": 0.8314,
"step": 4161
},
{
"epoch": 1.7,
"grad_norm": 2.977280614287867,
"learning_rate": 7.969818814184535e-07,
"loss": 0.887,
"step": 4162
},
{
"epoch": 1.7,
"grad_norm": 2.922819220433539,
"learning_rate": 7.965622317384705e-07,
"loss": 0.8182,
"step": 4163
},
{
"epoch": 1.7,
"grad_norm": 2.513124859266637,
"learning_rate": 7.961426194285116e-07,
"loss": 0.8449,
"step": 4164
},
{
"epoch": 1.7,
"grad_norm": 3.3885408367662464,
"learning_rate": 7.957230445656555e-07,
"loss": 0.8161,
"step": 4165
},
{
"epoch": 1.7,
"grad_norm": 2.8386833440769137,
"learning_rate": 7.953035072269757e-07,
"loss": 0.8612,
"step": 4166
},
{
"epoch": 1.7,
"grad_norm": 3.1170976753981083,
"learning_rate": 7.948840074895379e-07,
"loss": 0.7847,
"step": 4167
},
{
"epoch": 1.7,
"grad_norm": 2.9933812560442137,
"learning_rate": 7.944645454304016e-07,
"loss": 0.8218,
"step": 4168
},
{
"epoch": 1.7,
"grad_norm": 3.162531563192291,
"learning_rate": 7.940451211266181e-07,
"loss": 0.8268,
"step": 4169
},
{
"epoch": 1.7,
"grad_norm": 2.3227858786011213,
"learning_rate": 7.936257346552329e-07,
"loss": 0.848,
"step": 4170
},
{
"epoch": 1.7,
"grad_norm": 3.1315772316690555,
"learning_rate": 7.932063860932842e-07,
"loss": 0.8013,
"step": 4171
},
{
"epoch": 1.7,
"grad_norm": 2.760521196345073,
"learning_rate": 7.927870755178036e-07,
"loss": 0.7671,
"step": 4172
},
{
"epoch": 1.7,
"grad_norm": 3.47367398451893,
"learning_rate": 7.923678030058152e-07,
"loss": 0.7772,
"step": 4173
},
{
"epoch": 1.7,
"grad_norm": 3.313272470362454,
"learning_rate": 7.919485686343359e-07,
"loss": 0.7849,
"step": 4174
},
{
"epoch": 1.7,
"grad_norm": 2.9457310508992123,
"learning_rate": 7.915293724803761e-07,
"loss": 0.7799,
"step": 4175
},
{
"epoch": 1.7,
"grad_norm": 2.2618318856093076,
"learning_rate": 7.911102146209394e-07,
"loss": 0.8117,
"step": 4176
},
{
"epoch": 1.7,
"grad_norm": 2.391409202379487,
"learning_rate": 7.90691095133022e-07,
"loss": 0.8767,
"step": 4177
},
{
"epoch": 1.7,
"grad_norm": 3.2025566706467554,
"learning_rate": 7.902720140936128e-07,
"loss": 0.9103,
"step": 4178
},
{
"epoch": 1.71,
"grad_norm": 3.1176192612912392,
"learning_rate": 7.898529715796938e-07,
"loss": 0.8346,
"step": 4179
},
{
"epoch": 1.71,
"grad_norm": 3.1850167395855378,
"learning_rate": 7.894339676682404e-07,
"loss": 0.8082,
"step": 4180
},
{
"epoch": 1.71,
"grad_norm": 3.6220700298705655,
"learning_rate": 7.890150024362205e-07,
"loss": 0.8263,
"step": 4181
},
{
"epoch": 1.71,
"grad_norm": 2.9405400072922974,
"learning_rate": 7.885960759605946e-07,
"loss": 0.8346,
"step": 4182
},
{
"epoch": 1.71,
"grad_norm": 3.0051084315992855,
"learning_rate": 7.881771883183163e-07,
"loss": 0.8077,
"step": 4183
},
{
"epoch": 1.71,
"grad_norm": 2.5051463979869952,
"learning_rate": 7.877583395863329e-07,
"loss": 0.8822,
"step": 4184
},
{
"epoch": 1.71,
"grad_norm": 2.7743204640479213,
"learning_rate": 7.873395298415836e-07,
"loss": 0.8369,
"step": 4185
},
{
"epoch": 1.71,
"grad_norm": 2.4714573945516958,
"learning_rate": 7.869207591610003e-07,
"loss": 0.8218,
"step": 4186
},
{
"epoch": 1.71,
"grad_norm": 3.108169467469027,
"learning_rate": 7.865020276215082e-07,
"loss": 0.8747,
"step": 4187
},
{
"epoch": 1.71,
"grad_norm": 3.719992530803171,
"learning_rate": 7.860833353000252e-07,
"loss": 0.7759,
"step": 4188
},
{
"epoch": 1.71,
"grad_norm": 3.48948063788902,
"learning_rate": 7.856646822734624e-07,
"loss": 0.836,
"step": 4189
},
{
"epoch": 1.71,
"grad_norm": 2.9656199058478845,
"learning_rate": 7.852460686187232e-07,
"loss": 0.7719,
"step": 4190
},
{
"epoch": 1.71,
"grad_norm": 4.034818705502605,
"learning_rate": 7.848274944127033e-07,
"loss": 0.7812,
"step": 4191
},
{
"epoch": 1.71,
"grad_norm": 3.076425546274116,
"learning_rate": 7.844089597322922e-07,
"loss": 0.7691,
"step": 4192
},
{
"epoch": 1.71,
"grad_norm": 2.4621870499593657,
"learning_rate": 7.839904646543716e-07,
"loss": 0.8741,
"step": 4193
},
{
"epoch": 1.71,
"grad_norm": 3.2357335213717446,
"learning_rate": 7.835720092558159e-07,
"loss": 0.7893,
"step": 4194
},
{
"epoch": 1.71,
"grad_norm": 2.935116590419187,
"learning_rate": 7.831535936134922e-07,
"loss": 0.8264,
"step": 4195
},
{
"epoch": 1.71,
"grad_norm": 2.5956372458991344,
"learning_rate": 7.827352178042601e-07,
"loss": 0.8244,
"step": 4196
},
{
"epoch": 1.71,
"grad_norm": 2.6925887391265726,
"learning_rate": 7.823168819049729e-07,
"loss": 0.7869,
"step": 4197
},
{
"epoch": 1.71,
"grad_norm": 2.4003444257356152,
"learning_rate": 7.818985859924756e-07,
"loss": 0.7507,
"step": 4198
},
{
"epoch": 1.71,
"grad_norm": 2.7966700083453584,
"learning_rate": 7.814803301436057e-07,
"loss": 0.798,
"step": 4199
},
{
"epoch": 1.71,
"grad_norm": 3.120218957523779,
"learning_rate": 7.810621144351939e-07,
"loss": 0.8692,
"step": 4200
},
{
"epoch": 1.71,
"eval_loss": 0.8726236820220947,
"eval_runtime": 465.493,
"eval_samples_per_second": 74.871,
"eval_steps_per_second": 4.681,
"step": 4200
},
{
"epoch": 1.71,
"grad_norm": 3.1815260031568284,
"learning_rate": 7.806439389440632e-07,
"loss": 0.8564,
"step": 4201
},
{
"epoch": 1.71,
"grad_norm": 2.707453998402156,
"learning_rate": 7.802258037470299e-07,
"loss": 0.7961,
"step": 4202
},
{
"epoch": 1.72,
"grad_norm": 3.094792254380922,
"learning_rate": 7.798077089209022e-07,
"loss": 0.8093,
"step": 4203
},
{
"epoch": 1.72,
"grad_norm": 2.677786563276835,
"learning_rate": 7.793896545424806e-07,
"loss": 0.7233,
"step": 4204
},
{
"epoch": 1.72,
"grad_norm": 3.318073864900623,
"learning_rate": 7.78971640688559e-07,
"loss": 0.7642,
"step": 4205
},
{
"epoch": 1.72,
"grad_norm": 3.963169882645067,
"learning_rate": 7.785536674359234e-07,
"loss": 0.8415,
"step": 4206
},
{
"epoch": 1.72,
"grad_norm": 3.6326007389403134,
"learning_rate": 7.781357348613525e-07,
"loss": 0.7931,
"step": 4207
},
{
"epoch": 1.72,
"grad_norm": 2.8385215900161738,
"learning_rate": 7.777178430416173e-07,
"loss": 0.86,
"step": 4208
},
{
"epoch": 1.72,
"grad_norm": 2.9392891848993354,
"learning_rate": 7.772999920534812e-07,
"loss": 0.7945,
"step": 4209
},
{
"epoch": 1.72,
"grad_norm": 2.715958721333859,
"learning_rate": 7.768821819737008e-07,
"loss": 0.7818,
"step": 4210
},
{
"epoch": 1.72,
"grad_norm": 2.6093555346891506,
"learning_rate": 7.764644128790248e-07,
"loss": 0.8451,
"step": 4211
},
{
"epoch": 1.72,
"grad_norm": 5.241475253717735,
"learning_rate": 7.760466848461941e-07,
"loss": 0.876,
"step": 4212
},
{
"epoch": 1.72,
"grad_norm": 2.946522728672347,
"learning_rate": 7.756289979519422e-07,
"loss": 0.8466,
"step": 4213
},
{
"epoch": 1.72,
"grad_norm": 3.285874517662331,
"learning_rate": 7.752113522729948e-07,
"loss": 0.8791,
"step": 4214
},
{
"epoch": 1.72,
"grad_norm": 2.834286055477442,
"learning_rate": 7.747937478860711e-07,
"loss": 0.8023,
"step": 4215
},
{
"epoch": 1.72,
"grad_norm": 3.3424350670644727,
"learning_rate": 7.743761848678818e-07,
"loss": 0.7917,
"step": 4216
},
{
"epoch": 1.72,
"grad_norm": 4.094456175930852,
"learning_rate": 7.739586632951298e-07,
"loss": 0.8803,
"step": 4217
},
{
"epoch": 1.72,
"grad_norm": 4.596759874524738,
"learning_rate": 7.735411832445109e-07,
"loss": 0.7937,
"step": 4218
},
{
"epoch": 1.72,
"grad_norm": 3.1622978079683413,
"learning_rate": 7.73123744792713e-07,
"loss": 0.838,
"step": 4219
},
{
"epoch": 1.72,
"grad_norm": 2.5162492489419668,
"learning_rate": 7.72706348016417e-07,
"loss": 0.882,
"step": 4220
},
{
"epoch": 1.72,
"grad_norm": 2.7641346131734825,
"learning_rate": 7.722889929922948e-07,
"loss": 0.7794,
"step": 4221
},
{
"epoch": 1.72,
"grad_norm": 3.355717210019132,
"learning_rate": 7.718716797970119e-07,
"loss": 0.7615,
"step": 4222
},
{
"epoch": 1.72,
"grad_norm": 5.135749469540946,
"learning_rate": 7.714544085072256e-07,
"loss": 0.8391,
"step": 4223
},
{
"epoch": 1.72,
"grad_norm": 3.0828328823816955,
"learning_rate": 7.71037179199586e-07,
"loss": 0.8307,
"step": 4224
},
{
"epoch": 1.72,
"grad_norm": 3.3134360078293086,
"learning_rate": 7.706199919507344e-07,
"loss": 0.7364,
"step": 4225
},
{
"epoch": 1.72,
"grad_norm": 2.8188681665673316,
"learning_rate": 7.702028468373054e-07,
"loss": 0.9023,
"step": 4226
},
{
"epoch": 1.72,
"grad_norm": 3.177435700912355,
"learning_rate": 7.697857439359251e-07,
"loss": 0.8702,
"step": 4227
},
{
"epoch": 1.73,
"grad_norm": 2.510687277021847,
"learning_rate": 7.693686833232132e-07,
"loss": 0.7996,
"step": 4228
},
{
"epoch": 1.73,
"grad_norm": 4.542160020777928,
"learning_rate": 7.689516650757793e-07,
"loss": 0.7796,
"step": 4229
},
{
"epoch": 1.73,
"grad_norm": 2.554995004079241,
"learning_rate": 7.685346892702276e-07,
"loss": 0.8104,
"step": 4230
},
{
"epoch": 1.73,
"grad_norm": 3.031220763970587,
"learning_rate": 7.681177559831532e-07,
"loss": 0.8389,
"step": 4231
},
{
"epoch": 1.73,
"grad_norm": 3.691418612244424,
"learning_rate": 7.677008652911437e-07,
"loss": 0.8363,
"step": 4232
},
{
"epoch": 1.73,
"grad_norm": 3.125926453967599,
"learning_rate": 7.67284017270779e-07,
"loss": 0.85,
"step": 4233
},
{
"epoch": 1.73,
"grad_norm": 3.4552080540405563,
"learning_rate": 7.668672119986305e-07,
"loss": 0.8407,
"step": 4234
},
{
"epoch": 1.73,
"grad_norm": 2.9051038418139377,
"learning_rate": 7.664504495512626e-07,
"loss": 0.7697,
"step": 4235
},
{
"epoch": 1.73,
"grad_norm": 3.064671419958182,
"learning_rate": 7.660337300052316e-07,
"loss": 0.8474,
"step": 4236
},
{
"epoch": 1.73,
"grad_norm": 3.618631457242714,
"learning_rate": 7.65617053437086e-07,
"loss": 0.7855,
"step": 4237
},
{
"epoch": 1.73,
"grad_norm": 2.7887190736428016,
"learning_rate": 7.652004199233657e-07,
"loss": 0.8324,
"step": 4238
},
{
"epoch": 1.73,
"grad_norm": 2.4359184009151496,
"learning_rate": 7.647838295406036e-07,
"loss": 0.8233,
"step": 4239
},
{
"epoch": 1.73,
"grad_norm": 2.7468670971202362,
"learning_rate": 7.643672823653239e-07,
"loss": 0.8889,
"step": 4240
},
{
"epoch": 1.73,
"grad_norm": 3.791594438515525,
"learning_rate": 7.639507784740441e-07,
"loss": 0.7621,
"step": 4241
},
{
"epoch": 1.73,
"grad_norm": 2.8436862856501905,
"learning_rate": 7.635343179432717e-07,
"loss": 0.829,
"step": 4242
},
{
"epoch": 1.73,
"grad_norm": 3.412679703118202,
"learning_rate": 7.631179008495084e-07,
"loss": 0.7771,
"step": 4243
},
{
"epoch": 1.73,
"grad_norm": 3.8587511517622777,
"learning_rate": 7.627015272692465e-07,
"loss": 0.8079,
"step": 4244
},
{
"epoch": 1.73,
"grad_norm": 2.6692358712815065,
"learning_rate": 7.62285197278971e-07,
"loss": 0.8781,
"step": 4245
},
{
"epoch": 1.73,
"grad_norm": 3.011608682494722,
"learning_rate": 7.618689109551587e-07,
"loss": 0.8773,
"step": 4246
},
{
"epoch": 1.73,
"grad_norm": 3.5380423279472017,
"learning_rate": 7.61452668374278e-07,
"loss": 0.8332,
"step": 4247
},
{
"epoch": 1.73,
"grad_norm": 2.5997886207978587,
"learning_rate": 7.610364696127896e-07,
"loss": 0.8047,
"step": 4248
},
{
"epoch": 1.73,
"grad_norm": 2.5027321426870177,
"learning_rate": 7.606203147471467e-07,
"loss": 0.8264,
"step": 4249
},
{
"epoch": 1.73,
"grad_norm": 3.647126634132705,
"learning_rate": 7.602042038537937e-07,
"loss": 0.8861,
"step": 4250
},
{
"epoch": 1.73,
"grad_norm": 4.566521969899312,
"learning_rate": 7.597881370091667e-07,
"loss": 0.8227,
"step": 4251
},
{
"epoch": 1.74,
"grad_norm": 3.063344044728664,
"learning_rate": 7.593721142896943e-07,
"loss": 0.8021,
"step": 4252
},
{
"epoch": 1.74,
"grad_norm": 3.797337644425932,
"learning_rate": 7.589561357717968e-07,
"loss": 0.8119,
"step": 4253
},
{
"epoch": 1.74,
"grad_norm": 3.6206130754309847,
"learning_rate": 7.585402015318871e-07,
"loss": 0.7666,
"step": 4254
},
{
"epoch": 1.74,
"grad_norm": 3.0775064841234725,
"learning_rate": 7.581243116463682e-07,
"loss": 0.8107,
"step": 4255
},
{
"epoch": 1.74,
"grad_norm": 2.880550747444561,
"learning_rate": 7.577084661916365e-07,
"loss": 0.829,
"step": 4256
},
{
"epoch": 1.74,
"grad_norm": 3.2659626240587984,
"learning_rate": 7.572926652440799e-07,
"loss": 0.7494,
"step": 4257
},
{
"epoch": 1.74,
"grad_norm": 2.624088034977849,
"learning_rate": 7.568769088800779e-07,
"loss": 0.8611,
"step": 4258
},
{
"epoch": 1.74,
"grad_norm": 3.1638390581305504,
"learning_rate": 7.564611971760016e-07,
"loss": 0.7822,
"step": 4259
},
{
"epoch": 1.74,
"grad_norm": 3.0106121211439403,
"learning_rate": 7.560455302082145e-07,
"loss": 0.8583,
"step": 4260
},
{
"epoch": 1.74,
"grad_norm": 2.911526690525421,
"learning_rate": 7.556299080530713e-07,
"loss": 0.8672,
"step": 4261
},
{
"epoch": 1.74,
"grad_norm": 3.2780165467548326,
"learning_rate": 7.55214330786919e-07,
"loss": 0.8412,
"step": 4262
},
{
"epoch": 1.74,
"grad_norm": 2.571544736588269,
"learning_rate": 7.547987984860962e-07,
"loss": 0.8084,
"step": 4263
},
{
"epoch": 1.74,
"grad_norm": 2.7372342526200404,
"learning_rate": 7.543833112269328e-07,
"loss": 0.8261,
"step": 4264
},
{
"epoch": 1.74,
"grad_norm": 3.15936850287932,
"learning_rate": 7.539678690857508e-07,
"loss": 0.7985,
"step": 4265
},
{
"epoch": 1.74,
"grad_norm": 2.5081481640336434,
"learning_rate": 7.535524721388638e-07,
"loss": 0.8699,
"step": 4266
},
{
"epoch": 1.74,
"grad_norm": 3.462737964906543,
"learning_rate": 7.531371204625779e-07,
"loss": 0.8478,
"step": 4267
},
{
"epoch": 1.74,
"grad_norm": 2.968582907475268,
"learning_rate": 7.52721814133189e-07,
"loss": 0.7744,
"step": 4268
},
{
"epoch": 1.74,
"grad_norm": 2.7872089181840916,
"learning_rate": 7.523065532269865e-07,
"loss": 0.8266,
"step": 4269
},
{
"epoch": 1.74,
"grad_norm": 2.5345678376986833,
"learning_rate": 7.518913378202508e-07,
"loss": 0.8251,
"step": 4270
},
{
"epoch": 1.74,
"grad_norm": 2.5402903312644733,
"learning_rate": 7.514761679892541e-07,
"loss": 0.8127,
"step": 4271
},
{
"epoch": 1.74,
"grad_norm": 3.2633290879649106,
"learning_rate": 7.510610438102594e-07,
"loss": 0.8054,
"step": 4272
},
{
"epoch": 1.74,
"grad_norm": 4.656540229098241,
"learning_rate": 7.506459653595222e-07,
"loss": 0.7797,
"step": 4273
},
{
"epoch": 1.74,
"grad_norm": 3.3332036339144184,
"learning_rate": 7.502309327132894e-07,
"loss": 0.8114,
"step": 4274
},
{
"epoch": 1.74,
"grad_norm": 2.9778633728885446,
"learning_rate": 7.498159459477996e-07,
"loss": 0.8242,
"step": 4275
},
{
"epoch": 1.74,
"grad_norm": 2.7104763069566604,
"learning_rate": 7.49401005139283e-07,
"loss": 0.8576,
"step": 4276
},
{
"epoch": 1.75,
"grad_norm": 2.994616822832894,
"learning_rate": 7.489861103639608e-07,
"loss": 0.8097,
"step": 4277
},
{
"epoch": 1.75,
"grad_norm": 2.571692518284457,
"learning_rate": 7.485712616980462e-07,
"loss": 0.8255,
"step": 4278
},
{
"epoch": 1.75,
"grad_norm": 2.720479174788241,
"learning_rate": 7.481564592177436e-07,
"loss": 0.8456,
"step": 4279
},
{
"epoch": 1.75,
"grad_norm": 3.197970106740343,
"learning_rate": 7.477417029992501e-07,
"loss": 0.7644,
"step": 4280
},
{
"epoch": 1.75,
"grad_norm": 2.988842487607233,
"learning_rate": 7.473269931187522e-07,
"loss": 0.8569,
"step": 4281
},
{
"epoch": 1.75,
"grad_norm": 2.669181394649249,
"learning_rate": 7.469123296524298e-07,
"loss": 0.8213,
"step": 4282
},
{
"epoch": 1.75,
"grad_norm": 4.3176355249673115,
"learning_rate": 7.464977126764531e-07,
"loss": 0.7629,
"step": 4283
},
{
"epoch": 1.75,
"grad_norm": 2.997891568294716,
"learning_rate": 7.460831422669849e-07,
"loss": 0.778,
"step": 4284
},
{
"epoch": 1.75,
"grad_norm": 2.7990955268746998,
"learning_rate": 7.456686185001779e-07,
"loss": 0.8633,
"step": 4285
},
{
"epoch": 1.75,
"grad_norm": 3.8070003806216186,
"learning_rate": 7.452541414521772e-07,
"loss": 0.8431,
"step": 4286
},
{
"epoch": 1.75,
"grad_norm": 2.558101573797906,
"learning_rate": 7.448397111991196e-07,
"loss": 0.8315,
"step": 4287
},
{
"epoch": 1.75,
"grad_norm": 3.0577472156670504,
"learning_rate": 7.44425327817133e-07,
"loss": 0.8254,
"step": 4288
},
{
"epoch": 1.75,
"grad_norm": 3.67549483143448,
"learning_rate": 7.44010991382336e-07,
"loss": 0.8026,
"step": 4289
},
{
"epoch": 1.75,
"grad_norm": 3.0072164337345586,
"learning_rate": 7.435967019708396e-07,
"loss": 0.8684,
"step": 4290
},
{
"epoch": 1.75,
"grad_norm": 3.268432977696694,
"learning_rate": 7.431824596587454e-07,
"loss": 0.8139,
"step": 4291
},
{
"epoch": 1.75,
"grad_norm": 2.444564763500703,
"learning_rate": 7.427682645221467e-07,
"loss": 0.8057,
"step": 4292
},
{
"epoch": 1.75,
"grad_norm": 2.840186099621506,
"learning_rate": 7.423541166371287e-07,
"loss": 0.8165,
"step": 4293
},
{
"epoch": 1.75,
"grad_norm": 3.569390176886716,
"learning_rate": 7.419400160797664e-07,
"loss": 0.8384,
"step": 4294
},
{
"epoch": 1.75,
"grad_norm": 3.230725034082421,
"learning_rate": 7.415259629261278e-07,
"loss": 0.8522,
"step": 4295
},
{
"epoch": 1.75,
"grad_norm": 2.370785726093981,
"learning_rate": 7.41111957252271e-07,
"loss": 0.8262,
"step": 4296
},
{
"epoch": 1.75,
"grad_norm": 3.9885251897467917,
"learning_rate": 7.406979991342461e-07,
"loss": 0.7488,
"step": 4297
},
{
"epoch": 1.75,
"grad_norm": 3.4057642355725424,
"learning_rate": 7.402840886480939e-07,
"loss": 0.82,
"step": 4298
},
{
"epoch": 1.75,
"grad_norm": 2.4259449483390005,
"learning_rate": 7.398702258698466e-07,
"loss": 0.8515,
"step": 4299
},
{
"epoch": 1.75,
"grad_norm": 3.881631540716603,
"learning_rate": 7.39456410875528e-07,
"loss": 0.7715,
"step": 4300
},
{
"epoch": 1.75,
"eval_loss": 0.8724686503410339,
"eval_runtime": 466.7541,
"eval_samples_per_second": 74.669,
"eval_steps_per_second": 4.668,
"step": 4300
},
{
"epoch": 1.76,
"grad_norm": 2.853957529883649,
"learning_rate": 7.390426437411532e-07,
"loss": 0.7642,
"step": 4301
},
{
"epoch": 1.76,
"grad_norm": 3.286362963311685,
"learning_rate": 7.386289245427275e-07,
"loss": 0.8124,
"step": 4302
},
{
"epoch": 1.76,
"grad_norm": 2.909941205446139,
"learning_rate": 7.382152533562484e-07,
"loss": 0.8122,
"step": 4303
},
{
"epoch": 1.76,
"grad_norm": 3.172637688209738,
"learning_rate": 7.378016302577043e-07,
"loss": 0.8416,
"step": 4304
},
{
"epoch": 1.76,
"grad_norm": 3.558508651044969,
"learning_rate": 7.373880553230745e-07,
"loss": 0.7969,
"step": 4305
},
{
"epoch": 1.76,
"grad_norm": 2.9175735728852246,
"learning_rate": 7.369745286283303e-07,
"loss": 0.8755,
"step": 4306
},
{
"epoch": 1.76,
"grad_norm": 4.834645505856762,
"learning_rate": 7.365610502494326e-07,
"loss": 0.8629,
"step": 4307
},
{
"epoch": 1.76,
"grad_norm": 2.960982134337366,
"learning_rate": 7.361476202623349e-07,
"loss": 0.8266,
"step": 4308
},
{
"epoch": 1.76,
"grad_norm": 3.4106551112827956,
"learning_rate": 7.357342387429812e-07,
"loss": 0.8695,
"step": 4309
},
{
"epoch": 1.76,
"grad_norm": 2.759539890722289,
"learning_rate": 7.353209057673069e-07,
"loss": 0.8148,
"step": 4310
},
{
"epoch": 1.76,
"grad_norm": 4.987805901878724,
"learning_rate": 7.349076214112377e-07,
"loss": 0.8034,
"step": 4311
},
{
"epoch": 1.76,
"grad_norm": 3.3691881276798656,
"learning_rate": 7.344943857506909e-07,
"loss": 0.7559,
"step": 4312
},
{
"epoch": 1.76,
"grad_norm": 3.120599761555665,
"learning_rate": 7.340811988615753e-07,
"loss": 0.8493,
"step": 4313
},
{
"epoch": 1.76,
"grad_norm": 2.6071818613836255,
"learning_rate": 7.336680608197903e-07,
"loss": 0.8024,
"step": 4314
},
{
"epoch": 1.76,
"grad_norm": 3.253916426866886,
"learning_rate": 7.332549717012259e-07,
"loss": 0.8116,
"step": 4315
},
{
"epoch": 1.76,
"grad_norm": 3.506722280768608,
"learning_rate": 7.328419315817638e-07,
"loss": 0.8615,
"step": 4316
},
{
"epoch": 1.76,
"grad_norm": 3.0458066659440517,
"learning_rate": 7.324289405372763e-07,
"loss": 0.8226,
"step": 4317
},
{
"epoch": 1.76,
"grad_norm": 2.631824907878585,
"learning_rate": 7.320159986436267e-07,
"loss": 0.8183,
"step": 4318
},
{
"epoch": 1.76,
"grad_norm": 2.5763770460658164,
"learning_rate": 7.316031059766704e-07,
"loss": 0.85,
"step": 4319
},
{
"epoch": 1.76,
"grad_norm": 4.125126972394935,
"learning_rate": 7.311902626122512e-07,
"loss": 0.7866,
"step": 4320
},
{
"epoch": 1.76,
"grad_norm": 2.991145134776905,
"learning_rate": 7.307774686262063e-07,
"loss": 0.8092,
"step": 4321
},
{
"epoch": 1.76,
"grad_norm": 2.6552233257764657,
"learning_rate": 7.303647240943629e-07,
"loss": 0.7999,
"step": 4322
},
{
"epoch": 1.76,
"grad_norm": 5.860798707808013,
"learning_rate": 7.299520290925391e-07,
"loss": 0.7997,
"step": 4323
},
{
"epoch": 1.76,
"grad_norm": 2.870746388442075,
"learning_rate": 7.295393836965438e-07,
"loss": 0.8068,
"step": 4324
},
{
"epoch": 1.76,
"grad_norm": 2.997810976588839,
"learning_rate": 7.291267879821765e-07,
"loss": 0.9186,
"step": 4325
},
{
"epoch": 1.77,
"grad_norm": 2.5960350934276155,
"learning_rate": 7.287142420252289e-07,
"loss": 0.7872,
"step": 4326
},
{
"epoch": 1.77,
"grad_norm": 3.5654576212286515,
"learning_rate": 7.283017459014824e-07,
"loss": 0.8093,
"step": 4327
},
{
"epoch": 1.77,
"grad_norm": 4.352578967558027,
"learning_rate": 7.278892996867091e-07,
"loss": 0.8315,
"step": 4328
},
{
"epoch": 1.77,
"grad_norm": 2.8804994468015317,
"learning_rate": 7.274769034566726e-07,
"loss": 0.8334,
"step": 4329
},
{
"epoch": 1.77,
"grad_norm": 2.731229655824611,
"learning_rate": 7.27064557287127e-07,
"loss": 0.8309,
"step": 4330
},
{
"epoch": 1.77,
"grad_norm": 2.775589501029162,
"learning_rate": 7.266522612538177e-07,
"loss": 0.766,
"step": 4331
},
{
"epoch": 1.77,
"grad_norm": 3.3645035167633393,
"learning_rate": 7.262400154324796e-07,
"loss": 0.7959,
"step": 4332
},
{
"epoch": 1.77,
"grad_norm": 3.5611671320956173,
"learning_rate": 7.258278198988397e-07,
"loss": 0.8616,
"step": 4333
},
{
"epoch": 1.77,
"grad_norm": 2.7994630445759845,
"learning_rate": 7.254156747286153e-07,
"loss": 0.7638,
"step": 4334
},
{
"epoch": 1.77,
"grad_norm": 3.2316673700496135,
"learning_rate": 7.250035799975145e-07,
"loss": 0.7815,
"step": 4335
},
{
"epoch": 1.77,
"grad_norm": 3.723513724118256,
"learning_rate": 7.245915357812362e-07,
"loss": 0.8509,
"step": 4336
},
{
"epoch": 1.77,
"grad_norm": 3.279276111737335,
"learning_rate": 7.241795421554695e-07,
"loss": 0.7447,
"step": 4337
},
{
"epoch": 1.77,
"grad_norm": 3.5858089072026553,
"learning_rate": 7.237675991958944e-07,
"loss": 0.7544,
"step": 4338
},
{
"epoch": 1.77,
"grad_norm": 3.287248458825873,
"learning_rate": 7.233557069781826e-07,
"loss": 0.7866,
"step": 4339
},
{
"epoch": 1.77,
"grad_norm": 2.6611824295352053,
"learning_rate": 7.229438655779956e-07,
"loss": 0.8038,
"step": 4340
},
{
"epoch": 1.77,
"grad_norm": 3.121919221018261,
"learning_rate": 7.225320750709849e-07,
"loss": 0.7714,
"step": 4341
},
{
"epoch": 1.77,
"grad_norm": 3.350393605852748,
"learning_rate": 7.221203355327938e-07,
"loss": 0.8382,
"step": 4342
},
{
"epoch": 1.77,
"grad_norm": 2.5349053234965164,
"learning_rate": 7.217086470390559e-07,
"loss": 0.8165,
"step": 4343
},
{
"epoch": 1.77,
"grad_norm": 2.859867571492041,
"learning_rate": 7.212970096653954e-07,
"loss": 0.8459,
"step": 4344
},
{
"epoch": 1.77,
"grad_norm": 2.5539996110686434,
"learning_rate": 7.208854234874266e-07,
"loss": 0.8597,
"step": 4345
},
{
"epoch": 1.77,
"grad_norm": 3.5967457120370265,
"learning_rate": 7.204738885807553e-07,
"loss": 0.7944,
"step": 4346
},
{
"epoch": 1.77,
"grad_norm": 3.2519582547326142,
"learning_rate": 7.200624050209774e-07,
"loss": 0.8078,
"step": 4347
},
{
"epoch": 1.77,
"grad_norm": 3.279537874837454,
"learning_rate": 7.196509728836793e-07,
"loss": 0.7742,
"step": 4348
},
{
"epoch": 1.77,
"grad_norm": 2.989972592083028,
"learning_rate": 7.192395922444383e-07,
"loss": 0.8273,
"step": 4349
},
{
"epoch": 1.78,
"grad_norm": 2.438920609989755,
"learning_rate": 7.188282631788216e-07,
"loss": 0.7869,
"step": 4350
},
{
"epoch": 1.78,
"grad_norm": 3.1479924259465446,
"learning_rate": 7.184169857623874e-07,
"loss": 0.7783,
"step": 4351
},
{
"epoch": 1.78,
"grad_norm": 3.450759508538549,
"learning_rate": 7.180057600706847e-07,
"loss": 0.8115,
"step": 4352
},
{
"epoch": 1.78,
"grad_norm": 3.6786825232951794,
"learning_rate": 7.175945861792525e-07,
"loss": 0.8653,
"step": 4353
},
{
"epoch": 1.78,
"grad_norm": 2.6901878514800517,
"learning_rate": 7.171834641636203e-07,
"loss": 0.8566,
"step": 4354
},
{
"epoch": 1.78,
"grad_norm": 3.025398384681881,
"learning_rate": 7.16772394099308e-07,
"loss": 0.865,
"step": 4355
},
{
"epoch": 1.78,
"grad_norm": 2.621505565640674,
"learning_rate": 7.163613760618265e-07,
"loss": 0.8177,
"step": 4356
},
{
"epoch": 1.78,
"grad_norm": 3.2623768492318743,
"learning_rate": 7.159504101266769e-07,
"loss": 0.8768,
"step": 4357
},
{
"epoch": 1.78,
"grad_norm": 4.285277127532714,
"learning_rate": 7.155394963693502e-07,
"loss": 0.8643,
"step": 4358
},
{
"epoch": 1.78,
"grad_norm": 3.0275609915738086,
"learning_rate": 7.151286348653283e-07,
"loss": 0.831,
"step": 4359
},
{
"epoch": 1.78,
"grad_norm": 2.683317675962261,
"learning_rate": 7.147178256900835e-07,
"loss": 0.7705,
"step": 4360
},
{
"epoch": 1.78,
"grad_norm": 3.5520558109744376,
"learning_rate": 7.14307068919079e-07,
"loss": 0.768,
"step": 4361
},
{
"epoch": 1.78,
"grad_norm": 2.334346768054463,
"learning_rate": 7.138963646277669e-07,
"loss": 0.8122,
"step": 4362
},
{
"epoch": 1.78,
"grad_norm": 2.798869260347941,
"learning_rate": 7.134857128915911e-07,
"loss": 0.8741,
"step": 4363
},
{
"epoch": 1.78,
"grad_norm": 3.3807825085765777,
"learning_rate": 7.13075113785985e-07,
"loss": 0.8134,
"step": 4364
},
{
"epoch": 1.78,
"grad_norm": 2.4670915530796136,
"learning_rate": 7.126645673863729e-07,
"loss": 0.7656,
"step": 4365
},
{
"epoch": 1.78,
"grad_norm": 2.442495744968065,
"learning_rate": 7.122540737681693e-07,
"loss": 0.8077,
"step": 4366
},
{
"epoch": 1.78,
"grad_norm": 3.293676815621374,
"learning_rate": 7.118436330067785e-07,
"loss": 0.8138,
"step": 4367
},
{
"epoch": 1.78,
"grad_norm": 2.788638094039444,
"learning_rate": 7.114332451775955e-07,
"loss": 0.8277,
"step": 4368
},
{
"epoch": 1.78,
"grad_norm": 2.4367989975599516,
"learning_rate": 7.110229103560057e-07,
"loss": 0.7677,
"step": 4369
},
{
"epoch": 1.78,
"grad_norm": 2.393742212462862,
"learning_rate": 7.106126286173846e-07,
"loss": 0.9183,
"step": 4370
},
{
"epoch": 1.78,
"grad_norm": 2.6896979387275057,
"learning_rate": 7.102024000370977e-07,
"loss": 0.7884,
"step": 4371
},
{
"epoch": 1.78,
"grad_norm": 2.934565065519542,
"learning_rate": 7.097922246905011e-07,
"loss": 0.7548,
"step": 4372
},
{
"epoch": 1.78,
"grad_norm": 2.8979686118991634,
"learning_rate": 7.09382102652941e-07,
"loss": 0.8276,
"step": 4373
},
{
"epoch": 1.78,
"grad_norm": 3.161369823962194,
"learning_rate": 7.08972033999754e-07,
"loss": 0.7853,
"step": 4374
},
{
"epoch": 1.79,
"grad_norm": 2.583808311425139,
"learning_rate": 7.085620188062665e-07,
"loss": 0.8011,
"step": 4375
},
{
"epoch": 1.79,
"grad_norm": 2.589759096191495,
"learning_rate": 7.081520571477953e-07,
"loss": 0.8105,
"step": 4376
},
{
"epoch": 1.79,
"grad_norm": 3.2742999275332094,
"learning_rate": 7.07742149099647e-07,
"loss": 0.7773,
"step": 4377
},
{
"epoch": 1.79,
"grad_norm": 3.1393123681091093,
"learning_rate": 7.073322947371194e-07,
"loss": 0.8493,
"step": 4378
},
{
"epoch": 1.79,
"grad_norm": 3.352232740155522,
"learning_rate": 7.069224941354996e-07,
"loss": 0.835,
"step": 4379
},
{
"epoch": 1.79,
"grad_norm": 3.420191614530228,
"learning_rate": 7.065127473700644e-07,
"loss": 0.7798,
"step": 4380
},
{
"epoch": 1.79,
"grad_norm": 2.5719746330832685,
"learning_rate": 7.061030545160818e-07,
"loss": 0.7663,
"step": 4381
},
{
"epoch": 1.79,
"grad_norm": 3.192326394553939,
"learning_rate": 7.05693415648809e-07,
"loss": 0.791,
"step": 4382
},
{
"epoch": 1.79,
"grad_norm": 2.660773623512925,
"learning_rate": 7.052838308434941e-07,
"loss": 0.8245,
"step": 4383
},
{
"epoch": 1.79,
"grad_norm": 3.032766711429511,
"learning_rate": 7.048743001753744e-07,
"loss": 0.7825,
"step": 4384
},
{
"epoch": 1.79,
"grad_norm": 2.9385937460523097,
"learning_rate": 7.044648237196776e-07,
"loss": 0.8122,
"step": 4385
},
{
"epoch": 1.79,
"grad_norm": 3.8032904844512183,
"learning_rate": 7.04055401551622e-07,
"loss": 0.7992,
"step": 4386
},
{
"epoch": 1.79,
"grad_norm": 3.140899991500796,
"learning_rate": 7.036460337464154e-07,
"loss": 0.7817,
"step": 4387
},
{
"epoch": 1.79,
"grad_norm": 3.117316620188791,
"learning_rate": 7.032367203792552e-07,
"loss": 0.8291,
"step": 4388
},
{
"epoch": 1.79,
"grad_norm": 3.1064332805832633,
"learning_rate": 7.028274615253296e-07,
"loss": 0.8298,
"step": 4389
},
{
"epoch": 1.79,
"grad_norm": 3.0278257092747722,
"learning_rate": 7.024182572598161e-07,
"loss": 0.7315,
"step": 4390
},
{
"epoch": 1.79,
"grad_norm": 5.157395561234281,
"learning_rate": 7.020091076578833e-07,
"loss": 0.8563,
"step": 4391
},
{
"epoch": 1.79,
"grad_norm": 2.5913169907556304,
"learning_rate": 7.016000127946879e-07,
"loss": 0.8136,
"step": 4392
},
{
"epoch": 1.79,
"grad_norm": 2.717817296582211,
"learning_rate": 7.011909727453784e-07,
"loss": 0.8203,
"step": 4393
},
{
"epoch": 1.79,
"grad_norm": 2.908353582918542,
"learning_rate": 7.007819875850923e-07,
"loss": 0.8272,
"step": 4394
},
{
"epoch": 1.79,
"grad_norm": 3.5503418407500953,
"learning_rate": 7.003730573889567e-07,
"loss": 0.7654,
"step": 4395
},
{
"epoch": 1.79,
"grad_norm": 3.5983670439677287,
"learning_rate": 6.999641822320902e-07,
"loss": 0.8518,
"step": 4396
},
{
"epoch": 1.79,
"grad_norm": 2.8785021260005204,
"learning_rate": 6.995553621895988e-07,
"loss": 0.7614,
"step": 4397
},
{
"epoch": 1.79,
"grad_norm": 2.8150946444411864,
"learning_rate": 6.991465973365806e-07,
"loss": 0.7976,
"step": 4398
},
{
"epoch": 1.8,
"grad_norm": 2.541183862072172,
"learning_rate": 6.987378877481225e-07,
"loss": 0.7802,
"step": 4399
},
{
"epoch": 1.8,
"grad_norm": 3.104049838560228,
"learning_rate": 6.983292334993014e-07,
"loss": 0.8804,
"step": 4400
},
{
"epoch": 1.8,
"eval_loss": 0.8719713091850281,
"eval_runtime": 466.7589,
"eval_samples_per_second": 74.668,
"eval_steps_per_second": 4.668,
"step": 4400
},
{
"epoch": 1.8,
"grad_norm": 2.694355252998166,
"learning_rate": 6.979206346651841e-07,
"loss": 0.8324,
"step": 4401
},
{
"epoch": 1.8,
"grad_norm": 5.16187212738353,
"learning_rate": 6.975120913208272e-07,
"loss": 0.8417,
"step": 4402
},
{
"epoch": 1.8,
"grad_norm": 3.7670657601724447,
"learning_rate": 6.971036035412769e-07,
"loss": 0.803,
"step": 4403
},
{
"epoch": 1.8,
"grad_norm": 3.3397946552828075,
"learning_rate": 6.966951714015703e-07,
"loss": 0.87,
"step": 4404
},
{
"epoch": 1.8,
"grad_norm": 4.357254087591364,
"learning_rate": 6.96286794976732e-07,
"loss": 0.7749,
"step": 4405
},
{
"epoch": 1.8,
"grad_norm": 3.379098054957886,
"learning_rate": 6.958784743417787e-07,
"loss": 0.8191,
"step": 4406
},
{
"epoch": 1.8,
"grad_norm": 3.5658860435862945,
"learning_rate": 6.954702095717156e-07,
"loss": 0.7749,
"step": 4407
},
{
"epoch": 1.8,
"grad_norm": 4.93353488243568,
"learning_rate": 6.950620007415378e-07,
"loss": 0.7816,
"step": 4408
},
{
"epoch": 1.8,
"grad_norm": 2.559848805374218,
"learning_rate": 6.946538479262311e-07,
"loss": 0.7916,
"step": 4409
},
{
"epoch": 1.8,
"grad_norm": 3.1367081697455896,
"learning_rate": 6.942457512007689e-07,
"loss": 0.8528,
"step": 4410
},
{
"epoch": 1.8,
"grad_norm": 4.215025896552536,
"learning_rate": 6.938377106401164e-07,
"loss": 0.7934,
"step": 4411
},
{
"epoch": 1.8,
"grad_norm": 4.860984192627439,
"learning_rate": 6.934297263192275e-07,
"loss": 0.769,
"step": 4412
},
{
"epoch": 1.8,
"grad_norm": 2.855803308147978,
"learning_rate": 6.930217983130462e-07,
"loss": 0.8424,
"step": 4413
},
{
"epoch": 1.8,
"grad_norm": 2.6742042155766246,
"learning_rate": 6.926139266965052e-07,
"loss": 0.8408,
"step": 4414
},
{
"epoch": 1.8,
"grad_norm": 3.503855863832134,
"learning_rate": 6.922061115445279e-07,
"loss": 0.8041,
"step": 4415
},
{
"epoch": 1.8,
"grad_norm": 2.512949455575786,
"learning_rate": 6.917983529320267e-07,
"loss": 0.7983,
"step": 4416
},
{
"epoch": 1.8,
"grad_norm": 3.370882597940238,
"learning_rate": 6.913906509339047e-07,
"loss": 0.8171,
"step": 4417
},
{
"epoch": 1.8,
"grad_norm": 3.4526307073023563,
"learning_rate": 6.909830056250526e-07,
"loss": 0.8288,
"step": 4418
},
{
"epoch": 1.8,
"grad_norm": 2.967609577685023,
"learning_rate": 6.905754170803526e-07,
"loss": 0.8031,
"step": 4419
},
{
"epoch": 1.8,
"grad_norm": 2.8347946290909136,
"learning_rate": 6.901678853746755e-07,
"loss": 0.8325,
"step": 4420
},
{
"epoch": 1.8,
"grad_norm": 2.7483740392655513,
"learning_rate": 6.897604105828817e-07,
"loss": 0.8615,
"step": 4421
},
{
"epoch": 1.8,
"grad_norm": 2.558447974244027,
"learning_rate": 6.89352992779822e-07,
"loss": 0.8268,
"step": 4422
},
{
"epoch": 1.8,
"grad_norm": 2.7587943876845404,
"learning_rate": 6.889456320403353e-07,
"loss": 0.8114,
"step": 4423
},
{
"epoch": 1.81,
"grad_norm": 4.550597887509363,
"learning_rate": 6.885383284392509e-07,
"loss": 0.7833,
"step": 4424
},
{
"epoch": 1.81,
"grad_norm": 2.854885781955078,
"learning_rate": 6.881310820513879e-07,
"loss": 0.7623,
"step": 4425
},
{
"epoch": 1.81,
"grad_norm": 3.5657248377348334,
"learning_rate": 6.877238929515543e-07,
"loss": 0.8586,
"step": 4426
},
{
"epoch": 1.81,
"grad_norm": 2.77874373823525,
"learning_rate": 6.873167612145476e-07,
"loss": 0.8518,
"step": 4427
},
{
"epoch": 1.81,
"grad_norm": 3.0540848036879114,
"learning_rate": 6.869096869151549e-07,
"loss": 0.8703,
"step": 4428
},
{
"epoch": 1.81,
"grad_norm": 3.533907396281667,
"learning_rate": 6.865026701281524e-07,
"loss": 0.8789,
"step": 4429
},
{
"epoch": 1.81,
"grad_norm": 2.6237826303222733,
"learning_rate": 6.860957109283074e-07,
"loss": 0.7791,
"step": 4430
},
{
"epoch": 1.81,
"grad_norm": 2.8233188633083603,
"learning_rate": 6.856888093903737e-07,
"loss": 0.797,
"step": 4431
},
{
"epoch": 1.81,
"grad_norm": 2.791829587341658,
"learning_rate": 6.852819655890972e-07,
"loss": 0.9354,
"step": 4432
},
{
"epoch": 1.81,
"grad_norm": 3.5184287914990526,
"learning_rate": 6.848751795992116e-07,
"loss": 0.8082,
"step": 4433
},
{
"epoch": 1.81,
"grad_norm": 3.5635851411440194,
"learning_rate": 6.844684514954409e-07,
"loss": 0.8404,
"step": 4434
},
{
"epoch": 1.81,
"grad_norm": 3.83001215048837,
"learning_rate": 6.840617813524977e-07,
"loss": 0.8061,
"step": 4435
},
{
"epoch": 1.81,
"grad_norm": 2.5102799129223117,
"learning_rate": 6.836551692450842e-07,
"loss": 0.85,
"step": 4436
},
{
"epoch": 1.81,
"grad_norm": 2.82813655007396,
"learning_rate": 6.832486152478926e-07,
"loss": 0.8128,
"step": 4437
},
{
"epoch": 1.81,
"grad_norm": 3.0762462683862206,
"learning_rate": 6.828421194356036e-07,
"loss": 0.7814,
"step": 4438
},
{
"epoch": 1.81,
"grad_norm": 4.352952976304413,
"learning_rate": 6.824356818828876e-07,
"loss": 0.7829,
"step": 4439
},
{
"epoch": 1.81,
"grad_norm": 3.5146631344541346,
"learning_rate": 6.820293026644039e-07,
"loss": 0.873,
"step": 4440
},
{
"epoch": 1.81,
"grad_norm": 2.6593790946819693,
"learning_rate": 6.816229818548016e-07,
"loss": 0.7547,
"step": 4441
},
{
"epoch": 1.81,
"grad_norm": 4.305234636752902,
"learning_rate": 6.812167195287186e-07,
"loss": 0.7889,
"step": 4442
},
{
"epoch": 1.81,
"grad_norm": 3.2162794135055965,
"learning_rate": 6.808105157607831e-07,
"loss": 0.8402,
"step": 4443
},
{
"epoch": 1.81,
"grad_norm": 5.026978676449776,
"learning_rate": 6.804043706256105e-07,
"loss": 0.7854,
"step": 4444
},
{
"epoch": 1.81,
"grad_norm": 2.7668298167762977,
"learning_rate": 6.799982841978076e-07,
"loss": 0.8586,
"step": 4445
},
{
"epoch": 1.81,
"grad_norm": 3.3184858623578504,
"learning_rate": 6.795922565519693e-07,
"loss": 0.7413,
"step": 4446
},
{
"epoch": 1.81,
"grad_norm": 3.08442184973697,
"learning_rate": 6.791862877626799e-07,
"loss": 0.7571,
"step": 4447
},
{
"epoch": 1.82,
"grad_norm": 2.775997529401611,
"learning_rate": 6.787803779045127e-07,
"loss": 0.838,
"step": 4448
},
{
"epoch": 1.82,
"grad_norm": 2.690638300823183,
"learning_rate": 6.783745270520303e-07,
"loss": 0.8139,
"step": 4449
},
{
"epoch": 1.82,
"grad_norm": 2.7646985576992744,
"learning_rate": 6.779687352797849e-07,
"loss": 0.8453,
"step": 4450
},
{
"epoch": 1.82,
"grad_norm": 3.6655326529587047,
"learning_rate": 6.775630026623172e-07,
"loss": 0.8268,
"step": 4451
},
{
"epoch": 1.82,
"grad_norm": 3.4023601154522187,
"learning_rate": 6.771573292741577e-07,
"loss": 0.8803,
"step": 4452
},
{
"epoch": 1.82,
"grad_norm": 2.8570374491878514,
"learning_rate": 6.767517151898248e-07,
"loss": 0.8041,
"step": 4453
},
{
"epoch": 1.82,
"grad_norm": 3.2208714954945443,
"learning_rate": 6.763461604838277e-07,
"loss": 0.8348,
"step": 4454
},
{
"epoch": 1.82,
"grad_norm": 3.6293621451038183,
"learning_rate": 6.759406652306629e-07,
"loss": 0.8128,
"step": 4455
},
{
"epoch": 1.82,
"grad_norm": 3.6716655487575633,
"learning_rate": 6.755352295048182e-07,
"loss": 0.7973,
"step": 4456
},
{
"epoch": 1.82,
"grad_norm": 3.182660304874301,
"learning_rate": 6.751298533807677e-07,
"loss": 0.7949,
"step": 4457
},
{
"epoch": 1.82,
"grad_norm": 3.191190409599908,
"learning_rate": 6.747245369329771e-07,
"loss": 0.8215,
"step": 4458
},
{
"epoch": 1.82,
"grad_norm": 3.823868190642151,
"learning_rate": 6.743192802358994e-07,
"loss": 0.7782,
"step": 4459
},
{
"epoch": 1.82,
"grad_norm": 2.7359797377157613,
"learning_rate": 6.739140833639779e-07,
"loss": 0.7728,
"step": 4460
},
{
"epoch": 1.82,
"grad_norm": 3.127020507391882,
"learning_rate": 6.735089463916437e-07,
"loss": 0.8476,
"step": 4461
},
{
"epoch": 1.82,
"grad_norm": 2.970111741687616,
"learning_rate": 6.731038693933175e-07,
"loss": 0.7822,
"step": 4462
},
{
"epoch": 1.82,
"grad_norm": 2.44119102462462,
"learning_rate": 6.726988524434094e-07,
"loss": 0.8337,
"step": 4463
},
{
"epoch": 1.82,
"grad_norm": 2.6497776617375166,
"learning_rate": 6.722938956163181e-07,
"loss": 0.8782,
"step": 4464
},
{
"epoch": 1.82,
"grad_norm": 2.825459581218652,
"learning_rate": 6.718889989864307e-07,
"loss": 0.8129,
"step": 4465
},
{
"epoch": 1.82,
"grad_norm": 3.1628613809051673,
"learning_rate": 6.714841626281239e-07,
"loss": 0.8187,
"step": 4466
},
{
"epoch": 1.82,
"grad_norm": 2.2772848372154715,
"learning_rate": 6.710793866157632e-07,
"loss": 0.8781,
"step": 4467
},
{
"epoch": 1.82,
"grad_norm": 3.1608563185746896,
"learning_rate": 6.706746710237029e-07,
"loss": 0.844,
"step": 4468
},
{
"epoch": 1.82,
"grad_norm": 2.9388778018790305,
"learning_rate": 6.70270015926287e-07,
"loss": 0.7587,
"step": 4469
},
{
"epoch": 1.82,
"grad_norm": 2.8748659257986033,
"learning_rate": 6.698654213978463e-07,
"loss": 0.789,
"step": 4470
},
{
"epoch": 1.82,
"grad_norm": 2.667597703880651,
"learning_rate": 6.69460887512703e-07,
"loss": 0.8183,
"step": 4471
},
{
"epoch": 1.82,
"grad_norm": 3.4652567251219812,
"learning_rate": 6.690564143451665e-07,
"loss": 0.8171,
"step": 4472
},
{
"epoch": 1.83,
"grad_norm": 2.2938668826728246,
"learning_rate": 6.686520019695358e-07,
"loss": 0.8779,
"step": 4473
},
{
"epoch": 1.83,
"grad_norm": 2.758599968204325,
"learning_rate": 6.682476504600982e-07,
"loss": 0.8732,
"step": 4474
},
{
"epoch": 1.83,
"grad_norm": 3.1715790989040764,
"learning_rate": 6.678433598911299e-07,
"loss": 0.7829,
"step": 4475
},
{
"epoch": 1.83,
"grad_norm": 3.1056703418783234,
"learning_rate": 6.674391303368969e-07,
"loss": 0.8217,
"step": 4476
},
{
"epoch": 1.83,
"grad_norm": 2.6210870581176113,
"learning_rate": 6.670349618716529e-07,
"loss": 0.8684,
"step": 4477
},
{
"epoch": 1.83,
"grad_norm": 2.8315126596674545,
"learning_rate": 6.666308545696401e-07,
"loss": 0.7698,
"step": 4478
},
{
"epoch": 1.83,
"grad_norm": 2.983753586317627,
"learning_rate": 6.662268085050906e-07,
"loss": 0.869,
"step": 4479
},
{
"epoch": 1.83,
"grad_norm": 3.110108373816128,
"learning_rate": 6.658228237522246e-07,
"loss": 0.8371,
"step": 4480
},
{
"epoch": 1.83,
"grad_norm": 2.8933672946538698,
"learning_rate": 6.654189003852509e-07,
"loss": 0.8077,
"step": 4481
},
{
"epoch": 1.83,
"grad_norm": 4.367293945518055,
"learning_rate": 6.650150384783679e-07,
"loss": 0.7538,
"step": 4482
},
{
"epoch": 1.83,
"grad_norm": 3.292383943754982,
"learning_rate": 6.64611238105761e-07,
"loss": 0.775,
"step": 4483
},
{
"epoch": 1.83,
"grad_norm": 3.0858633028731464,
"learning_rate": 6.642074993416063e-07,
"loss": 0.8262,
"step": 4484
},
{
"epoch": 1.83,
"grad_norm": 3.344351611643381,
"learning_rate": 6.638038222600673e-07,
"loss": 0.7966,
"step": 4485
},
{
"epoch": 1.83,
"grad_norm": 3.191573373994451,
"learning_rate": 6.634002069352966e-07,
"loss": 0.7774,
"step": 4486
},
{
"epoch": 1.83,
"grad_norm": 2.624991036285244,
"learning_rate": 6.629966534414353e-07,
"loss": 0.7546,
"step": 4487
},
{
"epoch": 1.83,
"grad_norm": 2.981488059469163,
"learning_rate": 6.62593161852613e-07,
"loss": 0.8155,
"step": 4488
},
{
"epoch": 1.83,
"grad_norm": 2.452662604204239,
"learning_rate": 6.621897322429484e-07,
"loss": 0.8432,
"step": 4489
},
{
"epoch": 1.83,
"grad_norm": 3.099641351230698,
"learning_rate": 6.617863646865488e-07,
"loss": 0.7881,
"step": 4490
},
{
"epoch": 1.83,
"grad_norm": 2.73064786720382,
"learning_rate": 6.613830592575093e-07,
"loss": 0.822,
"step": 4491
},
{
"epoch": 1.83,
"grad_norm": 3.0219501074519597,
"learning_rate": 6.609798160299145e-07,
"loss": 0.8013,
"step": 4492
},
{
"epoch": 1.83,
"grad_norm": 2.67374732711347,
"learning_rate": 6.605766350778373e-07,
"loss": 0.8439,
"step": 4493
},
{
"epoch": 1.83,
"grad_norm": 3.326882234385994,
"learning_rate": 6.601735164753388e-07,
"loss": 0.7871,
"step": 4494
},
{
"epoch": 1.83,
"grad_norm": 2.700414823399321,
"learning_rate": 6.597704602964686e-07,
"loss": 0.8726,
"step": 4495
},
{
"epoch": 1.83,
"grad_norm": 2.6131700822899724,
"learning_rate": 6.59367466615266e-07,
"loss": 0.9,
"step": 4496
},
{
"epoch": 1.84,
"grad_norm": 3.3901676010330504,
"learning_rate": 6.589645355057576e-07,
"loss": 0.8378,
"step": 4497
},
{
"epoch": 1.84,
"grad_norm": 2.789258931877943,
"learning_rate": 6.585616670419587e-07,
"loss": 0.8203,
"step": 4498
},
{
"epoch": 1.84,
"grad_norm": 2.9121490870140128,
"learning_rate": 6.581588612978738e-07,
"loss": 0.8515,
"step": 4499
},
{
"epoch": 1.84,
"grad_norm": 2.884089997258484,
"learning_rate": 6.577561183474947e-07,
"loss": 0.8443,
"step": 4500
},
{
"epoch": 1.84,
"eval_loss": 0.8702970743179321,
"eval_runtime": 466.4475,
"eval_samples_per_second": 74.718,
"eval_steps_per_second": 4.671,
"step": 4500
},
{
"epoch": 1.84,
"grad_norm": 3.0027615094364393,
"learning_rate": 6.573534382648025e-07,
"loss": 0.805,
"step": 4501
},
{
"epoch": 1.84,
"grad_norm": 3.796612089922597,
"learning_rate": 6.569508211237668e-07,
"loss": 0.8208,
"step": 4502
},
{
"epoch": 1.84,
"grad_norm": 2.7899711986554236,
"learning_rate": 6.565482669983456e-07,
"loss": 0.8091,
"step": 4503
},
{
"epoch": 1.84,
"grad_norm": 5.078504575882827,
"learning_rate": 6.561457759624845e-07,
"loss": 0.8362,
"step": 4504
},
{
"epoch": 1.84,
"grad_norm": 3.860521995825905,
"learning_rate": 6.557433480901184e-07,
"loss": 0.7766,
"step": 4505
},
{
"epoch": 1.84,
"grad_norm": 2.582300353813726,
"learning_rate": 6.553409834551704e-07,
"loss": 0.7643,
"step": 4506
},
{
"epoch": 1.84,
"grad_norm": 3.1243889543912298,
"learning_rate": 6.549386821315526e-07,
"loss": 0.8101,
"step": 4507
},
{
"epoch": 1.84,
"grad_norm": 3.295899715969267,
"learning_rate": 6.545364441931635e-07,
"loss": 0.8207,
"step": 4508
},
{
"epoch": 1.84,
"grad_norm": 3.3204325341442162,
"learning_rate": 6.541342697138921e-07,
"loss": 0.8791,
"step": 4509
},
{
"epoch": 1.84,
"grad_norm": 3.563962839963807,
"learning_rate": 6.537321587676147e-07,
"loss": 0.7798,
"step": 4510
},
{
"epoch": 1.84,
"grad_norm": 2.5922320322893597,
"learning_rate": 6.53330111428196e-07,
"loss": 0.7925,
"step": 4511
},
{
"epoch": 1.84,
"grad_norm": 2.5962019827747262,
"learning_rate": 6.529281277694897e-07,
"loss": 0.8253,
"step": 4512
},
{
"epoch": 1.84,
"grad_norm": 3.0239401389268723,
"learning_rate": 6.525262078653364e-07,
"loss": 0.8447,
"step": 4513
},
{
"epoch": 1.84,
"grad_norm": 2.8370349842699274,
"learning_rate": 6.521243517895663e-07,
"loss": 0.8343,
"step": 4514
},
{
"epoch": 1.84,
"grad_norm": 2.673846665081707,
"learning_rate": 6.517225596159974e-07,
"loss": 0.8321,
"step": 4515
},
{
"epoch": 1.84,
"grad_norm": 7.528486223836865,
"learning_rate": 6.513208314184362e-07,
"loss": 0.8236,
"step": 4516
},
{
"epoch": 1.84,
"grad_norm": 2.7114467344112003,
"learning_rate": 6.509191672706766e-07,
"loss": 0.9174,
"step": 4517
},
{
"epoch": 1.84,
"grad_norm": 2.9159593162445123,
"learning_rate": 6.505175672465019e-07,
"loss": 0.7948,
"step": 4518
},
{
"epoch": 1.84,
"grad_norm": 2.402322172626653,
"learning_rate": 6.501160314196825e-07,
"loss": 0.8268,
"step": 4519
},
{
"epoch": 1.84,
"grad_norm": 3.304669145570292,
"learning_rate": 6.497145598639789e-07,
"loss": 0.7915,
"step": 4520
},
{
"epoch": 1.84,
"grad_norm": 2.3774731199332906,
"learning_rate": 6.493131526531366e-07,
"loss": 0.8006,
"step": 4521
},
{
"epoch": 1.85,
"grad_norm": 4.16939979723971,
"learning_rate": 6.489118098608923e-07,
"loss": 0.7802,
"step": 4522
},
{
"epoch": 1.85,
"grad_norm": 3.0357332145351696,
"learning_rate": 6.485105315609696e-07,
"loss": 0.8808,
"step": 4523
},
{
"epoch": 1.85,
"grad_norm": 2.992948698062865,
"learning_rate": 6.481093178270804e-07,
"loss": 0.8247,
"step": 4524
},
{
"epoch": 1.85,
"grad_norm": 3.1355244366214516,
"learning_rate": 6.477081687329246e-07,
"loss": 0.9095,
"step": 4525
},
{
"epoch": 1.85,
"grad_norm": 3.4448160317301135,
"learning_rate": 6.473070843521904e-07,
"loss": 0.8258,
"step": 4526
},
{
"epoch": 1.85,
"grad_norm": 3.4634544162622167,
"learning_rate": 6.469060647585536e-07,
"loss": 0.8123,
"step": 4527
},
{
"epoch": 1.85,
"grad_norm": 3.0251535179464297,
"learning_rate": 6.465051100256792e-07,
"loss": 0.8131,
"step": 4528
},
{
"epoch": 1.85,
"grad_norm": 3.025655073346237,
"learning_rate": 6.461042202272197e-07,
"loss": 0.829,
"step": 4529
},
{
"epoch": 1.85,
"grad_norm": 2.596595580006668,
"learning_rate": 6.457033954368149e-07,
"loss": 0.8032,
"step": 4530
},
{
"epoch": 1.85,
"grad_norm": 3.573297527847472,
"learning_rate": 6.45302635728094e-07,
"loss": 0.7439,
"step": 4531
},
{
"epoch": 1.85,
"grad_norm": 3.2082559797370296,
"learning_rate": 6.44901941174673e-07,
"loss": 0.7904,
"step": 4532
},
{
"epoch": 1.85,
"grad_norm": 3.169364252275195,
"learning_rate": 6.445013118501577e-07,
"loss": 0.7628,
"step": 4533
},
{
"epoch": 1.85,
"grad_norm": 3.5811313660508883,
"learning_rate": 6.441007478281393e-07,
"loss": 0.8317,
"step": 4534
},
{
"epoch": 1.85,
"grad_norm": 2.8033560721426394,
"learning_rate": 6.437002491821995e-07,
"loss": 0.8762,
"step": 4535
},
{
"epoch": 1.85,
"grad_norm": 2.9563348393867837,
"learning_rate": 6.432998159859064e-07,
"loss": 0.7884,
"step": 4536
},
{
"epoch": 1.85,
"grad_norm": 3.069822768915537,
"learning_rate": 6.428994483128173e-07,
"loss": 0.808,
"step": 4537
},
{
"epoch": 1.85,
"grad_norm": 2.8604192474336854,
"learning_rate": 6.424991462364762e-07,
"loss": 0.8334,
"step": 4538
},
{
"epoch": 1.85,
"grad_norm": 3.666669792337015,
"learning_rate": 6.420989098304156e-07,
"loss": 0.8049,
"step": 4539
},
{
"epoch": 1.85,
"grad_norm": 3.0485788839082835,
"learning_rate": 6.416987391681561e-07,
"loss": 0.732,
"step": 4540
},
{
"epoch": 1.85,
"grad_norm": 3.9965715764531295,
"learning_rate": 6.412986343232064e-07,
"loss": 0.8548,
"step": 4541
},
{
"epoch": 1.85,
"grad_norm": 3.6906793672501648,
"learning_rate": 6.408985953690629e-07,
"loss": 0.8194,
"step": 4542
},
{
"epoch": 1.85,
"grad_norm": 3.0581929565602644,
"learning_rate": 6.404986223792093e-07,
"loss": 0.9135,
"step": 4543
},
{
"epoch": 1.85,
"grad_norm": 3.3558988348982517,
"learning_rate": 6.40098715427118e-07,
"loss": 0.8142,
"step": 4544
},
{
"epoch": 1.85,
"grad_norm": 4.096878764395294,
"learning_rate": 6.396988745862486e-07,
"loss": 0.7871,
"step": 4545
},
{
"epoch": 1.86,
"grad_norm": 2.805381246632203,
"learning_rate": 6.392990999300501e-07,
"loss": 0.7485,
"step": 4546
},
{
"epoch": 1.86,
"grad_norm": 3.3443356435966742,
"learning_rate": 6.388993915319565e-07,
"loss": 0.8664,
"step": 4547
},
{
"epoch": 1.86,
"grad_norm": 2.685061707111674,
"learning_rate": 6.384997494653923e-07,
"loss": 0.803,
"step": 4548
},
{
"epoch": 1.86,
"grad_norm": 2.5681329124163463,
"learning_rate": 6.381001738037687e-07,
"loss": 0.7949,
"step": 4549
},
{
"epoch": 1.86,
"grad_norm": 2.738164469666351,
"learning_rate": 6.37700664620485e-07,
"loss": 0.842,
"step": 4550
},
{
"epoch": 1.86,
"grad_norm": 2.9617659848469073,
"learning_rate": 6.373012219889276e-07,
"loss": 0.8167,
"step": 4551
},
{
"epoch": 1.86,
"grad_norm": 2.6212701952192665,
"learning_rate": 6.369018459824713e-07,
"loss": 0.8637,
"step": 4552
},
{
"epoch": 1.86,
"grad_norm": 2.9302674139391742,
"learning_rate": 6.365025366744786e-07,
"loss": 0.7529,
"step": 4553
},
{
"epoch": 1.86,
"grad_norm": 2.9461178156085466,
"learning_rate": 6.361032941382999e-07,
"loss": 0.8865,
"step": 4554
},
{
"epoch": 1.86,
"grad_norm": 3.3607579779447425,
"learning_rate": 6.357041184472731e-07,
"loss": 0.7861,
"step": 4555
},
{
"epoch": 1.86,
"grad_norm": 3.1363607030455314,
"learning_rate": 6.353050096747236e-07,
"loss": 0.7556,
"step": 4556
},
{
"epoch": 1.86,
"grad_norm": 3.204309953576282,
"learning_rate": 6.349059678939649e-07,
"loss": 0.7902,
"step": 4557
},
{
"epoch": 1.86,
"grad_norm": 2.669807774876877,
"learning_rate": 6.345069931782978e-07,
"loss": 0.8619,
"step": 4558
},
{
"epoch": 1.86,
"grad_norm": 3.486897061143538,
"learning_rate": 6.341080856010117e-07,
"loss": 0.813,
"step": 4559
},
{
"epoch": 1.86,
"grad_norm": 4.826431084496083,
"learning_rate": 6.33709245235382e-07,
"loss": 0.7675,
"step": 4560
},
{
"epoch": 1.86,
"grad_norm": 3.606576295437842,
"learning_rate": 6.333104721546735e-07,
"loss": 0.7941,
"step": 4561
},
{
"epoch": 1.86,
"grad_norm": 4.137255837180635,
"learning_rate": 6.329117664321375e-07,
"loss": 0.7577,
"step": 4562
},
{
"epoch": 1.86,
"grad_norm": 3.0554676730621746,
"learning_rate": 6.325131281410139e-07,
"loss": 0.8155,
"step": 4563
},
{
"epoch": 1.86,
"grad_norm": 3.677467929590977,
"learning_rate": 6.321145573545287e-07,
"loss": 0.8437,
"step": 4564
},
{
"epoch": 1.86,
"grad_norm": 2.8180650828134324,
"learning_rate": 6.317160541458971e-07,
"loss": 0.7914,
"step": 4565
},
{
"epoch": 1.86,
"grad_norm": 3.0043329800568817,
"learning_rate": 6.313176185883209e-07,
"loss": 0.8542,
"step": 4566
},
{
"epoch": 1.86,
"grad_norm": 3.0141316767880575,
"learning_rate": 6.309192507549903e-07,
"loss": 0.8374,
"step": 4567
},
{
"epoch": 1.86,
"grad_norm": 3.534070110400838,
"learning_rate": 6.305209507190816e-07,
"loss": 0.7963,
"step": 4568
},
{
"epoch": 1.86,
"grad_norm": 2.3456324195427722,
"learning_rate": 6.301227185537605e-07,
"loss": 0.8288,
"step": 4569
},
{
"epoch": 1.86,
"grad_norm": 2.989171824988268,
"learning_rate": 6.297245543321789e-07,
"loss": 0.9234,
"step": 4570
},
{
"epoch": 1.87,
"grad_norm": 2.553390711583213,
"learning_rate": 6.293264581274764e-07,
"loss": 0.8431,
"step": 4571
},
{
"epoch": 1.87,
"grad_norm": 3.995041537277995,
"learning_rate": 6.289284300127813e-07,
"loss": 0.8902,
"step": 4572
},
{
"epoch": 1.87,
"grad_norm": 2.9812695744583477,
"learning_rate": 6.285304700612073e-07,
"loss": 0.7507,
"step": 4573
},
{
"epoch": 1.87,
"grad_norm": 2.7768703412094426,
"learning_rate": 6.281325783458573e-07,
"loss": 0.8078,
"step": 4574
},
{
"epoch": 1.87,
"grad_norm": 3.061342198664115,
"learning_rate": 6.277347549398211e-07,
"loss": 0.8404,
"step": 4575
},
{
"epoch": 1.87,
"grad_norm": 2.315940064068564,
"learning_rate": 6.27336999916176e-07,
"loss": 0.8279,
"step": 4576
},
{
"epoch": 1.87,
"grad_norm": 3.4616823565749657,
"learning_rate": 6.269393133479863e-07,
"loss": 0.7876,
"step": 4577
},
{
"epoch": 1.87,
"grad_norm": 3.043212580010287,
"learning_rate": 6.265416953083044e-07,
"loss": 0.8018,
"step": 4578
},
{
"epoch": 1.87,
"grad_norm": 2.7360557827925867,
"learning_rate": 6.261441458701695e-07,
"loss": 0.7303,
"step": 4579
},
{
"epoch": 1.87,
"grad_norm": 2.9631446952211142,
"learning_rate": 6.257466651066093e-07,
"loss": 0.8502,
"step": 4580
},
{
"epoch": 1.87,
"grad_norm": 2.6509729033179408,
"learning_rate": 6.25349253090637e-07,
"loss": 0.8042,
"step": 4581
},
{
"epoch": 1.87,
"grad_norm": 3.213121941038986,
"learning_rate": 6.24951909895255e-07,
"loss": 0.7634,
"step": 4582
},
{
"epoch": 1.87,
"grad_norm": 2.854420640357115,
"learning_rate": 6.245546355934521e-07,
"loss": 0.7937,
"step": 4583
},
{
"epoch": 1.87,
"grad_norm": 2.6560080201201868,
"learning_rate": 6.241574302582046e-07,
"loss": 0.7851,
"step": 4584
},
{
"epoch": 1.87,
"grad_norm": 2.975646862385941,
"learning_rate": 6.237602939624769e-07,
"loss": 0.8256,
"step": 4585
},
{
"epoch": 1.87,
"grad_norm": 3.643057019259417,
"learning_rate": 6.233632267792186e-07,
"loss": 0.8072,
"step": 4586
},
{
"epoch": 1.87,
"grad_norm": 2.692323916772985,
"learning_rate": 6.229662287813693e-07,
"loss": 0.8077,
"step": 4587
},
{
"epoch": 1.87,
"grad_norm": 2.906305317185813,
"learning_rate": 6.225693000418541e-07,
"loss": 0.8292,
"step": 4588
},
{
"epoch": 1.87,
"grad_norm": 2.6647562239298,
"learning_rate": 6.221724406335862e-07,
"loss": 0.8418,
"step": 4589
},
{
"epoch": 1.87,
"grad_norm": 3.4285782265198037,
"learning_rate": 6.217756506294652e-07,
"loss": 0.778,
"step": 4590
},
{
"epoch": 1.87,
"grad_norm": 3.438757027651794,
"learning_rate": 6.21378930102379e-07,
"loss": 0.7709,
"step": 4591
},
{
"epoch": 1.87,
"grad_norm": 3.2072615863239515,
"learning_rate": 6.209822791252017e-07,
"loss": 0.8397,
"step": 4592
},
{
"epoch": 1.87,
"grad_norm": 2.892672730835523,
"learning_rate": 6.205856977707963e-07,
"loss": 0.8658,
"step": 4593
},
{
"epoch": 1.87,
"grad_norm": 2.5805304512475007,
"learning_rate": 6.201891861120104e-07,
"loss": 0.8546,
"step": 4594
},
{
"epoch": 1.88,
"grad_norm": 2.531840529530571,
"learning_rate": 6.197927442216812e-07,
"loss": 0.8811,
"step": 4595
},
{
"epoch": 1.88,
"grad_norm": 2.7215335695523937,
"learning_rate": 6.193963721726322e-07,
"loss": 0.7968,
"step": 4596
},
{
"epoch": 1.88,
"grad_norm": 3.3781013130692283,
"learning_rate": 6.190000700376739e-07,
"loss": 0.8308,
"step": 4597
},
{
"epoch": 1.88,
"grad_norm": 2.9000362004865905,
"learning_rate": 6.186038378896037e-07,
"loss": 0.9048,
"step": 4598
},
{
"epoch": 1.88,
"grad_norm": 2.5908564072705706,
"learning_rate": 6.182076758012067e-07,
"loss": 0.8884,
"step": 4599
},
{
"epoch": 1.88,
"grad_norm": 2.7773593724186894,
"learning_rate": 6.178115838452554e-07,
"loss": 0.8446,
"step": 4600
},
{
"epoch": 1.88,
"eval_loss": 0.8694307208061218,
"eval_runtime": 466.9254,
"eval_samples_per_second": 74.641,
"eval_steps_per_second": 4.667,
"step": 4600
},
{
"epoch": 1.88,
"grad_norm": 2.8901275859972158,
"learning_rate": 6.174155620945084e-07,
"loss": 0.7602,
"step": 4601
},
{
"epoch": 1.88,
"grad_norm": 3.568834382231878,
"learning_rate": 6.170196106217127e-07,
"loss": 0.8761,
"step": 4602
},
{
"epoch": 1.88,
"grad_norm": 2.9229594735807245,
"learning_rate": 6.166237294996009e-07,
"loss": 0.7789,
"step": 4603
},
{
"epoch": 1.88,
"grad_norm": 2.6223413128778543,
"learning_rate": 6.162279188008937e-07,
"loss": 0.8575,
"step": 4604
},
{
"epoch": 1.88,
"grad_norm": 2.7927273752580493,
"learning_rate": 6.158321785982989e-07,
"loss": 0.8517,
"step": 4605
},
{
"epoch": 1.88,
"grad_norm": 5.546238961875335,
"learning_rate": 6.154365089645111e-07,
"loss": 0.7486,
"step": 4606
},
{
"epoch": 1.88,
"grad_norm": 3.229814768383325,
"learning_rate": 6.150409099722114e-07,
"loss": 0.8572,
"step": 4607
},
{
"epoch": 1.88,
"grad_norm": 3.3810980642371553,
"learning_rate": 6.146453816940688e-07,
"loss": 0.7962,
"step": 4608
},
{
"epoch": 1.88,
"grad_norm": 2.8298209429668986,
"learning_rate": 6.142499242027388e-07,
"loss": 0.7787,
"step": 4609
},
{
"epoch": 1.88,
"grad_norm": 2.678690481309827,
"learning_rate": 6.138545375708643e-07,
"loss": 0.7723,
"step": 4610
},
{
"epoch": 1.88,
"grad_norm": 3.5477573196283023,
"learning_rate": 6.134592218710746e-07,
"loss": 0.8348,
"step": 4611
},
{
"epoch": 1.88,
"grad_norm": 3.384048114786985,
"learning_rate": 6.130639771759862e-07,
"loss": 0.8527,
"step": 4612
},
{
"epoch": 1.88,
"grad_norm": 2.636836209188327,
"learning_rate": 6.126688035582031e-07,
"loss": 0.7378,
"step": 4613
},
{
"epoch": 1.88,
"grad_norm": 2.8558920052708787,
"learning_rate": 6.122737010903154e-07,
"loss": 0.9006,
"step": 4614
},
{
"epoch": 1.88,
"grad_norm": 2.8223800347262182,
"learning_rate": 6.118786698449008e-07,
"loss": 0.7917,
"step": 4615
},
{
"epoch": 1.88,
"grad_norm": 6.064569324193783,
"learning_rate": 6.114837098945233e-07,
"loss": 0.7363,
"step": 4616
},
{
"epoch": 1.88,
"grad_norm": 3.1610867010714587,
"learning_rate": 6.110888213117344e-07,
"loss": 0.757,
"step": 4617
},
{
"epoch": 1.88,
"grad_norm": 4.556191927890162,
"learning_rate": 6.106940041690722e-07,
"loss": 0.8611,
"step": 4618
},
{
"epoch": 1.88,
"grad_norm": 2.762156838645142,
"learning_rate": 6.102992585390618e-07,
"loss": 0.8062,
"step": 4619
},
{
"epoch": 1.89,
"grad_norm": 2.7748937756832612,
"learning_rate": 6.09904584494215e-07,
"loss": 0.8357,
"step": 4620
},
{
"epoch": 1.89,
"grad_norm": 3.233240875157716,
"learning_rate": 6.095099821070303e-07,
"loss": 0.8399,
"step": 4621
},
{
"epoch": 1.89,
"grad_norm": 3.5788173348528516,
"learning_rate": 6.091154514499934e-07,
"loss": 0.8201,
"step": 4622
},
{
"epoch": 1.89,
"grad_norm": 2.7979854498098393,
"learning_rate": 6.08720992595577e-07,
"loss": 0.8492,
"step": 4623
},
{
"epoch": 1.89,
"grad_norm": 2.6630149035393798,
"learning_rate": 6.0832660561624e-07,
"loss": 0.8572,
"step": 4624
},
{
"epoch": 1.89,
"grad_norm": 2.6878581547355025,
"learning_rate": 6.079322905844279e-07,
"loss": 0.7478,
"step": 4625
},
{
"epoch": 1.89,
"grad_norm": 2.5350637306572703,
"learning_rate": 6.075380475725743e-07,
"loss": 0.8308,
"step": 4626
},
{
"epoch": 1.89,
"grad_norm": 2.898890553762731,
"learning_rate": 6.071438766530986e-07,
"loss": 0.8648,
"step": 4627
},
{
"epoch": 1.89,
"grad_norm": 3.3009603608395675,
"learning_rate": 6.067497778984068e-07,
"loss": 0.7573,
"step": 4628
},
{
"epoch": 1.89,
"grad_norm": 3.3469099775548568,
"learning_rate": 6.063557513808921e-07,
"loss": 0.8121,
"step": 4629
},
{
"epoch": 1.89,
"grad_norm": 3.0432226957885766,
"learning_rate": 6.05961797172934e-07,
"loss": 0.7574,
"step": 4630
},
{
"epoch": 1.89,
"grad_norm": 2.8227539762778537,
"learning_rate": 6.055679153468993e-07,
"loss": 0.8144,
"step": 4631
},
{
"epoch": 1.89,
"grad_norm": 3.2043966751174615,
"learning_rate": 6.051741059751416e-07,
"loss": 0.7482,
"step": 4632
},
{
"epoch": 1.89,
"grad_norm": 2.2184539909865664,
"learning_rate": 6.047803691299999e-07,
"loss": 0.7706,
"step": 4633
},
{
"epoch": 1.89,
"grad_norm": 3.414607380732174,
"learning_rate": 6.043867048838014e-07,
"loss": 0.7829,
"step": 4634
},
{
"epoch": 1.89,
"grad_norm": 3.2276770234257275,
"learning_rate": 6.03993113308859e-07,
"loss": 0.8992,
"step": 4635
},
{
"epoch": 1.89,
"grad_norm": 2.803885903752123,
"learning_rate": 6.03599594477473e-07,
"loss": 0.8129,
"step": 4636
},
{
"epoch": 1.89,
"grad_norm": 2.9696355655826325,
"learning_rate": 6.032061484619293e-07,
"loss": 0.8904,
"step": 4637
},
{
"epoch": 1.89,
"grad_norm": 4.40554672909878,
"learning_rate": 6.028127753345013e-07,
"loss": 0.818,
"step": 4638
},
{
"epoch": 1.89,
"grad_norm": 3.043455769493823,
"learning_rate": 6.02419475167449e-07,
"loss": 0.8807,
"step": 4639
},
{
"epoch": 1.89,
"grad_norm": 2.5092463137369245,
"learning_rate": 6.020262480330187e-07,
"loss": 0.827,
"step": 4640
},
{
"epoch": 1.89,
"grad_norm": 3.58197054202922,
"learning_rate": 6.01633094003443e-07,
"loss": 0.7848,
"step": 4641
},
{
"epoch": 1.89,
"grad_norm": 4.050427556501583,
"learning_rate": 6.012400131509419e-07,
"loss": 0.8322,
"step": 4642
},
{
"epoch": 1.89,
"grad_norm": 2.4074370139052133,
"learning_rate": 6.008470055477206e-07,
"loss": 0.8174,
"step": 4643
},
{
"epoch": 1.9,
"grad_norm": 3.002121262499782,
"learning_rate": 6.004540712659727e-07,
"loss": 0.8046,
"step": 4644
},
{
"epoch": 1.9,
"grad_norm": 4.477288774041213,
"learning_rate": 6.000612103778772e-07,
"loss": 0.7559,
"step": 4645
},
{
"epoch": 1.9,
"grad_norm": 3.010227095675896,
"learning_rate": 5.996684229555991e-07,
"loss": 0.8351,
"step": 4646
},
{
"epoch": 1.9,
"grad_norm": 2.4103257247221683,
"learning_rate": 5.992757090712911e-07,
"loss": 0.8097,
"step": 4647
},
{
"epoch": 1.9,
"grad_norm": 2.6640301654717056,
"learning_rate": 5.988830687970918e-07,
"loss": 0.7623,
"step": 4648
},
{
"epoch": 1.9,
"grad_norm": 2.643520344554264,
"learning_rate": 5.984905022051262e-07,
"loss": 0.8033,
"step": 4649
},
{
"epoch": 1.9,
"grad_norm": 2.776627746172839,
"learning_rate": 5.980980093675059e-07,
"loss": 0.8714,
"step": 4650
},
{
"epoch": 1.9,
"grad_norm": 3.072883793634797,
"learning_rate": 5.977055903563288e-07,
"loss": 0.8047,
"step": 4651
},
{
"epoch": 1.9,
"grad_norm": 2.613828104456171,
"learning_rate": 5.973132452436798e-07,
"loss": 0.8544,
"step": 4652
},
{
"epoch": 1.9,
"grad_norm": 2.8044231044697034,
"learning_rate": 5.969209741016297e-07,
"loss": 0.765,
"step": 4653
},
{
"epoch": 1.9,
"grad_norm": 2.7607462347843947,
"learning_rate": 5.965287770022356e-07,
"loss": 0.8848,
"step": 4654
},
{
"epoch": 1.9,
"grad_norm": 3.238517323745459,
"learning_rate": 5.961366540175414e-07,
"loss": 0.8417,
"step": 4655
},
{
"epoch": 1.9,
"grad_norm": 3.0713548336053664,
"learning_rate": 5.95744605219577e-07,
"loss": 0.8087,
"step": 4656
},
{
"epoch": 1.9,
"grad_norm": 2.894281905042098,
"learning_rate": 5.953526306803592e-07,
"loss": 0.8492,
"step": 4657
},
{
"epoch": 1.9,
"grad_norm": 2.9230137825601115,
"learning_rate": 5.949607304718909e-07,
"loss": 0.8403,
"step": 4658
},
{
"epoch": 1.9,
"grad_norm": 2.8741025814727474,
"learning_rate": 5.945689046661609e-07,
"loss": 0.8246,
"step": 4659
},
{
"epoch": 1.9,
"grad_norm": 2.816391212212391,
"learning_rate": 5.941771533351449e-07,
"loss": 0.9105,
"step": 4660
},
{
"epoch": 1.9,
"grad_norm": 2.6081078536943685,
"learning_rate": 5.937854765508046e-07,
"loss": 0.8438,
"step": 4661
},
{
"epoch": 1.9,
"grad_norm": 3.1943401568422574,
"learning_rate": 5.933938743850888e-07,
"loss": 0.7391,
"step": 4662
},
{
"epoch": 1.9,
"grad_norm": 2.918447338095638,
"learning_rate": 5.93002346909931e-07,
"loss": 0.7714,
"step": 4663
},
{
"epoch": 1.9,
"grad_norm": 2.8281634455429385,
"learning_rate": 5.926108941972523e-07,
"loss": 0.8438,
"step": 4664
},
{
"epoch": 1.9,
"grad_norm": 3.0556145298668485,
"learning_rate": 5.922195163189599e-07,
"loss": 0.8092,
"step": 4665
},
{
"epoch": 1.9,
"grad_norm": 3.2557729800104944,
"learning_rate": 5.91828213346947e-07,
"loss": 0.7779,
"step": 4666
},
{
"epoch": 1.9,
"grad_norm": 3.1272970034019734,
"learning_rate": 5.91436985353093e-07,
"loss": 0.7938,
"step": 4667
},
{
"epoch": 1.9,
"grad_norm": 4.019929266081077,
"learning_rate": 5.910458324092633e-07,
"loss": 0.822,
"step": 4668
},
{
"epoch": 1.91,
"grad_norm": 3.1011063741551688,
"learning_rate": 5.906547545873102e-07,
"loss": 0.815,
"step": 4669
},
{
"epoch": 1.91,
"grad_norm": 4.292193908912591,
"learning_rate": 5.902637519590721e-07,
"loss": 0.771,
"step": 4670
},
{
"epoch": 1.91,
"grad_norm": 2.5777955640861885,
"learning_rate": 5.898728245963724e-07,
"loss": 0.8227,
"step": 4671
},
{
"epoch": 1.91,
"grad_norm": 2.594864028415473,
"learning_rate": 5.894819725710224e-07,
"loss": 0.8147,
"step": 4672
},
{
"epoch": 1.91,
"grad_norm": 2.5843902843334936,
"learning_rate": 5.890911959548183e-07,
"loss": 0.7712,
"step": 4673
},
{
"epoch": 1.91,
"grad_norm": 2.7788290189584473,
"learning_rate": 5.887004948195433e-07,
"loss": 0.8361,
"step": 4674
},
{
"epoch": 1.91,
"grad_norm": 2.600373322049377,
"learning_rate": 5.883098692369661e-07,
"loss": 0.8038,
"step": 4675
},
{
"epoch": 1.91,
"grad_norm": 3.162844872913323,
"learning_rate": 5.879193192788417e-07,
"loss": 0.7611,
"step": 4676
},
{
"epoch": 1.91,
"grad_norm": 2.8551257872376263,
"learning_rate": 5.875288450169109e-07,
"loss": 0.8164,
"step": 4677
},
{
"epoch": 1.91,
"grad_norm": 2.912984886448314,
"learning_rate": 5.871384465229017e-07,
"loss": 0.8218,
"step": 4678
},
{
"epoch": 1.91,
"grad_norm": 2.5522717011810774,
"learning_rate": 5.867481238685273e-07,
"loss": 0.8081,
"step": 4679
},
{
"epoch": 1.91,
"grad_norm": 3.3954622572009354,
"learning_rate": 5.863578771254865e-07,
"loss": 0.8315,
"step": 4680
},
{
"epoch": 1.91,
"grad_norm": 3.0118195804943775,
"learning_rate": 5.859677063654653e-07,
"loss": 0.8247,
"step": 4681
},
{
"epoch": 1.91,
"grad_norm": 4.021005665643418,
"learning_rate": 5.85577611660135e-07,
"loss": 0.8249,
"step": 4682
},
{
"epoch": 1.91,
"grad_norm": 2.783478486001385,
"learning_rate": 5.851875930811536e-07,
"loss": 0.8037,
"step": 4683
},
{
"epoch": 1.91,
"grad_norm": 3.501807398648961,
"learning_rate": 5.847976507001636e-07,
"loss": 0.759,
"step": 4684
},
{
"epoch": 1.91,
"grad_norm": 3.299892302662092,
"learning_rate": 5.844077845887955e-07,
"loss": 0.87,
"step": 4685
},
{
"epoch": 1.91,
"grad_norm": 3.0055367051086646,
"learning_rate": 5.840179948186645e-07,
"loss": 0.8505,
"step": 4686
},
{
"epoch": 1.91,
"grad_norm": 2.6126068538582623,
"learning_rate": 5.836282814613719e-07,
"loss": 0.8325,
"step": 4687
},
{
"epoch": 1.91,
"grad_norm": 2.861682510765365,
"learning_rate": 5.83238644588506e-07,
"loss": 0.8026,
"step": 4688
},
{
"epoch": 1.91,
"grad_norm": 2.9796422565789653,
"learning_rate": 5.828490842716394e-07,
"loss": 0.8223,
"step": 4689
},
{
"epoch": 1.91,
"grad_norm": 2.8751211542828212,
"learning_rate": 5.824596005823317e-07,
"loss": 0.8171,
"step": 4690
},
{
"epoch": 1.91,
"grad_norm": 3.0989368272372606,
"learning_rate": 5.820701935921277e-07,
"loss": 0.7952,
"step": 4691
},
{
"epoch": 1.91,
"grad_norm": 3.138920549767831,
"learning_rate": 5.816808633725599e-07,
"loss": 0.8487,
"step": 4692
},
{
"epoch": 1.92,
"grad_norm": 2.9699833067818844,
"learning_rate": 5.812916099951444e-07,
"loss": 0.824,
"step": 4693
},
{
"epoch": 1.92,
"grad_norm": 4.07251092261976,
"learning_rate": 5.809024335313844e-07,
"loss": 0.8298,
"step": 4694
},
{
"epoch": 1.92,
"grad_norm": 3.194608564756602,
"learning_rate": 5.805133340527685e-07,
"loss": 0.7554,
"step": 4695
},
{
"epoch": 1.92,
"grad_norm": 3.148090666630515,
"learning_rate": 5.801243116307719e-07,
"loss": 0.8078,
"step": 4696
},
{
"epoch": 1.92,
"grad_norm": 4.206155966010429,
"learning_rate": 5.797353663368549e-07,
"loss": 0.7524,
"step": 4697
},
{
"epoch": 1.92,
"grad_norm": 2.972459345212913,
"learning_rate": 5.793464982424639e-07,
"loss": 0.8199,
"step": 4698
},
{
"epoch": 1.92,
"grad_norm": 3.3989018788539767,
"learning_rate": 5.789577074190313e-07,
"loss": 0.8457,
"step": 4699
},
{
"epoch": 1.92,
"grad_norm": 3.1163339529065825,
"learning_rate": 5.785689939379752e-07,
"loss": 0.8273,
"step": 4700
},
{
"epoch": 1.92,
"eval_loss": 0.8679046630859375,
"eval_runtime": 465.4198,
"eval_samples_per_second": 74.883,
"eval_steps_per_second": 4.682,
"step": 4700
},
{
"epoch": 1.92,
"grad_norm": 3.9173705639723537,
"learning_rate": 5.781803578706985e-07,
"loss": 0.8647,
"step": 4701
},
{
"epoch": 1.92,
"grad_norm": 3.0856737938707064,
"learning_rate": 5.777917992885918e-07,
"loss": 0.8127,
"step": 4702
},
{
"epoch": 1.92,
"grad_norm": 2.903539025226562,
"learning_rate": 5.774033182630301e-07,
"loss": 0.8053,
"step": 4703
},
{
"epoch": 1.92,
"grad_norm": 3.0916215387344224,
"learning_rate": 5.770149148653746e-07,
"loss": 0.7692,
"step": 4704
},
{
"epoch": 1.92,
"grad_norm": 2.4008578056690886,
"learning_rate": 5.766265891669724e-07,
"loss": 0.742,
"step": 4705
},
{
"epoch": 1.92,
"grad_norm": 3.9159166742850826,
"learning_rate": 5.762383412391551e-07,
"loss": 0.8389,
"step": 4706
},
{
"epoch": 1.92,
"grad_norm": 2.616315782462237,
"learning_rate": 5.75850171153242e-07,
"loss": 0.8894,
"step": 4707
},
{
"epoch": 1.92,
"grad_norm": 3.220012455417011,
"learning_rate": 5.754620789805368e-07,
"loss": 0.7976,
"step": 4708
},
{
"epoch": 1.92,
"grad_norm": 3.260307049663994,
"learning_rate": 5.750740647923295e-07,
"loss": 0.7432,
"step": 4709
},
{
"epoch": 1.92,
"grad_norm": 2.636207147332711,
"learning_rate": 5.746861286598946e-07,
"loss": 0.83,
"step": 4710
},
{
"epoch": 1.92,
"grad_norm": 2.7053544818216735,
"learning_rate": 5.742982706544937e-07,
"loss": 0.7814,
"step": 4711
},
{
"epoch": 1.92,
"grad_norm": 2.904889835257073,
"learning_rate": 5.73910490847373e-07,
"loss": 0.8832,
"step": 4712
},
{
"epoch": 1.92,
"grad_norm": 3.0037945239492982,
"learning_rate": 5.735227893097658e-07,
"loss": 0.8209,
"step": 4713
},
{
"epoch": 1.92,
"grad_norm": 2.8684334847910264,
"learning_rate": 5.731351661128892e-07,
"loss": 0.8455,
"step": 4714
},
{
"epoch": 1.92,
"grad_norm": 2.7253650588505036,
"learning_rate": 5.72747621327947e-07,
"loss": 0.8761,
"step": 4715
},
{
"epoch": 1.92,
"grad_norm": 3.305530772423873,
"learning_rate": 5.723601550261281e-07,
"loss": 0.761,
"step": 4716
},
{
"epoch": 1.92,
"grad_norm": 2.686866102064351,
"learning_rate": 5.71972767278607e-07,
"loss": 0.7898,
"step": 4717
},
{
"epoch": 1.93,
"grad_norm": 3.2011091779910705,
"learning_rate": 5.715854581565452e-07,
"loss": 0.7736,
"step": 4718
},
{
"epoch": 1.93,
"grad_norm": 2.6679847803430032,
"learning_rate": 5.711982277310872e-07,
"loss": 0.813,
"step": 4719
},
{
"epoch": 1.93,
"grad_norm": 2.4653047948965328,
"learning_rate": 5.70811076073365e-07,
"loss": 0.8792,
"step": 4720
},
{
"epoch": 1.93,
"grad_norm": 2.9712446657619322,
"learning_rate": 5.704240032544954e-07,
"loss": 0.8119,
"step": 4721
},
{
"epoch": 1.93,
"grad_norm": 2.559215256017626,
"learning_rate": 5.700370093455808e-07,
"loss": 0.7838,
"step": 4722
},
{
"epoch": 1.93,
"grad_norm": 3.578809250403192,
"learning_rate": 5.696500944177092e-07,
"loss": 0.7466,
"step": 4723
},
{
"epoch": 1.93,
"grad_norm": 2.5608707895929204,
"learning_rate": 5.692632585419539e-07,
"loss": 0.7865,
"step": 4724
},
{
"epoch": 1.93,
"grad_norm": 3.079299403792182,
"learning_rate": 5.688765017893741e-07,
"loss": 0.8279,
"step": 4725
},
{
"epoch": 1.93,
"grad_norm": 3.3415213947362052,
"learning_rate": 5.684898242310143e-07,
"loss": 0.8288,
"step": 4726
},
{
"epoch": 1.93,
"grad_norm": 4.330220007993759,
"learning_rate": 5.681032259379031e-07,
"loss": 0.7267,
"step": 4727
},
{
"epoch": 1.93,
"grad_norm": 3.2327967519102367,
"learning_rate": 5.677167069810573e-07,
"loss": 0.7147,
"step": 4728
},
{
"epoch": 1.93,
"grad_norm": 2.76820614936463,
"learning_rate": 5.673302674314768e-07,
"loss": 0.776,
"step": 4729
},
{
"epoch": 1.93,
"grad_norm": 3.1068085188386507,
"learning_rate": 5.669439073601483e-07,
"loss": 0.7839,
"step": 4730
},
{
"epoch": 1.93,
"grad_norm": 3.239935047218851,
"learning_rate": 5.665576268380426e-07,
"loss": 0.857,
"step": 4731
},
{
"epoch": 1.93,
"grad_norm": 2.6996633444703937,
"learning_rate": 5.661714259361164e-07,
"loss": 0.7871,
"step": 4732
},
{
"epoch": 1.93,
"grad_norm": 2.643556941979755,
"learning_rate": 5.657853047253129e-07,
"loss": 0.7658,
"step": 4733
},
{
"epoch": 1.93,
"grad_norm": 2.644814921628657,
"learning_rate": 5.653992632765592e-07,
"loss": 0.8418,
"step": 4734
},
{
"epoch": 1.93,
"grad_norm": 2.879716006892931,
"learning_rate": 5.650133016607689e-07,
"loss": 0.7549,
"step": 4735
},
{
"epoch": 1.93,
"grad_norm": 3.088096097514159,
"learning_rate": 5.646274199488392e-07,
"loss": 0.844,
"step": 4736
},
{
"epoch": 1.93,
"grad_norm": 2.725081411586929,
"learning_rate": 5.642416182116545e-07,
"loss": 0.8273,
"step": 4737
},
{
"epoch": 1.93,
"grad_norm": 2.5536118592020496,
"learning_rate": 5.63855896520083e-07,
"loss": 0.8589,
"step": 4738
},
{
"epoch": 1.93,
"grad_norm": 2.5030150988753346,
"learning_rate": 5.634702549449803e-07,
"loss": 0.8552,
"step": 4739
},
{
"epoch": 1.93,
"grad_norm": 2.7363785033311108,
"learning_rate": 5.63084693557185e-07,
"loss": 0.7948,
"step": 4740
},
{
"epoch": 1.93,
"grad_norm": 2.8500434971918707,
"learning_rate": 5.626992124275216e-07,
"loss": 0.8533,
"step": 4741
},
{
"epoch": 1.94,
"grad_norm": 3.2002815833136515,
"learning_rate": 5.623138116268007e-07,
"loss": 0.7738,
"step": 4742
},
{
"epoch": 1.94,
"grad_norm": 3.886720385551389,
"learning_rate": 5.619284912258175e-07,
"loss": 0.7781,
"step": 4743
},
{
"epoch": 1.94,
"grad_norm": 3.3160095016999436,
"learning_rate": 5.615432512953526e-07,
"loss": 0.7481,
"step": 4744
},
{
"epoch": 1.94,
"grad_norm": 3.310356735556062,
"learning_rate": 5.611580919061716e-07,
"loss": 0.8137,
"step": 4745
},
{
"epoch": 1.94,
"grad_norm": 2.9446266487933213,
"learning_rate": 5.607730131290254e-07,
"loss": 0.8134,
"step": 4746
},
{
"epoch": 1.94,
"grad_norm": 2.4899986625581985,
"learning_rate": 5.603880150346503e-07,
"loss": 0.7952,
"step": 4747
},
{
"epoch": 1.94,
"grad_norm": 3.000717271891924,
"learning_rate": 5.600030976937674e-07,
"loss": 0.7907,
"step": 4748
},
{
"epoch": 1.94,
"grad_norm": 3.2975444259751634,
"learning_rate": 5.596182611770834e-07,
"loss": 0.848,
"step": 4749
},
{
"epoch": 1.94,
"grad_norm": 2.5912828012226496,
"learning_rate": 5.592335055552899e-07,
"loss": 0.8071,
"step": 4750
},
{
"epoch": 1.94,
"grad_norm": 3.18031275399337,
"learning_rate": 5.588488308990638e-07,
"loss": 0.7953,
"step": 4751
},
{
"epoch": 1.94,
"grad_norm": 2.81165590427726,
"learning_rate": 5.584642372790671e-07,
"loss": 0.8378,
"step": 4752
},
{
"epoch": 1.94,
"grad_norm": 2.768996868596082,
"learning_rate": 5.580797247659459e-07,
"loss": 0.8191,
"step": 4753
},
{
"epoch": 1.94,
"grad_norm": 3.1105234017129693,
"learning_rate": 5.576952934303334e-07,
"loss": 0.8072,
"step": 4754
},
{
"epoch": 1.94,
"grad_norm": 3.3334159206969036,
"learning_rate": 5.573109433428464e-07,
"loss": 0.8449,
"step": 4755
},
{
"epoch": 1.94,
"grad_norm": 2.646139800144381,
"learning_rate": 5.569266745740876e-07,
"loss": 0.861,
"step": 4756
},
{
"epoch": 1.94,
"grad_norm": 2.5607907026180046,
"learning_rate": 5.565424871946436e-07,
"loss": 0.7662,
"step": 4757
},
{
"epoch": 1.94,
"grad_norm": 3.9727208294961236,
"learning_rate": 5.561583812750867e-07,
"loss": 0.854,
"step": 4758
},
{
"epoch": 1.94,
"grad_norm": 3.6941685465186325,
"learning_rate": 5.557743568859752e-07,
"loss": 0.7755,
"step": 4759
},
{
"epoch": 1.94,
"grad_norm": 4.36891109487727,
"learning_rate": 5.553904140978512e-07,
"loss": 0.7049,
"step": 4760
},
{
"epoch": 1.94,
"grad_norm": 4.499437998819088,
"learning_rate": 5.550065529812427e-07,
"loss": 0.7433,
"step": 4761
},
{
"epoch": 1.94,
"grad_norm": 2.5491884993052256,
"learning_rate": 5.546227736066609e-07,
"loss": 0.8143,
"step": 4762
},
{
"epoch": 1.94,
"grad_norm": 2.771222449726093,
"learning_rate": 5.542390760446041e-07,
"loss": 0.8587,
"step": 4763
},
{
"epoch": 1.94,
"grad_norm": 2.8438712110293447,
"learning_rate": 5.538554603655542e-07,
"loss": 0.7511,
"step": 4764
},
{
"epoch": 1.94,
"grad_norm": 3.279400351746387,
"learning_rate": 5.534719266399798e-07,
"loss": 0.7869,
"step": 4765
},
{
"epoch": 1.94,
"grad_norm": 4.644281405937971,
"learning_rate": 5.530884749383317e-07,
"loss": 0.7796,
"step": 4766
},
{
"epoch": 1.95,
"grad_norm": 3.241899614975205,
"learning_rate": 5.527051053310482e-07,
"loss": 0.7473,
"step": 4767
},
{
"epoch": 1.95,
"grad_norm": 2.934859372287149,
"learning_rate": 5.523218178885509e-07,
"loss": 0.8318,
"step": 4768
},
{
"epoch": 1.95,
"grad_norm": 3.165051915115065,
"learning_rate": 5.519386126812473e-07,
"loss": 0.7853,
"step": 4769
},
{
"epoch": 1.95,
"grad_norm": 2.8440478087400782,
"learning_rate": 5.515554897795291e-07,
"loss": 0.8072,
"step": 4770
},
{
"epoch": 1.95,
"grad_norm": 2.4184299881982194,
"learning_rate": 5.511724492537733e-07,
"loss": 0.8586,
"step": 4771
},
{
"epoch": 1.95,
"grad_norm": 3.1346610537344026,
"learning_rate": 5.507894911743415e-07,
"loss": 0.8368,
"step": 4772
},
{
"epoch": 1.95,
"grad_norm": 2.870379076572671,
"learning_rate": 5.504066156115805e-07,
"loss": 0.8038,
"step": 4773
},
{
"epoch": 1.95,
"grad_norm": 2.670414879609988,
"learning_rate": 5.500238226358214e-07,
"loss": 0.9296,
"step": 4774
},
{
"epoch": 1.95,
"grad_norm": 3.814079910391784,
"learning_rate": 5.496411123173808e-07,
"loss": 0.78,
"step": 4775
},
{
"epoch": 1.95,
"grad_norm": 2.920175761005988,
"learning_rate": 5.492584847265596e-07,
"loss": 0.8346,
"step": 4776
},
{
"epoch": 1.95,
"grad_norm": 3.3206670141153323,
"learning_rate": 5.488759399336438e-07,
"loss": 0.7682,
"step": 4777
},
{
"epoch": 1.95,
"grad_norm": 2.9062063192459777,
"learning_rate": 5.484934780089039e-07,
"loss": 0.7823,
"step": 4778
},
{
"epoch": 1.95,
"grad_norm": 3.375631761744824,
"learning_rate": 5.481110990225955e-07,
"loss": 0.8067,
"step": 4779
},
{
"epoch": 1.95,
"grad_norm": 4.1579581029427635,
"learning_rate": 5.477288030449587e-07,
"loss": 0.8377,
"step": 4780
},
{
"epoch": 1.95,
"grad_norm": 2.775618830582615,
"learning_rate": 5.473465901462184e-07,
"loss": 0.8153,
"step": 4781
},
{
"epoch": 1.95,
"grad_norm": 2.62624021805156,
"learning_rate": 5.469644603965848e-07,
"loss": 0.8209,
"step": 4782
},
{
"epoch": 1.95,
"grad_norm": 3.13261722217919,
"learning_rate": 5.465824138662516e-07,
"loss": 0.7789,
"step": 4783
},
{
"epoch": 1.95,
"grad_norm": 3.014162301528159,
"learning_rate": 5.462004506253979e-07,
"loss": 0.8277,
"step": 4784
},
{
"epoch": 1.95,
"grad_norm": 2.776700629849616,
"learning_rate": 5.458185707441884e-07,
"loss": 0.8324,
"step": 4785
},
{
"epoch": 1.95,
"grad_norm": 2.8304256901468197,
"learning_rate": 5.454367742927714e-07,
"loss": 0.8989,
"step": 4786
},
{
"epoch": 1.95,
"grad_norm": 3.6718938444303504,
"learning_rate": 5.450550613412794e-07,
"loss": 0.8476,
"step": 4787
},
{
"epoch": 1.95,
"grad_norm": 2.6884994346679285,
"learning_rate": 5.44673431959831e-07,
"loss": 0.8135,
"step": 4788
},
{
"epoch": 1.95,
"grad_norm": 3.062182385355467,
"learning_rate": 5.442918862185277e-07,
"loss": 0.7632,
"step": 4789
},
{
"epoch": 1.95,
"grad_norm": 3.450091802354631,
"learning_rate": 5.439104241874579e-07,
"loss": 0.8233,
"step": 4790
},
{
"epoch": 1.96,
"grad_norm": 2.4831999481229468,
"learning_rate": 5.435290459366933e-07,
"loss": 0.8795,
"step": 4791
},
{
"epoch": 1.96,
"grad_norm": 2.4966778108114744,
"learning_rate": 5.431477515362894e-07,
"loss": 0.8576,
"step": 4792
},
{
"epoch": 1.96,
"grad_norm": 3.120818479951481,
"learning_rate": 5.427665410562877e-07,
"loss": 0.8459,
"step": 4793
},
{
"epoch": 1.96,
"grad_norm": 3.6094602545913386,
"learning_rate": 5.423854145667134e-07,
"loss": 0.7309,
"step": 4794
},
{
"epoch": 1.96,
"grad_norm": 2.9433361892995276,
"learning_rate": 5.420043721375771e-07,
"loss": 0.7644,
"step": 4795
},
{
"epoch": 1.96,
"grad_norm": 3.495615969762915,
"learning_rate": 5.416234138388733e-07,
"loss": 0.7336,
"step": 4796
},
{
"epoch": 1.96,
"grad_norm": 2.73595709458692,
"learning_rate": 5.412425397405813e-07,
"loss": 0.7556,
"step": 4797
},
{
"epoch": 1.96,
"grad_norm": 3.1703283829411992,
"learning_rate": 5.408617499126649e-07,
"loss": 0.8073,
"step": 4798
},
{
"epoch": 1.96,
"grad_norm": 3.363978483631288,
"learning_rate": 5.404810444250721e-07,
"loss": 0.8131,
"step": 4799
},
{
"epoch": 1.96,
"grad_norm": 3.30257562206828,
"learning_rate": 5.40100423347736e-07,
"loss": 0.7567,
"step": 4800
},
{
"epoch": 1.96,
"eval_loss": 0.8677650094032288,
"eval_runtime": 466.1396,
"eval_samples_per_second": 74.767,
"eval_steps_per_second": 4.675,
"step": 4800
},
{
"epoch": 1.96,
"grad_norm": 3.0467597043358547,
"learning_rate": 5.397198867505739e-07,
"loss": 0.7364,
"step": 4801
},
{
"epoch": 1.96,
"grad_norm": 3.9086181629032004,
"learning_rate": 5.393394347034874e-07,
"loss": 0.8319,
"step": 4802
},
{
"epoch": 1.96,
"grad_norm": 2.649891965311411,
"learning_rate": 5.389590672763632e-07,
"loss": 0.8307,
"step": 4803
},
{
"epoch": 1.96,
"grad_norm": 2.384019239255758,
"learning_rate": 5.385787845390709e-07,
"loss": 0.8016,
"step": 4804
},
{
"epoch": 1.96,
"grad_norm": 2.3663821646177663,
"learning_rate": 5.381985865614668e-07,
"loss": 0.8133,
"step": 4805
},
{
"epoch": 1.96,
"grad_norm": 2.9265834929828514,
"learning_rate": 5.378184734133901e-07,
"loss": 0.7894,
"step": 4806
},
{
"epoch": 1.96,
"grad_norm": 2.5363213038037946,
"learning_rate": 5.374384451646647e-07,
"loss": 0.8959,
"step": 4807
},
{
"epoch": 1.96,
"grad_norm": 2.2595127044540795,
"learning_rate": 5.370585018850995e-07,
"loss": 0.7908,
"step": 4808
},
{
"epoch": 1.96,
"grad_norm": 2.7921623301813843,
"learning_rate": 5.366786436444863e-07,
"loss": 0.7758,
"step": 4809
},
{
"epoch": 1.96,
"grad_norm": 2.674977075533869,
"learning_rate": 5.362988705126026e-07,
"loss": 0.8495,
"step": 4810
},
{
"epoch": 1.96,
"grad_norm": 2.8035062069019134,
"learning_rate": 5.359191825592105e-07,
"loss": 0.8087,
"step": 4811
},
{
"epoch": 1.96,
"grad_norm": 3.3227128142797415,
"learning_rate": 5.355395798540558e-07,
"loss": 0.7729,
"step": 4812
},
{
"epoch": 1.96,
"grad_norm": 2.634695021463479,
"learning_rate": 5.351600624668682e-07,
"loss": 0.839,
"step": 4813
},
{
"epoch": 1.96,
"grad_norm": 2.5354916747726755,
"learning_rate": 5.347806304673626e-07,
"loss": 0.8957,
"step": 4814
},
{
"epoch": 1.96,
"grad_norm": 3.036447663404858,
"learning_rate": 5.344012839252372e-07,
"loss": 0.7777,
"step": 4815
},
{
"epoch": 1.97,
"grad_norm": 2.7396158496220417,
"learning_rate": 5.340220229101766e-07,
"loss": 0.872,
"step": 4816
},
{
"epoch": 1.97,
"grad_norm": 3.4495150080469186,
"learning_rate": 5.336428474918471e-07,
"loss": 0.8206,
"step": 4817
},
{
"epoch": 1.97,
"grad_norm": 3.117819577487548,
"learning_rate": 5.332637577399006e-07,
"loss": 0.802,
"step": 4818
},
{
"epoch": 1.97,
"grad_norm": 3.845568431986155,
"learning_rate": 5.328847537239734e-07,
"loss": 0.8419,
"step": 4819
},
{
"epoch": 1.97,
"grad_norm": 2.7856837739488447,
"learning_rate": 5.325058355136858e-07,
"loss": 0.776,
"step": 4820
},
{
"epoch": 1.97,
"grad_norm": 2.961741066103992,
"learning_rate": 5.321270031786418e-07,
"loss": 0.8936,
"step": 4821
},
{
"epoch": 1.97,
"grad_norm": 2.752690342408492,
"learning_rate": 5.317482567884306e-07,
"loss": 0.8256,
"step": 4822
},
{
"epoch": 1.97,
"grad_norm": 2.524239902406657,
"learning_rate": 5.313695964126249e-07,
"loss": 0.8568,
"step": 4823
},
{
"epoch": 1.97,
"grad_norm": 2.784614727661939,
"learning_rate": 5.309910221207818e-07,
"loss": 0.8383,
"step": 4824
},
{
"epoch": 1.97,
"grad_norm": 3.002769206937061,
"learning_rate": 5.30612533982443e-07,
"loss": 0.8363,
"step": 4825
},
{
"epoch": 1.97,
"grad_norm": 3.907127492990308,
"learning_rate": 5.302341320671335e-07,
"loss": 0.7766,
"step": 4826
},
{
"epoch": 1.97,
"grad_norm": 2.531428412063365,
"learning_rate": 5.29855816444363e-07,
"loss": 0.81,
"step": 4827
},
{
"epoch": 1.97,
"grad_norm": 4.004043615128322,
"learning_rate": 5.294775871836257e-07,
"loss": 0.7856,
"step": 4828
},
{
"epoch": 1.97,
"grad_norm": 3.008257822263623,
"learning_rate": 5.290994443543996e-07,
"loss": 0.8308,
"step": 4829
},
{
"epoch": 1.97,
"grad_norm": 2.415506753445954,
"learning_rate": 5.287213880261457e-07,
"loss": 0.8008,
"step": 4830
},
{
"epoch": 1.97,
"grad_norm": 2.7331605319121604,
"learning_rate": 5.283434182683112e-07,
"loss": 0.8363,
"step": 4831
},
{
"epoch": 1.97,
"grad_norm": 4.149086863492819,
"learning_rate": 5.279655351503261e-07,
"loss": 0.8467,
"step": 4832
},
{
"epoch": 1.97,
"grad_norm": 2.382307409429726,
"learning_rate": 5.275877387416052e-07,
"loss": 0.7954,
"step": 4833
},
{
"epoch": 1.97,
"grad_norm": 2.918071188597369,
"learning_rate": 5.27210029111546e-07,
"loss": 0.7926,
"step": 4834
},
{
"epoch": 1.97,
"grad_norm": 3.098087413245674,
"learning_rate": 5.268324063295315e-07,
"loss": 0.8306,
"step": 4835
},
{
"epoch": 1.97,
"grad_norm": 5.6611977669442,
"learning_rate": 5.264548704649277e-07,
"loss": 0.8239,
"step": 4836
},
{
"epoch": 1.97,
"grad_norm": 3.8789703825800084,
"learning_rate": 5.260774215870862e-07,
"loss": 0.7921,
"step": 4837
},
{
"epoch": 1.97,
"grad_norm": 2.613514565765164,
"learning_rate": 5.257000597653414e-07,
"loss": 0.8586,
"step": 4838
},
{
"epoch": 1.97,
"grad_norm": 2.8019826095848988,
"learning_rate": 5.25322785069011e-07,
"loss": 0.8737,
"step": 4839
},
{
"epoch": 1.98,
"grad_norm": 3.2675591784189075,
"learning_rate": 5.249455975673982e-07,
"loss": 0.7634,
"step": 4840
},
{
"epoch": 1.98,
"grad_norm": 2.389040374646795,
"learning_rate": 5.24568497329789e-07,
"loss": 0.7968,
"step": 4841
},
{
"epoch": 1.98,
"grad_norm": 4.009339406796333,
"learning_rate": 5.241914844254554e-07,
"loss": 0.8227,
"step": 4842
},
{
"epoch": 1.98,
"grad_norm": 2.8710498502078967,
"learning_rate": 5.238145589236503e-07,
"loss": 0.8848,
"step": 4843
},
{
"epoch": 1.98,
"grad_norm": 2.8372472827817314,
"learning_rate": 5.23437720893613e-07,
"loss": 0.8467,
"step": 4844
},
{
"epoch": 1.98,
"grad_norm": 3.2175609625977706,
"learning_rate": 5.230609704045655e-07,
"loss": 0.835,
"step": 4845
},
{
"epoch": 1.98,
"grad_norm": 2.4864820866357134,
"learning_rate": 5.226843075257143e-07,
"loss": 0.8182,
"step": 4846
},
{
"epoch": 1.98,
"grad_norm": 3.268229119091705,
"learning_rate": 5.223077323262496e-07,
"loss": 0.8345,
"step": 4847
},
{
"epoch": 1.98,
"grad_norm": 2.648789727592603,
"learning_rate": 5.219312448753454e-07,
"loss": 0.797,
"step": 4848
},
{
"epoch": 1.98,
"grad_norm": 3.1538317798239976,
"learning_rate": 5.215548452421598e-07,
"loss": 0.7541,
"step": 4849
},
{
"epoch": 1.98,
"grad_norm": 2.5509913569705533,
"learning_rate": 5.211785334958345e-07,
"loss": 0.8087,
"step": 4850
},
{
"epoch": 1.98,
"grad_norm": 2.7633741159568657,
"learning_rate": 5.208023097054954e-07,
"loss": 0.8273,
"step": 4851
},
{
"epoch": 1.98,
"grad_norm": 3.140312270670465,
"learning_rate": 5.204261739402518e-07,
"loss": 0.8373,
"step": 4852
},
{
"epoch": 1.98,
"grad_norm": 3.2888973898562166,
"learning_rate": 5.200501262691973e-07,
"loss": 0.7542,
"step": 4853
},
{
"epoch": 1.98,
"grad_norm": 2.8486425522309906,
"learning_rate": 5.196741667614091e-07,
"loss": 0.7739,
"step": 4854
},
{
"epoch": 1.98,
"grad_norm": 2.6666761246021817,
"learning_rate": 5.192982954859483e-07,
"loss": 0.8078,
"step": 4855
},
{
"epoch": 1.98,
"grad_norm": 2.615894834179495,
"learning_rate": 5.189225125118588e-07,
"loss": 0.7707,
"step": 4856
},
{
"epoch": 1.98,
"grad_norm": 2.8538204403625853,
"learning_rate": 5.185468179081702e-07,
"loss": 0.8301,
"step": 4857
},
{
"epoch": 1.98,
"grad_norm": 2.899897649904134,
"learning_rate": 5.181712117438945e-07,
"loss": 0.8585,
"step": 4858
},
{
"epoch": 1.98,
"grad_norm": 2.748601576162454,
"learning_rate": 5.177956940880284e-07,
"loss": 0.8602,
"step": 4859
},
{
"epoch": 1.98,
"grad_norm": 3.330078646570928,
"learning_rate": 5.174202650095504e-07,
"loss": 0.7693,
"step": 4860
},
{
"epoch": 1.98,
"grad_norm": 3.1150234652826154,
"learning_rate": 5.170449245774251e-07,
"loss": 0.8171,
"step": 4861
},
{
"epoch": 1.98,
"grad_norm": 4.702843855029057,
"learning_rate": 5.166696728605991e-07,
"loss": 0.7774,
"step": 4862
},
{
"epoch": 1.98,
"grad_norm": 2.9438818634881185,
"learning_rate": 5.162945099280038e-07,
"loss": 0.7775,
"step": 4863
},
{
"epoch": 1.98,
"grad_norm": 3.080922398880032,
"learning_rate": 5.159194358485545e-07,
"loss": 0.8223,
"step": 4864
},
{
"epoch": 1.99,
"grad_norm": 2.610240784864806,
"learning_rate": 5.155444506911486e-07,
"loss": 0.8386,
"step": 4865
},
{
"epoch": 1.99,
"grad_norm": 4.5497895006186235,
"learning_rate": 5.151695545246681e-07,
"loss": 0.8276,
"step": 4866
},
{
"epoch": 1.99,
"grad_norm": 3.1417720929413697,
"learning_rate": 5.147947474179788e-07,
"loss": 0.8048,
"step": 4867
},
{
"epoch": 1.99,
"grad_norm": 4.015310737992175,
"learning_rate": 5.14420029439931e-07,
"loss": 0.7542,
"step": 4868
},
{
"epoch": 1.99,
"grad_norm": 2.9160291986958446,
"learning_rate": 5.140454006593564e-07,
"loss": 0.8164,
"step": 4869
},
{
"epoch": 1.99,
"grad_norm": 2.8042418494136148,
"learning_rate": 5.13670861145072e-07,
"loss": 0.8369,
"step": 4870
},
{
"epoch": 1.99,
"grad_norm": 3.507526060247667,
"learning_rate": 5.132964109658779e-07,
"loss": 0.7635,
"step": 4871
},
{
"epoch": 1.99,
"grad_norm": 3.33911140583579,
"learning_rate": 5.129220501905579e-07,
"loss": 0.8702,
"step": 4872
},
{
"epoch": 1.99,
"grad_norm": 3.1543223790040793,
"learning_rate": 5.125477788878794e-07,
"loss": 0.8536,
"step": 4873
},
{
"epoch": 1.99,
"grad_norm": 3.5322926902262357,
"learning_rate": 5.12173597126593e-07,
"loss": 0.7611,
"step": 4874
},
{
"epoch": 1.99,
"grad_norm": 2.8963098676315497,
"learning_rate": 5.117995049754335e-07,
"loss": 0.8088,
"step": 4875
},
{
"epoch": 1.99,
"grad_norm": 3.972816663407247,
"learning_rate": 5.114255025031188e-07,
"loss": 0.8207,
"step": 4876
},
{
"epoch": 1.99,
"grad_norm": 3.5672775124214717,
"learning_rate": 5.110515897783502e-07,
"loss": 0.8323,
"step": 4877
},
{
"epoch": 1.99,
"grad_norm": 2.5029668221278425,
"learning_rate": 5.10677766869813e-07,
"loss": 0.865,
"step": 4878
},
{
"epoch": 1.99,
"grad_norm": 4.406727734564054,
"learning_rate": 5.103040338461755e-07,
"loss": 0.8462,
"step": 4879
},
{
"epoch": 1.99,
"grad_norm": 2.690892635349046,
"learning_rate": 5.099303907760897e-07,
"loss": 0.8536,
"step": 4880
},
{
"epoch": 1.99,
"grad_norm": 2.9832644339266237,
"learning_rate": 5.095568377281917e-07,
"loss": 0.8704,
"step": 4881
},
{
"epoch": 1.99,
"grad_norm": 2.8656093958996744,
"learning_rate": 5.091833747710991e-07,
"loss": 0.7902,
"step": 4882
},
{
"epoch": 1.99,
"grad_norm": 2.6402985512161252,
"learning_rate": 5.088100019734155e-07,
"loss": 0.8215,
"step": 4883
},
{
"epoch": 1.99,
"grad_norm": 2.775751361545222,
"learning_rate": 5.084367194037265e-07,
"loss": 0.8318,
"step": 4884
},
{
"epoch": 1.99,
"grad_norm": 2.895296926163711,
"learning_rate": 5.080635271306015e-07,
"loss": 0.7831,
"step": 4885
},
{
"epoch": 1.99,
"grad_norm": 3.027947334958269,
"learning_rate": 5.076904252225924e-07,
"loss": 0.8263,
"step": 4886
},
{
"epoch": 1.99,
"grad_norm": 2.6763238285475106,
"learning_rate": 5.073174137482355e-07,
"loss": 0.7641,
"step": 4887
},
{
"epoch": 1.99,
"grad_norm": 3.190229108970956,
"learning_rate": 5.069444927760512e-07,
"loss": 0.8547,
"step": 4888
},
{
"epoch": 2.0,
"grad_norm": 3.304367331490058,
"learning_rate": 5.065716623745417e-07,
"loss": 0.8281,
"step": 4889
},
{
"epoch": 2.0,
"grad_norm": 3.139228031836925,
"learning_rate": 5.06198922612193e-07,
"loss": 0.8836,
"step": 4890
},
{
"epoch": 2.0,
"grad_norm": 2.9607465563217144,
"learning_rate": 5.05826273557475e-07,
"loss": 0.7506,
"step": 4891
},
{
"epoch": 2.0,
"grad_norm": 2.9970196615255746,
"learning_rate": 5.054537152788403e-07,
"loss": 0.7387,
"step": 4892
},
{
"epoch": 2.0,
"grad_norm": 3.1380262676385513,
"learning_rate": 5.05081247844725e-07,
"loss": 0.7896,
"step": 4893
},
{
"epoch": 2.0,
"grad_norm": 3.0221572540151875,
"learning_rate": 5.047088713235495e-07,
"loss": 0.9002,
"step": 4894
},
{
"epoch": 2.0,
"grad_norm": 5.615345377955368,
"learning_rate": 5.043365857837159e-07,
"loss": 0.879,
"step": 4895
},
{
"epoch": 2.0,
"grad_norm": 3.8199617727270674,
"learning_rate": 5.039643912936103e-07,
"loss": 0.775,
"step": 4896
},
{
"epoch": 2.0,
"grad_norm": 3.3234761055193216,
"learning_rate": 5.035922879216023e-07,
"loss": 0.8259,
"step": 4897
},
{
"epoch": 2.0,
"grad_norm": 3.6136396868614113,
"learning_rate": 5.032202757360444e-07,
"loss": 0.727,
"step": 4898
},
{
"epoch": 2.0,
"grad_norm": 3.303570056849936,
"learning_rate": 5.028483548052729e-07,
"loss": 0.7833,
"step": 4899
},
{
"epoch": 2.0,
"grad_norm": 3.4342804364527884,
"learning_rate": 5.024765251976065e-07,
"loss": 0.8172,
"step": 4900
},
{
"epoch": 2.0,
"eval_loss": 0.8676137924194336,
"eval_runtime": 467.1133,
"eval_samples_per_second": 74.611,
"eval_steps_per_second": 4.665,
"step": 4900
}
],
"logging_steps": 1.0,
"max_steps": 7350,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 417837320110080.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}