Files
qwen3-4b-think/checkpoint-500/trainer_state.json
ModelHub XC eac7119977 初始化项目,由ModelHub XC社区提供模型
Model: amphora/qwen3-4b-think
Source: Original Platform
2026-06-13 09:26:16 +08:00

3535 lines
91 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6501650165016502,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033003300330033004,
"grad_norm": 10.81499361768409,
"learning_rate": 0.0,
"loss": 1.2079360485076904,
"step": 1
},
{
"epoch": 0.006600660066006601,
"grad_norm": 10.226770877445293,
"learning_rate": 4.395604395604396e-07,
"loss": 1.123347520828247,
"step": 2
},
{
"epoch": 0.009900990099009901,
"grad_norm": 11.292644267807786,
"learning_rate": 8.791208791208792e-07,
"loss": 1.261695384979248,
"step": 3
},
{
"epoch": 0.013201320132013201,
"grad_norm": 10.504638106263508,
"learning_rate": 1.3186813186813187e-06,
"loss": 1.1276888847351074,
"step": 4
},
{
"epoch": 0.0165016501650165,
"grad_norm": 10.822100601159539,
"learning_rate": 1.7582417582417585e-06,
"loss": 1.2254480123519897,
"step": 5
},
{
"epoch": 0.019801980198019802,
"grad_norm": 9.905516433474448,
"learning_rate": 2.197802197802198e-06,
"loss": 1.1809396743774414,
"step": 6
},
{
"epoch": 0.0231023102310231,
"grad_norm": 9.323364829402967,
"learning_rate": 2.6373626373626375e-06,
"loss": 1.2000095844268799,
"step": 7
},
{
"epoch": 0.026402640264026403,
"grad_norm": 6.706098746162178,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.0248074531555176,
"step": 8
},
{
"epoch": 0.0297029702970297,
"grad_norm": 5.761138380327878,
"learning_rate": 3.516483516483517e-06,
"loss": 1.0840561389923096,
"step": 9
},
{
"epoch": 0.033003300330033,
"grad_norm": 2.7364343552329315,
"learning_rate": 3.9560439560439565e-06,
"loss": 0.955639123916626,
"step": 10
},
{
"epoch": 0.036303630363036306,
"grad_norm": 2.113810438625661,
"learning_rate": 4.395604395604396e-06,
"loss": 0.9281604290008545,
"step": 11
},
{
"epoch": 0.039603960396039604,
"grad_norm": 1.849238684536393,
"learning_rate": 4.8351648351648355e-06,
"loss": 0.9079018831253052,
"step": 12
},
{
"epoch": 0.0429042904290429,
"grad_norm": 1.6747171029255208,
"learning_rate": 5.274725274725275e-06,
"loss": 0.9039217233657837,
"step": 13
},
{
"epoch": 0.0462046204620462,
"grad_norm": 2.0121666555693416,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.8910936117172241,
"step": 14
},
{
"epoch": 0.04950495049504951,
"grad_norm": 2.0600124028897526,
"learning_rate": 6.153846153846155e-06,
"loss": 0.895532488822937,
"step": 15
},
{
"epoch": 0.052805280528052806,
"grad_norm": 2.0613449368510044,
"learning_rate": 6.5934065934065935e-06,
"loss": 0.8889240622520447,
"step": 16
},
{
"epoch": 0.056105610561056105,
"grad_norm": 1.785450637059245,
"learning_rate": 7.032967032967034e-06,
"loss": 0.8499570488929749,
"step": 17
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.5894161631201256,
"learning_rate": 7.472527472527473e-06,
"loss": 0.839992105960846,
"step": 18
},
{
"epoch": 0.0627062706270627,
"grad_norm": 1.1904834264503976,
"learning_rate": 7.912087912087913e-06,
"loss": 0.7718420028686523,
"step": 19
},
{
"epoch": 0.066006600660066,
"grad_norm": 1.0397335564670163,
"learning_rate": 8.351648351648353e-06,
"loss": 0.7865867614746094,
"step": 20
},
{
"epoch": 0.06930693069306931,
"grad_norm": 0.8314739102256958,
"learning_rate": 8.791208791208792e-06,
"loss": 0.7982739806175232,
"step": 21
},
{
"epoch": 0.07260726072607261,
"grad_norm": 0.6542597896181986,
"learning_rate": 9.230769230769232e-06,
"loss": 0.7846421599388123,
"step": 22
},
{
"epoch": 0.07590759075907591,
"grad_norm": 0.6269389928815381,
"learning_rate": 9.670329670329671e-06,
"loss": 0.7005743980407715,
"step": 23
},
{
"epoch": 0.07920792079207921,
"grad_norm": 0.6603922634859757,
"learning_rate": 1.010989010989011e-05,
"loss": 0.7084314227104187,
"step": 24
},
{
"epoch": 0.08250825082508251,
"grad_norm": 0.6856248928818359,
"learning_rate": 1.054945054945055e-05,
"loss": 0.7310304641723633,
"step": 25
},
{
"epoch": 0.0858085808580858,
"grad_norm": 0.5728331825854258,
"learning_rate": 1.098901098901099e-05,
"loss": 0.7056888341903687,
"step": 26
},
{
"epoch": 0.0891089108910891,
"grad_norm": 0.47956485465857923,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.6987950205802917,
"step": 27
},
{
"epoch": 0.0924092409240924,
"grad_norm": 0.47407141179043555,
"learning_rate": 1.186813186813187e-05,
"loss": 0.7319807410240173,
"step": 28
},
{
"epoch": 0.09570957095709572,
"grad_norm": 0.4856924244101555,
"learning_rate": 1.230769230769231e-05,
"loss": 0.6983063220977783,
"step": 29
},
{
"epoch": 0.09900990099009901,
"grad_norm": 0.49122925908544063,
"learning_rate": 1.2747252747252747e-05,
"loss": 0.70492023229599,
"step": 30
},
{
"epoch": 0.10231023102310231,
"grad_norm": 0.4556788168903923,
"learning_rate": 1.3186813186813187e-05,
"loss": 0.7376629114151001,
"step": 31
},
{
"epoch": 0.10561056105610561,
"grad_norm": 0.4272838300827657,
"learning_rate": 1.3626373626373627e-05,
"loss": 0.6623936295509338,
"step": 32
},
{
"epoch": 0.10891089108910891,
"grad_norm": 0.40886227927218277,
"learning_rate": 1.4065934065934068e-05,
"loss": 0.7136330604553223,
"step": 33
},
{
"epoch": 0.11221122112211221,
"grad_norm": 0.37821179606418975,
"learning_rate": 1.4505494505494506e-05,
"loss": 0.7113747596740723,
"step": 34
},
{
"epoch": 0.11551155115511551,
"grad_norm": 0.4538557716923258,
"learning_rate": 1.4945054945054947e-05,
"loss": 0.8252867460250854,
"step": 35
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.3875808052898815,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.7406599521636963,
"step": 36
},
{
"epoch": 0.12211221122112212,
"grad_norm": 0.3503240143986989,
"learning_rate": 1.5824175824175826e-05,
"loss": 0.6572297811508179,
"step": 37
},
{
"epoch": 0.1254125412541254,
"grad_norm": 0.3779655372487014,
"learning_rate": 1.6263736263736265e-05,
"loss": 0.7520949840545654,
"step": 38
},
{
"epoch": 0.12871287128712872,
"grad_norm": 0.36968690038350466,
"learning_rate": 1.6703296703296707e-05,
"loss": 0.6861323118209839,
"step": 39
},
{
"epoch": 0.132013201320132,
"grad_norm": 0.3724328241107235,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.6818518042564392,
"step": 40
},
{
"epoch": 0.1353135313531353,
"grad_norm": 0.35542054984937593,
"learning_rate": 1.7582417582417584e-05,
"loss": 0.6663186550140381,
"step": 41
},
{
"epoch": 0.13861386138613863,
"grad_norm": 0.3441266617586836,
"learning_rate": 1.8021978021978023e-05,
"loss": 0.6492191553115845,
"step": 42
},
{
"epoch": 0.1419141914191419,
"grad_norm": 0.3478448092762331,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.6444741487503052,
"step": 43
},
{
"epoch": 0.14521452145214522,
"grad_norm": 0.34951148057960574,
"learning_rate": 1.8901098901098903e-05,
"loss": 0.6476814150810242,
"step": 44
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.3356672452160599,
"learning_rate": 1.9340659340659342e-05,
"loss": 0.6660827994346619,
"step": 45
},
{
"epoch": 0.15181518151815182,
"grad_norm": 0.30809956365723695,
"learning_rate": 1.9780219780219784e-05,
"loss": 0.6924091577529907,
"step": 46
},
{
"epoch": 0.1551155115511551,
"grad_norm": 0.9030699054312887,
"learning_rate": 2.021978021978022e-05,
"loss": 0.6899605989456177,
"step": 47
},
{
"epoch": 0.15841584158415842,
"grad_norm": 0.35784060194946976,
"learning_rate": 2.0659340659340665e-05,
"loss": 0.7242028713226318,
"step": 48
},
{
"epoch": 0.1617161716171617,
"grad_norm": 0.3093966721093651,
"learning_rate": 2.10989010989011e-05,
"loss": 0.6203902959823608,
"step": 49
},
{
"epoch": 0.16501650165016502,
"grad_norm": 0.4242705872636108,
"learning_rate": 2.153846153846154e-05,
"loss": 0.6420010328292847,
"step": 50
},
{
"epoch": 0.16831683168316833,
"grad_norm": 0.35079960590346965,
"learning_rate": 2.197802197802198e-05,
"loss": 0.7517598867416382,
"step": 51
},
{
"epoch": 0.1716171617161716,
"grad_norm": 0.3078803790362521,
"learning_rate": 2.241758241758242e-05,
"loss": 0.6568161249160767,
"step": 52
},
{
"epoch": 0.17491749174917492,
"grad_norm": 0.34666662805484005,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.7348504662513733,
"step": 53
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.302791415801781,
"learning_rate": 2.32967032967033e-05,
"loss": 0.6164949536323547,
"step": 54
},
{
"epoch": 0.18151815181518152,
"grad_norm": 0.33732756727763136,
"learning_rate": 2.373626373626374e-05,
"loss": 0.6505363583564758,
"step": 55
},
{
"epoch": 0.1848184818481848,
"grad_norm": 0.34780152362496847,
"learning_rate": 2.4175824175824177e-05,
"loss": 0.7562520503997803,
"step": 56
},
{
"epoch": 0.18811881188118812,
"grad_norm": 0.3310895358869482,
"learning_rate": 2.461538461538462e-05,
"loss": 0.6943148374557495,
"step": 57
},
{
"epoch": 0.19141914191419143,
"grad_norm": 0.3367877938063833,
"learning_rate": 2.5054945054945058e-05,
"loss": 0.6571655869483948,
"step": 58
},
{
"epoch": 0.19471947194719472,
"grad_norm": 0.32103256018771714,
"learning_rate": 2.5494505494505493e-05,
"loss": 0.7229321002960205,
"step": 59
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.30468399230672144,
"learning_rate": 2.593406593406594e-05,
"loss": 0.6307672262191772,
"step": 60
},
{
"epoch": 0.20132013201320131,
"grad_norm": 0.3282635121595526,
"learning_rate": 2.6373626373626374e-05,
"loss": 0.6336506009101868,
"step": 61
},
{
"epoch": 0.20462046204620463,
"grad_norm": 0.3280360563022675,
"learning_rate": 2.6813186813186813e-05,
"loss": 0.6492213010787964,
"step": 62
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.3292430577817229,
"learning_rate": 2.7252747252747255e-05,
"loss": 0.6763280034065247,
"step": 63
},
{
"epoch": 0.21122112211221122,
"grad_norm": 0.47832355846700536,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.7322396039962769,
"step": 64
},
{
"epoch": 0.2145214521452145,
"grad_norm": 0.31915340164178446,
"learning_rate": 2.8131868131868136e-05,
"loss": 0.7080870270729065,
"step": 65
},
{
"epoch": 0.21782178217821782,
"grad_norm": 0.3227571040968621,
"learning_rate": 2.8571428571428574e-05,
"loss": 0.6054466962814331,
"step": 66
},
{
"epoch": 0.22112211221122113,
"grad_norm": 0.33375713186655664,
"learning_rate": 2.9010989010989013e-05,
"loss": 0.6782290935516357,
"step": 67
},
{
"epoch": 0.22442244224422442,
"grad_norm": 0.3437770801965916,
"learning_rate": 2.9450549450549455e-05,
"loss": 0.6804753541946411,
"step": 68
},
{
"epoch": 0.22772277227722773,
"grad_norm": 0.3228427319313703,
"learning_rate": 2.9890109890109894e-05,
"loss": 0.6493992805480957,
"step": 69
},
{
"epoch": 0.23102310231023102,
"grad_norm": 0.3540211756840673,
"learning_rate": 3.0329670329670332e-05,
"loss": 0.6263789534568787,
"step": 70
},
{
"epoch": 0.23432343234323433,
"grad_norm": 0.34989089824503405,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.6960322856903076,
"step": 71
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.33624443163866324,
"learning_rate": 3.120879120879121e-05,
"loss": 0.6146604418754578,
"step": 72
},
{
"epoch": 0.24092409240924093,
"grad_norm": 0.39618402867027047,
"learning_rate": 3.164835164835165e-05,
"loss": 0.6361377239227295,
"step": 73
},
{
"epoch": 0.24422442244224424,
"grad_norm": 0.361603087273114,
"learning_rate": 3.2087912087912094e-05,
"loss": 0.636134147644043,
"step": 74
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.37985663132790304,
"learning_rate": 3.252747252747253e-05,
"loss": 0.5936564803123474,
"step": 75
},
{
"epoch": 0.2508250825082508,
"grad_norm": 0.35883234873646996,
"learning_rate": 3.296703296703297e-05,
"loss": 0.6001103520393372,
"step": 76
},
{
"epoch": 0.25412541254125415,
"grad_norm": 0.35227803701073973,
"learning_rate": 3.340659340659341e-05,
"loss": 0.6254594326019287,
"step": 77
},
{
"epoch": 0.25742574257425743,
"grad_norm": 0.3563257650896171,
"learning_rate": 3.384615384615385e-05,
"loss": 0.6457959413528442,
"step": 78
},
{
"epoch": 0.2607260726072607,
"grad_norm": 0.37234316340556584,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.6186954975128174,
"step": 79
},
{
"epoch": 0.264026402640264,
"grad_norm": 0.35352748449766547,
"learning_rate": 3.4725274725274726e-05,
"loss": 0.6175529956817627,
"step": 80
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.35441369709658355,
"learning_rate": 3.516483516483517e-05,
"loss": 0.6694468259811401,
"step": 81
},
{
"epoch": 0.2706270627062706,
"grad_norm": 0.39955400784840756,
"learning_rate": 3.56043956043956e-05,
"loss": 0.627490222454071,
"step": 82
},
{
"epoch": 0.2739273927392739,
"grad_norm": 0.38314031523497477,
"learning_rate": 3.6043956043956045e-05,
"loss": 0.6410495638847351,
"step": 83
},
{
"epoch": 0.27722772277227725,
"grad_norm": 0.36926215386141575,
"learning_rate": 3.648351648351649e-05,
"loss": 0.6305102109909058,
"step": 84
},
{
"epoch": 0.28052805280528054,
"grad_norm": 0.38364118080284076,
"learning_rate": 3.692307692307693e-05,
"loss": 0.6558895111083984,
"step": 85
},
{
"epoch": 0.2838283828382838,
"grad_norm": 0.3370292682974053,
"learning_rate": 3.7362637362637365e-05,
"loss": 0.6029388308525085,
"step": 86
},
{
"epoch": 0.2871287128712871,
"grad_norm": 0.39541874871701704,
"learning_rate": 3.7802197802197807e-05,
"loss": 0.6551017761230469,
"step": 87
},
{
"epoch": 0.29042904290429045,
"grad_norm": 0.3629036550044273,
"learning_rate": 3.824175824175825e-05,
"loss": 0.6588809490203857,
"step": 88
},
{
"epoch": 0.29372937293729373,
"grad_norm": 0.37786447228212183,
"learning_rate": 3.8681318681318684e-05,
"loss": 0.614648699760437,
"step": 89
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.42911861803278684,
"learning_rate": 3.9120879120879126e-05,
"loss": 0.7034356594085693,
"step": 90
},
{
"epoch": 0.30033003300330036,
"grad_norm": 0.3707184094312094,
"learning_rate": 3.956043956043957e-05,
"loss": 0.6908263564109802,
"step": 91
},
{
"epoch": 0.30363036303630364,
"grad_norm": 0.38262186656216063,
"learning_rate": 4e-05,
"loss": 0.6882215738296509,
"step": 92
},
{
"epoch": 0.3069306930693069,
"grad_norm": 0.3709464296309744,
"learning_rate": 3.999985249980169e-05,
"loss": 0.6377270221710205,
"step": 93
},
{
"epoch": 0.3102310231023102,
"grad_norm": 0.3412837406106036,
"learning_rate": 3.999941000138238e-05,
"loss": 0.6735270619392395,
"step": 94
},
{
"epoch": 0.31353135313531355,
"grad_norm": 0.40165192879996064,
"learning_rate": 3.999867251126893e-05,
"loss": 0.6934541463851929,
"step": 95
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.34707128601816045,
"learning_rate": 3.9997640040339335e-05,
"loss": 0.6367039084434509,
"step": 96
},
{
"epoch": 0.3201320132013201,
"grad_norm": 0.4268828113970776,
"learning_rate": 3.999631260382257e-05,
"loss": 0.6274522542953491,
"step": 97
},
{
"epoch": 0.3234323432343234,
"grad_norm": 0.454428833020686,
"learning_rate": 3.999469022129834e-05,
"loss": 0.5874066352844238,
"step": 98
},
{
"epoch": 0.32673267326732675,
"grad_norm": 0.4200675840489775,
"learning_rate": 3.9992772916696824e-05,
"loss": 0.6175942420959473,
"step": 99
},
{
"epoch": 0.33003300330033003,
"grad_norm": 0.3796321080056305,
"learning_rate": 3.99905607182983e-05,
"loss": 0.5625832080841064,
"step": 100
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.39108856096759403,
"learning_rate": 3.998805365873274e-05,
"loss": 0.6153020262718201,
"step": 101
},
{
"epoch": 0.33663366336633666,
"grad_norm": 0.3873560194436071,
"learning_rate": 3.998525177497932e-05,
"loss": 0.5585426092147827,
"step": 102
},
{
"epoch": 0.33993399339933994,
"grad_norm": 0.4084712106325698,
"learning_rate": 3.998215510836589e-05,
"loss": 0.6586359739303589,
"step": 103
},
{
"epoch": 0.3432343234323432,
"grad_norm": 0.4383246876899704,
"learning_rate": 3.997876370456833e-05,
"loss": 0.62096107006073,
"step": 104
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.4026893562706946,
"learning_rate": 3.997507761360993e-05,
"loss": 0.6059336066246033,
"step": 105
},
{
"epoch": 0.34983498349834985,
"grad_norm": 0.46586240044914223,
"learning_rate": 3.997109688986059e-05,
"loss": 0.617970883846283,
"step": 106
},
{
"epoch": 0.35313531353135313,
"grad_norm": 0.44949199032710474,
"learning_rate": 3.9966821592036066e-05,
"loss": 0.6453397274017334,
"step": 107
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.4794978158156406,
"learning_rate": 3.996225178319709e-05,
"loss": 0.6371763348579407,
"step": 108
},
{
"epoch": 0.35973597359735976,
"grad_norm": 0.4463512391721941,
"learning_rate": 3.9957387530748435e-05,
"loss": 0.5971124172210693,
"step": 109
},
{
"epoch": 0.36303630363036304,
"grad_norm": 0.368079413354641,
"learning_rate": 3.995222890643792e-05,
"loss": 0.5679532289505005,
"step": 110
},
{
"epoch": 0.36633663366336633,
"grad_norm": 0.43733705586285254,
"learning_rate": 3.9946775986355346e-05,
"loss": 0.5988069772720337,
"step": 111
},
{
"epoch": 0.3696369636963696,
"grad_norm": 0.38235582844960775,
"learning_rate": 3.994102885093141e-05,
"loss": 0.6352983713150024,
"step": 112
},
{
"epoch": 0.37293729372937295,
"grad_norm": 0.389837871286893,
"learning_rate": 3.993498758493646e-05,
"loss": 0.58957839012146,
"step": 113
},
{
"epoch": 0.37623762376237624,
"grad_norm": 0.40399856168911097,
"learning_rate": 3.992865227747929e-05,
"loss": 0.6396822929382324,
"step": 114
},
{
"epoch": 0.3795379537953795,
"grad_norm": 0.38891668976227123,
"learning_rate": 3.992202302200582e-05,
"loss": 0.6314754486083984,
"step": 115
},
{
"epoch": 0.38283828382838286,
"grad_norm": 0.4087528543828922,
"learning_rate": 3.991509991629769e-05,
"loss": 0.673650860786438,
"step": 116
},
{
"epoch": 0.38613861386138615,
"grad_norm": 0.36330054292020786,
"learning_rate": 3.990788306247085e-05,
"loss": 0.5813701152801514,
"step": 117
},
{
"epoch": 0.38943894389438943,
"grad_norm": 0.4247110332678589,
"learning_rate": 3.990037256697404e-05,
"loss": 0.6419334411621094,
"step": 118
},
{
"epoch": 0.3927392739273927,
"grad_norm": 0.4244126002071751,
"learning_rate": 3.989256854058721e-05,
"loss": 0.6319208145141602,
"step": 119
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.3651632933942853,
"learning_rate": 3.988447109841991e-05,
"loss": 0.5989845991134644,
"step": 120
},
{
"epoch": 0.39933993399339934,
"grad_norm": 0.393158353074077,
"learning_rate": 3.987608035990957e-05,
"loss": 0.5853303670883179,
"step": 121
},
{
"epoch": 0.40264026402640263,
"grad_norm": 0.35965233332276103,
"learning_rate": 3.986739644881975e-05,
"loss": 0.6115257143974304,
"step": 122
},
{
"epoch": 0.40594059405940597,
"grad_norm": 0.4252711474203845,
"learning_rate": 3.985841949323831e-05,
"loss": 0.6440504789352417,
"step": 123
},
{
"epoch": 0.40924092409240925,
"grad_norm": 0.5578797297271848,
"learning_rate": 3.984914962557553e-05,
"loss": 0.5765030384063721,
"step": 124
},
{
"epoch": 0.41254125412541254,
"grad_norm": 0.4362455029468141,
"learning_rate": 3.983958698256214e-05,
"loss": 0.6387556791305542,
"step": 125
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.39274811063076087,
"learning_rate": 3.98297317052473e-05,
"loss": 0.6263147592544556,
"step": 126
},
{
"epoch": 0.41914191419141916,
"grad_norm": 0.42682589637163704,
"learning_rate": 3.981958393899656e-05,
"loss": 0.6091845035552979,
"step": 127
},
{
"epoch": 0.42244224422442245,
"grad_norm": 0.4033131171538041,
"learning_rate": 3.980914383348967e-05,
"loss": 0.6458015441894531,
"step": 128
},
{
"epoch": 0.42574257425742573,
"grad_norm": 0.3881606915462862,
"learning_rate": 3.9798411542718395e-05,
"loss": 0.6115552186965942,
"step": 129
},
{
"epoch": 0.429042904290429,
"grad_norm": 0.38910317938225847,
"learning_rate": 3.978738722498423e-05,
"loss": 0.6427993774414062,
"step": 130
},
{
"epoch": 0.43234323432343236,
"grad_norm": 0.36836380096259913,
"learning_rate": 3.977607104289609e-05,
"loss": 0.6121467351913452,
"step": 131
},
{
"epoch": 0.43564356435643564,
"grad_norm": 0.3743062201629088,
"learning_rate": 3.9764463163367875e-05,
"loss": 0.5951442718505859,
"step": 132
},
{
"epoch": 0.4389438943894389,
"grad_norm": 0.3699746655092952,
"learning_rate": 3.9752563757616045e-05,
"loss": 0.6639472842216492,
"step": 133
},
{
"epoch": 0.44224422442244227,
"grad_norm": 0.37398919831188604,
"learning_rate": 3.974037300115706e-05,
"loss": 0.6084764003753662,
"step": 134
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.37043195153646374,
"learning_rate": 3.972789107380484e-05,
"loss": 0.6211085915565491,
"step": 135
},
{
"epoch": 0.44884488448844884,
"grad_norm": 0.3509837417375981,
"learning_rate": 3.9715118159668046e-05,
"loss": 0.6098147034645081,
"step": 136
},
{
"epoch": 0.4521452145214521,
"grad_norm": 0.3350785925775803,
"learning_rate": 3.970205444714742e-05,
"loss": 0.6155884861946106,
"step": 137
},
{
"epoch": 0.45544554455445546,
"grad_norm": 0.38529379761335925,
"learning_rate": 3.9688700128932975e-05,
"loss": 0.5984665155410767,
"step": 138
},
{
"epoch": 0.45874587458745875,
"grad_norm": 0.45130397769476205,
"learning_rate": 3.967505540200117e-05,
"loss": 0.6656880378723145,
"step": 139
},
{
"epoch": 0.46204620462046203,
"grad_norm": 0.3277874952439621,
"learning_rate": 3.966112046761201e-05,
"loss": 0.6607398390769958,
"step": 140
},
{
"epoch": 0.46534653465346537,
"grad_norm": 2.6727599644732267,
"learning_rate": 3.9646895531306046e-05,
"loss": 0.6578342914581299,
"step": 141
},
{
"epoch": 0.46864686468646866,
"grad_norm": 0.47429126269764676,
"learning_rate": 3.963238080290136e-05,
"loss": 0.6103699803352356,
"step": 142
},
{
"epoch": 0.47194719471947194,
"grad_norm": 0.32652590291724093,
"learning_rate": 3.96175764964905e-05,
"loss": 0.5484676957130432,
"step": 143
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.4531372955951849,
"learning_rate": 3.960248283043727e-05,
"loss": 0.578776478767395,
"step": 144
},
{
"epoch": 0.47854785478547857,
"grad_norm": 0.3685580706465372,
"learning_rate": 3.958710002737355e-05,
"loss": 0.6184446811676025,
"step": 145
},
{
"epoch": 0.48184818481848185,
"grad_norm": 0.3584005630962511,
"learning_rate": 3.9571428314195984e-05,
"loss": 0.6307916045188904,
"step": 146
},
{
"epoch": 0.48514851485148514,
"grad_norm": 0.4049679254542765,
"learning_rate": 3.955546792206265e-05,
"loss": 0.6064697504043579,
"step": 147
},
{
"epoch": 0.4884488448844885,
"grad_norm": 0.3846258995775384,
"learning_rate": 3.953921908638966e-05,
"loss": 0.6055655479431152,
"step": 148
},
{
"epoch": 0.49174917491749176,
"grad_norm": 0.3643318343315678,
"learning_rate": 3.952268204684765e-05,
"loss": 0.5856431126594543,
"step": 149
},
{
"epoch": 0.49504950495049505,
"grad_norm": 0.3854715521866927,
"learning_rate": 3.950585704735829e-05,
"loss": 0.6634635925292969,
"step": 150
},
{
"epoch": 0.49834983498349833,
"grad_norm": 0.34338835592304534,
"learning_rate": 3.948874433609065e-05,
"loss": 0.5880753397941589,
"step": 151
},
{
"epoch": 0.5016501650165016,
"grad_norm": 0.3481018111538647,
"learning_rate": 3.947134416545757e-05,
"loss": 0.5594221949577332,
"step": 152
},
{
"epoch": 0.504950495049505,
"grad_norm": 0.6570220882473125,
"learning_rate": 3.94536567921119e-05,
"loss": 0.664652407169342,
"step": 153
},
{
"epoch": 0.5082508250825083,
"grad_norm": 0.340048306266198,
"learning_rate": 3.9435682476942755e-05,
"loss": 0.6002815961837769,
"step": 154
},
{
"epoch": 0.5115511551155115,
"grad_norm": 0.3488682381523364,
"learning_rate": 3.941742148507163e-05,
"loss": 0.5905177593231201,
"step": 155
},
{
"epoch": 0.5148514851485149,
"grad_norm": 0.33062666453941425,
"learning_rate": 3.939887408584853e-05,
"loss": 0.5636795163154602,
"step": 156
},
{
"epoch": 0.5181518151815182,
"grad_norm": 0.35862086331061066,
"learning_rate": 3.938004055284796e-05,
"loss": 0.5639582276344299,
"step": 157
},
{
"epoch": 0.5214521452145214,
"grad_norm": 0.31769111173717246,
"learning_rate": 3.9360921163864895e-05,
"loss": 0.6515591144561768,
"step": 158
},
{
"epoch": 0.5247524752475248,
"grad_norm": 0.38401455820073427,
"learning_rate": 3.934151620091071e-05,
"loss": 0.5721683502197266,
"step": 159
},
{
"epoch": 0.528052805280528,
"grad_norm": 0.3284331200684813,
"learning_rate": 3.9321825950209e-05,
"loss": 0.5801802277565002,
"step": 160
},
{
"epoch": 0.5313531353135313,
"grad_norm": 0.3493998878359796,
"learning_rate": 3.9301850702191344e-05,
"loss": 0.603084921836853,
"step": 161
},
{
"epoch": 0.5346534653465347,
"grad_norm": 0.32233519110844616,
"learning_rate": 3.928159075149304e-05,
"loss": 0.6376925110816956,
"step": 162
},
{
"epoch": 0.5379537953795379,
"grad_norm": 0.35833134197704153,
"learning_rate": 3.926104639694877e-05,
"loss": 0.5764102935791016,
"step": 163
},
{
"epoch": 0.5412541254125413,
"grad_norm": 0.3523567199445224,
"learning_rate": 3.924021794158818e-05,
"loss": 0.6102188229560852,
"step": 164
},
{
"epoch": 0.5445544554455446,
"grad_norm": 0.36694222553878597,
"learning_rate": 3.921910569263139e-05,
"loss": 0.5833287835121155,
"step": 165
},
{
"epoch": 0.5478547854785478,
"grad_norm": 0.37179813198977807,
"learning_rate": 3.919770996148448e-05,
"loss": 0.5891385078430176,
"step": 166
},
{
"epoch": 0.5511551155115512,
"grad_norm": 0.3507301680001106,
"learning_rate": 3.917603106373493e-05,
"loss": 0.5838547348976135,
"step": 167
},
{
"epoch": 0.5544554455445545,
"grad_norm": 0.3134001311174479,
"learning_rate": 3.9154069319146904e-05,
"loss": 0.5727800726890564,
"step": 168
},
{
"epoch": 0.5577557755775577,
"grad_norm": 0.33531781904204605,
"learning_rate": 3.913182505165656e-05,
"loss": 0.6102641224861145,
"step": 169
},
{
"epoch": 0.5610561056105611,
"grad_norm": 0.35178976522027133,
"learning_rate": 3.91092985893673e-05,
"loss": 0.5718260407447815,
"step": 170
},
{
"epoch": 0.5643564356435643,
"grad_norm": 0.47006108726602863,
"learning_rate": 3.908649026454488e-05,
"loss": 0.6308504939079285,
"step": 171
},
{
"epoch": 0.5676567656765676,
"grad_norm": 0.3687514240026255,
"learning_rate": 3.906340041361255e-05,
"loss": 0.6089432835578918,
"step": 172
},
{
"epoch": 0.570957095709571,
"grad_norm": 0.3586674884704593,
"learning_rate": 3.904002937714606e-05,
"loss": 0.6583501696586609,
"step": 173
},
{
"epoch": 0.5742574257425742,
"grad_norm": 0.3399808047240735,
"learning_rate": 3.9016377499868666e-05,
"loss": 0.6108609437942505,
"step": 174
},
{
"epoch": 0.5775577557755776,
"grad_norm": 0.3840880337988826,
"learning_rate": 3.899244513064603e-05,
"loss": 0.63509202003479,
"step": 175
},
{
"epoch": 0.5808580858085809,
"grad_norm": 0.3725541644477348,
"learning_rate": 3.896823262248107e-05,
"loss": 0.5759241580963135,
"step": 176
},
{
"epoch": 0.5841584158415841,
"grad_norm": 0.30755721985114126,
"learning_rate": 3.8943740332508754e-05,
"loss": 0.6148169040679932,
"step": 177
},
{
"epoch": 0.5874587458745875,
"grad_norm": 0.3916756097057637,
"learning_rate": 3.891896862199086e-05,
"loss": 0.5266364216804504,
"step": 178
},
{
"epoch": 0.5907590759075908,
"grad_norm": 0.3417854779376455,
"learning_rate": 3.88939178563106e-05,
"loss": 0.5626640319824219,
"step": 179
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.33526488525207704,
"learning_rate": 3.886858840496727e-05,
"loss": 0.6063880920410156,
"step": 180
},
{
"epoch": 0.5973597359735974,
"grad_norm": 0.37344333250119977,
"learning_rate": 3.884298064157077e-05,
"loss": 0.5979235768318176,
"step": 181
},
{
"epoch": 0.6006600660066007,
"grad_norm": 0.3835133271197793,
"learning_rate": 3.881709494383612e-05,
"loss": 0.6628611087799072,
"step": 182
},
{
"epoch": 0.6039603960396039,
"grad_norm": 0.4344526004756121,
"learning_rate": 3.879093169357789e-05,
"loss": 0.6215270757675171,
"step": 183
},
{
"epoch": 0.6072607260726073,
"grad_norm": 0.3644174435488244,
"learning_rate": 3.876449127670452e-05,
"loss": 0.6148592233657837,
"step": 184
},
{
"epoch": 0.6105610561056105,
"grad_norm": 0.3619226265536735,
"learning_rate": 3.87377740832127e-05,
"loss": 0.6254778504371643,
"step": 185
},
{
"epoch": 0.6138613861386139,
"grad_norm": 0.3492162593840536,
"learning_rate": 3.871078050718155e-05,
"loss": 0.6025378704071045,
"step": 186
},
{
"epoch": 0.6171617161716172,
"grad_norm": 0.3866924759539626,
"learning_rate": 3.8683510946766866e-05,
"loss": 0.5887518525123596,
"step": 187
},
{
"epoch": 0.6204620462046204,
"grad_norm": 0.3357229513721586,
"learning_rate": 3.865596580419519e-05,
"loss": 0.6180317401885986,
"step": 188
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.3594949077768003,
"learning_rate": 3.8628145485757925e-05,
"loss": 0.5970651507377625,
"step": 189
},
{
"epoch": 0.6270627062706271,
"grad_norm": 0.3496234009951303,
"learning_rate": 3.860005040180533e-05,
"loss": 0.6027296781539917,
"step": 190
},
{
"epoch": 0.6303630363036303,
"grad_norm": 0.3830042583584045,
"learning_rate": 3.857168096674044e-05,
"loss": 0.6326305270195007,
"step": 191
},
{
"epoch": 0.6336633663366337,
"grad_norm": 0.333508477943962,
"learning_rate": 3.854303759901299e-05,
"loss": 0.6508482694625854,
"step": 192
},
{
"epoch": 0.636963696369637,
"grad_norm": 0.352327105927571,
"learning_rate": 3.851412072111322e-05,
"loss": 0.6088548302650452,
"step": 193
},
{
"epoch": 0.6402640264026402,
"grad_norm": 0.36196379228138037,
"learning_rate": 3.8484930759565645e-05,
"loss": 0.5975607633590698,
"step": 194
},
{
"epoch": 0.6435643564356436,
"grad_norm": 0.3231664855297077,
"learning_rate": 3.845546814492279e-05,
"loss": 0.5467930436134338,
"step": 195
},
{
"epoch": 0.6468646864686468,
"grad_norm": 0.35556526722817444,
"learning_rate": 3.8425733311758795e-05,
"loss": 0.583969235420227,
"step": 196
},
{
"epoch": 0.6501650165016502,
"grad_norm": 0.331073543443887,
"learning_rate": 3.8395726698663045e-05,
"loss": 0.6007376909255981,
"step": 197
},
{
"epoch": 0.6534653465346535,
"grad_norm": 0.34786293006180385,
"learning_rate": 3.836544874823368e-05,
"loss": 0.5971908569335938,
"step": 198
},
{
"epoch": 0.6567656765676567,
"grad_norm": 0.3128647628132879,
"learning_rate": 3.8334899907071064e-05,
"loss": 0.592069685459137,
"step": 199
},
{
"epoch": 0.6600660066006601,
"grad_norm": 0.3308125796746202,
"learning_rate": 3.830408062577121e-05,
"loss": 0.6188071966171265,
"step": 200
},
{
"epoch": 0.6633663366336634,
"grad_norm": 0.34889077565364124,
"learning_rate": 3.827299135891913e-05,
"loss": 0.5976923704147339,
"step": 201
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.33443153994631497,
"learning_rate": 3.8241632565082124e-05,
"loss": 0.6120954155921936,
"step": 202
},
{
"epoch": 0.66996699669967,
"grad_norm": 0.3573334503206899,
"learning_rate": 3.821000470680303e-05,
"loss": 0.6661979556083679,
"step": 203
},
{
"epoch": 0.6732673267326733,
"grad_norm": 0.34662331225184934,
"learning_rate": 3.8178108250593384e-05,
"loss": 0.5853559970855713,
"step": 204
},
{
"epoch": 0.6765676567656765,
"grad_norm": 0.33823171869993424,
"learning_rate": 3.814594366692654e-05,
"loss": 0.6648768186569214,
"step": 205
},
{
"epoch": 0.6798679867986799,
"grad_norm": 0.4178878629038068,
"learning_rate": 3.8113511430230745e-05,
"loss": 0.5893838405609131,
"step": 206
},
{
"epoch": 0.6831683168316832,
"grad_norm": 0.36858896529016355,
"learning_rate": 3.808081201888214e-05,
"loss": 0.6177140474319458,
"step": 207
},
{
"epoch": 0.6864686468646864,
"grad_norm": 0.38061402245158527,
"learning_rate": 3.8047845915197695e-05,
"loss": 0.5793695449829102,
"step": 208
},
{
"epoch": 0.6897689768976898,
"grad_norm": 0.3591315376932048,
"learning_rate": 3.8014613605428084e-05,
"loss": 0.5571605563163757,
"step": 209
},
{
"epoch": 0.693069306930693,
"grad_norm": 0.33319862057164595,
"learning_rate": 3.798111557975053e-05,
"loss": 0.5945760011672974,
"step": 210
},
{
"epoch": 0.6963696369636964,
"grad_norm": 0.3495679574237745,
"learning_rate": 3.7947352332261586e-05,
"loss": 0.600873589515686,
"step": 211
},
{
"epoch": 0.6996699669966997,
"grad_norm": 0.37390147639764304,
"learning_rate": 3.791332436096983e-05,
"loss": 0.6234852075576782,
"step": 212
},
{
"epoch": 0.7029702970297029,
"grad_norm": 0.3571653694610809,
"learning_rate": 3.7879032167788494e-05,
"loss": 0.6129578948020935,
"step": 213
},
{
"epoch": 0.7062706270627063,
"grad_norm": 0.48971881906384135,
"learning_rate": 3.784447625852812e-05,
"loss": 0.6204475164413452,
"step": 214
},
{
"epoch": 0.7095709570957096,
"grad_norm": 0.3610294548812676,
"learning_rate": 3.780965714288905e-05,
"loss": 0.6734122037887573,
"step": 215
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.35396639697907356,
"learning_rate": 3.777457533445393e-05,
"loss": 0.5678560137748718,
"step": 216
},
{
"epoch": 0.7161716171617162,
"grad_norm": 0.3232076597831296,
"learning_rate": 3.7739231350680135e-05,
"loss": 0.5784683227539062,
"step": 217
},
{
"epoch": 0.7194719471947195,
"grad_norm": 0.3540897506756201,
"learning_rate": 3.7703625712892125e-05,
"loss": 0.6060354113578796,
"step": 218
},
{
"epoch": 0.7227722772277227,
"grad_norm": 0.35008278157890194,
"learning_rate": 3.766775894627376e-05,
"loss": 0.6248741745948792,
"step": 219
},
{
"epoch": 0.7260726072607261,
"grad_norm": 0.32018676747331787,
"learning_rate": 3.7631631579860553e-05,
"loss": 0.6014479398727417,
"step": 220
},
{
"epoch": 0.7293729372937293,
"grad_norm": 0.32068744744726313,
"learning_rate": 3.759524414653189e-05,
"loss": 0.6283233761787415,
"step": 221
},
{
"epoch": 0.7326732673267327,
"grad_norm": 0.3047460979670785,
"learning_rate": 3.755859718300313e-05,
"loss": 0.5710185766220093,
"step": 222
},
{
"epoch": 0.735973597359736,
"grad_norm": 0.34698489216212486,
"learning_rate": 3.75216912298177e-05,
"loss": 0.6007407903671265,
"step": 223
},
{
"epoch": 0.7392739273927392,
"grad_norm": 0.4952362221345831,
"learning_rate": 3.748452683133916e-05,
"loss": 0.6852575540542603,
"step": 224
},
{
"epoch": 0.7425742574257426,
"grad_norm": 0.32106680253004655,
"learning_rate": 3.7447104535743115e-05,
"loss": 0.6270833611488342,
"step": 225
},
{
"epoch": 0.7458745874587459,
"grad_norm": 0.30214814189665545,
"learning_rate": 3.740942489500916e-05,
"loss": 0.5925471782684326,
"step": 226
},
{
"epoch": 0.7491749174917491,
"grad_norm": 0.3171932777170319,
"learning_rate": 3.737148846491275e-05,
"loss": 0.573570728302002,
"step": 227
},
{
"epoch": 0.7524752475247525,
"grad_norm": 0.31480815810804524,
"learning_rate": 3.7333295805016986e-05,
"loss": 0.6088368892669678,
"step": 228
},
{
"epoch": 0.7557755775577558,
"grad_norm": 0.3103068539492526,
"learning_rate": 3.729484747866435e-05,
"loss": 0.5496470332145691,
"step": 229
},
{
"epoch": 0.759075907590759,
"grad_norm": 0.3007603199811456,
"learning_rate": 3.725614405296843e-05,
"loss": 0.6008220314979553,
"step": 230
},
{
"epoch": 0.7623762376237624,
"grad_norm": 0.3007492168191884,
"learning_rate": 3.721718609880551e-05,
"loss": 0.5982120037078857,
"step": 231
},
{
"epoch": 0.7656765676567657,
"grad_norm": 0.3010002181490163,
"learning_rate": 3.717797419080618e-05,
"loss": 0.6404559016227722,
"step": 232
},
{
"epoch": 0.768976897689769,
"grad_norm": 0.35604106645956024,
"learning_rate": 3.713850890734689e-05,
"loss": 0.5875239372253418,
"step": 233
},
{
"epoch": 0.7722772277227723,
"grad_norm": 0.33191901009333297,
"learning_rate": 3.709879083054133e-05,
"loss": 0.5962772369384766,
"step": 234
},
{
"epoch": 0.7755775577557755,
"grad_norm": 0.29418628627284477,
"learning_rate": 3.705882054623192e-05,
"loss": 0.5764110684394836,
"step": 235
},
{
"epoch": 0.7788778877887789,
"grad_norm": 0.30409612807603364,
"learning_rate": 3.7018598643981165e-05,
"loss": 0.5635858178138733,
"step": 236
},
{
"epoch": 0.7821782178217822,
"grad_norm": 0.3039645238556037,
"learning_rate": 3.69781257170629e-05,
"loss": 0.5880881547927856,
"step": 237
},
{
"epoch": 0.7854785478547854,
"grad_norm": 0.30606246597511416,
"learning_rate": 3.6937402362453606e-05,
"loss": 0.5644733905792236,
"step": 238
},
{
"epoch": 0.7887788778877888,
"grad_norm": 0.328325214152846,
"learning_rate": 3.689642918082358e-05,
"loss": 0.6431151032447815,
"step": 239
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.2863869456911102,
"learning_rate": 3.6855206776528055e-05,
"loss": 0.5848085880279541,
"step": 240
},
{
"epoch": 0.7953795379537953,
"grad_norm": 0.3169795193025283,
"learning_rate": 3.681373575759831e-05,
"loss": 0.590021550655365,
"step": 241
},
{
"epoch": 0.7986798679867987,
"grad_norm": 0.3630216059086489,
"learning_rate": 3.67720167357327e-05,
"loss": 0.6217919588088989,
"step": 242
},
{
"epoch": 0.801980198019802,
"grad_norm": 0.2999270957223198,
"learning_rate": 3.673005032628763e-05,
"loss": 0.6075180172920227,
"step": 243
},
{
"epoch": 0.8052805280528053,
"grad_norm": 0.35145967135780704,
"learning_rate": 3.668783714826846e-05,
"loss": 0.6078404188156128,
"step": 244
},
{
"epoch": 0.8085808580858086,
"grad_norm": 0.32650805345047657,
"learning_rate": 3.664537782432042e-05,
"loss": 0.6297526955604553,
"step": 245
},
{
"epoch": 0.8118811881188119,
"grad_norm": 0.32461322862254094,
"learning_rate": 3.660267298071936e-05,
"loss": 0.5684514045715332,
"step": 246
},
{
"epoch": 0.8151815181518152,
"grad_norm": 0.32171296221654416,
"learning_rate": 3.655972324736259e-05,
"loss": 0.6192148327827454,
"step": 247
},
{
"epoch": 0.8184818481848185,
"grad_norm": 0.3322336621503604,
"learning_rate": 3.6516529257759506e-05,
"loss": 0.5900243520736694,
"step": 248
},
{
"epoch": 0.8217821782178217,
"grad_norm": 0.35183312055445004,
"learning_rate": 3.6473091649022337e-05,
"loss": 0.5941751599311829,
"step": 249
},
{
"epoch": 0.8250825082508251,
"grad_norm": 0.31255833045908565,
"learning_rate": 3.6429411061856645e-05,
"loss": 0.5744310021400452,
"step": 250
},
{
"epoch": 0.8283828382838284,
"grad_norm": 0.3266269251233177,
"learning_rate": 3.6385488140551985e-05,
"loss": 0.5985124707221985,
"step": 251
},
{
"epoch": 0.8316831683168316,
"grad_norm": 0.30426711611593643,
"learning_rate": 3.6341323532972294e-05,
"loss": 0.581912636756897,
"step": 252
},
{
"epoch": 0.834983498349835,
"grad_norm": 0.3297819735063718,
"learning_rate": 3.629691789054643e-05,
"loss": 0.586786150932312,
"step": 253
},
{
"epoch": 0.8382838283828383,
"grad_norm": 0.3074133078124695,
"learning_rate": 3.625227186825848e-05,
"loss": 0.6312603950500488,
"step": 254
},
{
"epoch": 0.8415841584158416,
"grad_norm": 0.33007753969064285,
"learning_rate": 3.620738612463818e-05,
"loss": 0.5886626243591309,
"step": 255
},
{
"epoch": 0.8448844884488449,
"grad_norm": 0.31334340596765187,
"learning_rate": 3.6162261321751114e-05,
"loss": 0.5892266035079956,
"step": 256
},
{
"epoch": 0.8481848184818482,
"grad_norm": 0.31784442826893616,
"learning_rate": 3.6116898125189045e-05,
"loss": 0.5472115278244019,
"step": 257
},
{
"epoch": 0.8514851485148515,
"grad_norm": 0.3456330158902343,
"learning_rate": 3.6071297204059995e-05,
"loss": 0.5981796383857727,
"step": 258
},
{
"epoch": 0.8547854785478548,
"grad_norm": 0.3377124553034101,
"learning_rate": 3.6025459230978475e-05,
"loss": 0.6708342432975769,
"step": 259
},
{
"epoch": 0.858085808580858,
"grad_norm": 0.3081391395426973,
"learning_rate": 3.597938488205549e-05,
"loss": 0.6306079626083374,
"step": 260
},
{
"epoch": 0.8613861386138614,
"grad_norm": 0.3398583824115319,
"learning_rate": 3.59330748368886e-05,
"loss": 0.6098329424858093,
"step": 261
},
{
"epoch": 0.8646864686468647,
"grad_norm": 0.32878067719138626,
"learning_rate": 3.588652977855189e-05,
"loss": 0.5617724061012268,
"step": 262
},
{
"epoch": 0.8679867986798679,
"grad_norm": 0.34962664282188816,
"learning_rate": 3.58397503935859e-05,
"loss": 0.5780894756317139,
"step": 263
},
{
"epoch": 0.8712871287128713,
"grad_norm": 0.32665214019362204,
"learning_rate": 3.5792737371987477e-05,
"loss": 0.578921377658844,
"step": 264
},
{
"epoch": 0.8745874587458746,
"grad_norm": 0.36673188949709323,
"learning_rate": 3.574549140719962e-05,
"loss": 0.614944577217102,
"step": 265
},
{
"epoch": 0.8778877887788779,
"grad_norm": 0.3248666143164946,
"learning_rate": 3.569801319610125e-05,
"loss": 0.6269869208335876,
"step": 266
},
{
"epoch": 0.8811881188118812,
"grad_norm": 0.3338123662452596,
"learning_rate": 3.565030343899693e-05,
"loss": 0.6045581102371216,
"step": 267
},
{
"epoch": 0.8844884488448845,
"grad_norm": 0.31011600887091817,
"learning_rate": 3.5602362839606514e-05,
"loss": 0.5872907638549805,
"step": 268
},
{
"epoch": 0.8877887788778878,
"grad_norm": 0.31857062779594814,
"learning_rate": 3.55541921050548e-05,
"loss": 0.6283375024795532,
"step": 269
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.32445751859048455,
"learning_rate": 3.5505791945861076e-05,
"loss": 0.5747002363204956,
"step": 270
},
{
"epoch": 0.8943894389438944,
"grad_norm": 0.2923309334474062,
"learning_rate": 3.545716307592864e-05,
"loss": 0.6205827593803406,
"step": 271
},
{
"epoch": 0.8976897689768977,
"grad_norm": 0.43972579907455317,
"learning_rate": 3.54083062125343e-05,
"loss": 0.5987251400947571,
"step": 272
},
{
"epoch": 0.900990099009901,
"grad_norm": 0.33194286352506225,
"learning_rate": 3.535922207631776e-05,
"loss": 0.6275356411933899,
"step": 273
},
{
"epoch": 0.9042904290429042,
"grad_norm": 0.3408278730793354,
"learning_rate": 3.5309911391270996e-05,
"loss": 0.6097655892372131,
"step": 274
},
{
"epoch": 0.9075907590759076,
"grad_norm": 0.3441995699777348,
"learning_rate": 3.52603748847276e-05,
"loss": 0.544170618057251,
"step": 275
},
{
"epoch": 0.9108910891089109,
"grad_norm": 0.3034867763949278,
"learning_rate": 3.521061328735202e-05,
"loss": 0.5723366141319275,
"step": 276
},
{
"epoch": 0.9141914191419142,
"grad_norm": 0.3091145609625042,
"learning_rate": 3.516062733312879e-05,
"loss": 0.5801889896392822,
"step": 277
},
{
"epoch": 0.9174917491749175,
"grad_norm": 0.3532845546992122,
"learning_rate": 3.511041775935175e-05,
"loss": 0.5942766666412354,
"step": 278
},
{
"epoch": 0.9207920792079208,
"grad_norm": 0.3192035342587887,
"learning_rate": 3.50599853066131e-05,
"loss": 0.5604017972946167,
"step": 279
},
{
"epoch": 0.9240924092409241,
"grad_norm": 0.4475571406552253,
"learning_rate": 3.500933071879251e-05,
"loss": 0.6151460409164429,
"step": 280
},
{
"epoch": 0.9273927392739274,
"grad_norm": 0.30946498453996385,
"learning_rate": 3.495845474304616e-05,
"loss": 0.5854936838150024,
"step": 281
},
{
"epoch": 0.9306930693069307,
"grad_norm": 0.3188531409769719,
"learning_rate": 3.490735812979572e-05,
"loss": 0.5586672425270081,
"step": 282
},
{
"epoch": 0.933993399339934,
"grad_norm": 0.3250546549981712,
"learning_rate": 3.485604163271721e-05,
"loss": 0.578475832939148,
"step": 283
},
{
"epoch": 0.9372937293729373,
"grad_norm": 0.45030229248281484,
"learning_rate": 3.4804506008730015e-05,
"loss": 0.5236382484436035,
"step": 284
},
{
"epoch": 0.9405940594059405,
"grad_norm": 0.31677157675280776,
"learning_rate": 3.475275201798559e-05,
"loss": 0.5964822769165039,
"step": 285
},
{
"epoch": 0.9438943894389439,
"grad_norm": 0.3221519247617692,
"learning_rate": 3.4700780423856334e-05,
"loss": 0.5551598072052002,
"step": 286
},
{
"epoch": 0.9471947194719472,
"grad_norm": 0.31322506983838,
"learning_rate": 3.464859199292429e-05,
"loss": 0.6095103621482849,
"step": 287
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.33333701342858213,
"learning_rate": 3.4596187494969846e-05,
"loss": 0.5893416404724121,
"step": 288
},
{
"epoch": 0.9537953795379538,
"grad_norm": 0.31167002926986764,
"learning_rate": 3.454356770296039e-05,
"loss": 0.5992231965065002,
"step": 289
},
{
"epoch": 0.9570957095709571,
"grad_norm": 0.3407826991036566,
"learning_rate": 3.4490733393038895e-05,
"loss": 0.6071972250938416,
"step": 290
},
{
"epoch": 0.9603960396039604,
"grad_norm": 0.321397588262469,
"learning_rate": 3.443768534451248e-05,
"loss": 0.5836942195892334,
"step": 291
},
{
"epoch": 0.9636963696369637,
"grad_norm": 0.3596023570145339,
"learning_rate": 3.4384424339840916e-05,
"loss": 0.5707553625106812,
"step": 292
},
{
"epoch": 0.966996699669967,
"grad_norm": 0.326365753033755,
"learning_rate": 3.4330951164625075e-05,
"loss": 0.5883970260620117,
"step": 293
},
{
"epoch": 0.9702970297029703,
"grad_norm": 0.3276030981345682,
"learning_rate": 3.427726660759535e-05,
"loss": 0.6281589269638062,
"step": 294
},
{
"epoch": 0.9735973597359736,
"grad_norm": 0.3559560269123216,
"learning_rate": 3.422337146060003e-05,
"loss": 0.6641702651977539,
"step": 295
},
{
"epoch": 0.976897689768977,
"grad_norm": 0.34661891319338206,
"learning_rate": 3.4169266518593596e-05,
"loss": 0.6398966312408447,
"step": 296
},
{
"epoch": 0.9801980198019802,
"grad_norm": 0.3392015122860613,
"learning_rate": 3.411495257962501e-05,
"loss": 0.6376276016235352,
"step": 297
},
{
"epoch": 0.9834983498349835,
"grad_norm": 0.3454832175281825,
"learning_rate": 3.406043044482596e-05,
"loss": 0.648975133895874,
"step": 298
},
{
"epoch": 0.9867986798679867,
"grad_norm": 0.3284679145456545,
"learning_rate": 3.4005700918399016e-05,
"loss": 0.6201390624046326,
"step": 299
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.33000362479964457,
"learning_rate": 3.395076480760576e-05,
"loss": 0.6103875637054443,
"step": 300
},
{
"epoch": 0.9933993399339934,
"grad_norm": 0.31707924192462417,
"learning_rate": 3.3895622922754936e-05,
"loss": 0.5486876368522644,
"step": 301
},
{
"epoch": 0.9966996699669967,
"grad_norm": 0.3094164003933957,
"learning_rate": 3.384027607719043e-05,
"loss": 0.5980846285820007,
"step": 302
},
{
"epoch": 1.0,
"grad_norm": 0.33089398879681,
"learning_rate": 3.378472508727931e-05,
"loss": 0.5986801385879517,
"step": 303
},
{
"epoch": 1.0033003300330032,
"grad_norm": 0.4690060258405477,
"learning_rate": 3.372897077239979e-05,
"loss": 0.5586727857589722,
"step": 304
},
{
"epoch": 1.0066006600660067,
"grad_norm": 0.34686786747213394,
"learning_rate": 3.36730139549291e-05,
"loss": 0.5393255949020386,
"step": 305
},
{
"epoch": 1.00990099009901,
"grad_norm": 0.4023568892604613,
"learning_rate": 3.361685546023143e-05,
"loss": 0.5377227067947388,
"step": 306
},
{
"epoch": 1.0132013201320131,
"grad_norm": 0.39915820884177944,
"learning_rate": 3.356049611664568e-05,
"loss": 0.5223784446716309,
"step": 307
},
{
"epoch": 1.0165016501650166,
"grad_norm": 0.3654265250846575,
"learning_rate": 3.350393675547328e-05,
"loss": 0.5502469539642334,
"step": 308
},
{
"epoch": 1.0198019801980198,
"grad_norm": 0.42079557297663883,
"learning_rate": 3.3447178210965936e-05,
"loss": 0.5626603960990906,
"step": 309
},
{
"epoch": 1.023102310231023,
"grad_norm": 0.3684084639129366,
"learning_rate": 3.3390221320313303e-05,
"loss": 0.48262274265289307,
"step": 310
},
{
"epoch": 1.0264026402640265,
"grad_norm": 0.39908786063309193,
"learning_rate": 3.333306692363065e-05,
"loss": 0.5850967168807983,
"step": 311
},
{
"epoch": 1.0297029702970297,
"grad_norm": 0.44262876970078274,
"learning_rate": 3.3275715863946466e-05,
"loss": 0.5444281697273254,
"step": 312
},
{
"epoch": 1.033003300330033,
"grad_norm": 0.35239079669120155,
"learning_rate": 3.3218168987190004e-05,
"loss": 0.5329654216766357,
"step": 313
},
{
"epoch": 1.0363036303630364,
"grad_norm": 0.38499730860339404,
"learning_rate": 3.316042714217885e-05,
"loss": 0.5276832580566406,
"step": 314
},
{
"epoch": 1.0396039603960396,
"grad_norm": 0.3928937531164494,
"learning_rate": 3.310249118060636e-05,
"loss": 0.5344791412353516,
"step": 315
},
{
"epoch": 1.0429042904290429,
"grad_norm": 0.3466589226743573,
"learning_rate": 3.304436195702911e-05,
"loss": 0.5479785203933716,
"step": 316
},
{
"epoch": 1.046204620462046,
"grad_norm": 0.370325309360066,
"learning_rate": 3.298604032885431e-05,
"loss": 0.5223082900047302,
"step": 317
},
{
"epoch": 1.0495049504950495,
"grad_norm": 0.4271803134046634,
"learning_rate": 3.292752715632713e-05,
"loss": 0.5667799711227417,
"step": 318
},
{
"epoch": 1.0528052805280528,
"grad_norm": 0.33752277032768196,
"learning_rate": 3.2868823302518016e-05,
"loss": 0.5194317698478699,
"step": 319
},
{
"epoch": 1.056105610561056,
"grad_norm": 0.35801795115870316,
"learning_rate": 3.2809929633309985e-05,
"loss": 0.4911007285118103,
"step": 320
},
{
"epoch": 1.0594059405940595,
"grad_norm": 0.33819516112787196,
"learning_rate": 3.2750847017385826e-05,
"loss": 0.5269002914428711,
"step": 321
},
{
"epoch": 1.0627062706270627,
"grad_norm": 0.3280280196094967,
"learning_rate": 3.269157632621529e-05,
"loss": 0.5124789476394653,
"step": 322
},
{
"epoch": 1.066006600660066,
"grad_norm": 0.3841029677303286,
"learning_rate": 3.263211843404225e-05,
"loss": 0.5483890771865845,
"step": 323
},
{
"epoch": 1.0693069306930694,
"grad_norm": 0.348752311292252,
"learning_rate": 3.25724742178718e-05,
"loss": 0.5582579374313354,
"step": 324
},
{
"epoch": 1.0726072607260726,
"grad_norm": 0.3672218653955236,
"learning_rate": 3.2512644557457304e-05,
"loss": 0.5662975907325745,
"step": 325
},
{
"epoch": 1.0759075907590758,
"grad_norm": 0.339133227284404,
"learning_rate": 3.2452630335287445e-05,
"loss": 0.5502511858940125,
"step": 326
},
{
"epoch": 1.0792079207920793,
"grad_norm": 0.3607463939055526,
"learning_rate": 3.239243243657318e-05,
"loss": 0.5614978075027466,
"step": 327
},
{
"epoch": 1.0825082508250825,
"grad_norm": 0.3354690532522152,
"learning_rate": 3.233205174923472e-05,
"loss": 0.4828110635280609,
"step": 328
},
{
"epoch": 1.0858085808580857,
"grad_norm": 0.3296040603044689,
"learning_rate": 3.22714891638884e-05,
"loss": 0.5437847971916199,
"step": 329
},
{
"epoch": 1.0891089108910892,
"grad_norm": 0.3295415767468974,
"learning_rate": 3.221074557383355e-05,
"loss": 0.6240063309669495,
"step": 330
},
{
"epoch": 1.0924092409240924,
"grad_norm": 0.3032628226796708,
"learning_rate": 3.2149821875039325e-05,
"loss": 0.5435442328453064,
"step": 331
},
{
"epoch": 1.0957095709570956,
"grad_norm": 0.30875440813945676,
"learning_rate": 3.20887189661315e-05,
"loss": 0.5240401029586792,
"step": 332
},
{
"epoch": 1.099009900990099,
"grad_norm": 0.3043121620505056,
"learning_rate": 3.202743774837919e-05,
"loss": 0.5227692127227783,
"step": 333
},
{
"epoch": 1.1023102310231023,
"grad_norm": 0.3439754692795775,
"learning_rate": 3.196597912568157e-05,
"loss": 0.5607417821884155,
"step": 334
},
{
"epoch": 1.1056105610561056,
"grad_norm": 0.29691798670137787,
"learning_rate": 3.1904344004554536e-05,
"loss": 0.5607600808143616,
"step": 335
},
{
"epoch": 1.108910891089109,
"grad_norm": 0.32493088910689055,
"learning_rate": 3.184253329411737e-05,
"loss": 0.47135430574417114,
"step": 336
},
{
"epoch": 1.1122112211221122,
"grad_norm": 0.3202945703052858,
"learning_rate": 3.178054790607924e-05,
"loss": 0.5708764791488647,
"step": 337
},
{
"epoch": 1.1155115511551155,
"grad_norm": 0.3164605548495645,
"learning_rate": 3.1718388754725883e-05,
"loss": 0.5522497296333313,
"step": 338
},
{
"epoch": 1.118811881188119,
"grad_norm": 0.3449586600316318,
"learning_rate": 3.1656056756906e-05,
"loss": 0.5556532144546509,
"step": 339
},
{
"epoch": 1.1221122112211221,
"grad_norm": 0.3130025484639745,
"learning_rate": 3.1593552832017795e-05,
"loss": 0.5727676153182983,
"step": 340
},
{
"epoch": 1.1254125412541254,
"grad_norm": 0.3195703179740936,
"learning_rate": 3.153087790199541e-05,
"loss": 0.5131651759147644,
"step": 341
},
{
"epoch": 1.1287128712871288,
"grad_norm": 0.3191177264656739,
"learning_rate": 3.146803289129528e-05,
"loss": 0.5143063068389893,
"step": 342
},
{
"epoch": 1.132013201320132,
"grad_norm": 0.33398757419035885,
"learning_rate": 3.1405018726882595e-05,
"loss": 0.509161114692688,
"step": 343
},
{
"epoch": 1.1353135313531353,
"grad_norm": 0.33058725446313514,
"learning_rate": 3.13418363382175e-05,
"loss": 0.5213526487350464,
"step": 344
},
{
"epoch": 1.1386138613861387,
"grad_norm": 0.3226863318187914,
"learning_rate": 3.127848665724149e-05,
"loss": 0.5465434789657593,
"step": 345
},
{
"epoch": 1.141914191419142,
"grad_norm": 0.6179658385179007,
"learning_rate": 3.1214970618363626e-05,
"loss": 0.5342190265655518,
"step": 346
},
{
"epoch": 1.1452145214521452,
"grad_norm": 0.47777163001134637,
"learning_rate": 3.115128915844672e-05,
"loss": 0.541754424571991,
"step": 347
},
{
"epoch": 1.1485148514851484,
"grad_norm": 0.33931974771490697,
"learning_rate": 3.10874432167936e-05,
"loss": 0.5318331122398376,
"step": 348
},
{
"epoch": 1.1518151815181519,
"grad_norm": 0.32111740987941506,
"learning_rate": 3.1023433735133134e-05,
"loss": 0.4972509741783142,
"step": 349
},
{
"epoch": 1.155115511551155,
"grad_norm": 0.30074948382432587,
"learning_rate": 3.095926165760647e-05,
"loss": 0.5417294502258301,
"step": 350
},
{
"epoch": 1.1584158415841583,
"grad_norm": 0.3410522798436207,
"learning_rate": 3.089492793075302e-05,
"loss": 0.554945707321167,
"step": 351
},
{
"epoch": 1.1617161716171618,
"grad_norm": 0.3254774061643724,
"learning_rate": 3.083043350349653e-05,
"loss": 0.5204564929008484,
"step": 352
},
{
"epoch": 1.165016501650165,
"grad_norm": 0.3088402728006412,
"learning_rate": 3.076577932713108e-05,
"loss": 0.4856947064399719,
"step": 353
},
{
"epoch": 1.1683168316831682,
"grad_norm": 0.2896918095760776,
"learning_rate": 3.0700966355307055e-05,
"loss": 0.5269368886947632,
"step": 354
},
{
"epoch": 1.1716171617161717,
"grad_norm": 0.32747543865706225,
"learning_rate": 3.063599554401708e-05,
"loss": 0.5811939239501953,
"step": 355
},
{
"epoch": 1.174917491749175,
"grad_norm": 0.29324577597304957,
"learning_rate": 3.057086785158189e-05,
"loss": 0.5636904239654541,
"step": 356
},
{
"epoch": 1.1782178217821782,
"grad_norm": 0.31779620334412045,
"learning_rate": 3.050558423863626e-05,
"loss": 0.546089768409729,
"step": 357
},
{
"epoch": 1.1815181518151816,
"grad_norm": 0.3093045991582328,
"learning_rate": 3.0440145668114774e-05,
"loss": 0.5239901542663574,
"step": 358
},
{
"epoch": 1.1848184818481848,
"grad_norm": 0.31848934088179354,
"learning_rate": 3.0374553105237637e-05,
"loss": 0.5833466053009033,
"step": 359
},
{
"epoch": 1.188118811881188,
"grad_norm": 0.33803859097620154,
"learning_rate": 3.0308807517496456e-05,
"loss": 0.5060774087905884,
"step": 360
},
{
"epoch": 1.1914191419141915,
"grad_norm": 0.31145081064149094,
"learning_rate": 3.0242909874639953e-05,
"loss": 0.5164307355880737,
"step": 361
},
{
"epoch": 1.1947194719471947,
"grad_norm": 0.29765085452905116,
"learning_rate": 3.0176861148659672e-05,
"loss": 0.49949395656585693,
"step": 362
},
{
"epoch": 1.198019801980198,
"grad_norm": 0.3296486034239661,
"learning_rate": 3.0110662313775623e-05,
"loss": 0.5581181049346924,
"step": 363
},
{
"epoch": 1.2013201320132012,
"grad_norm": 0.3116631729941006,
"learning_rate": 3.0044314346421938e-05,
"loss": 0.5657376646995544,
"step": 364
},
{
"epoch": 1.2046204620462047,
"grad_norm": 0.33012695180790946,
"learning_rate": 2.9977818225232443e-05,
"loss": 0.5269935131072998,
"step": 365
},
{
"epoch": 1.2079207920792079,
"grad_norm": 0.31869984664933465,
"learning_rate": 2.991117493102626e-05,
"loss": 0.5385931730270386,
"step": 366
},
{
"epoch": 1.2112211221122111,
"grad_norm": 0.30491226427581125,
"learning_rate": 2.984438544679329e-05,
"loss": 0.5615143179893494,
"step": 367
},
{
"epoch": 1.2145214521452146,
"grad_norm": 0.32195999076013593,
"learning_rate": 2.9777450757679754e-05,
"loss": 0.5175333023071289,
"step": 368
},
{
"epoch": 1.2178217821782178,
"grad_norm": 0.30930257180361886,
"learning_rate": 2.971037185097364e-05,
"loss": 0.565494179725647,
"step": 369
},
{
"epoch": 1.221122112211221,
"grad_norm": 0.34237830645177886,
"learning_rate": 2.9643149716090146e-05,
"loss": 0.5519120693206787,
"step": 370
},
{
"epoch": 1.2244224422442245,
"grad_norm": 0.30959351563618437,
"learning_rate": 2.9575785344557114e-05,
"loss": 0.49374374747276306,
"step": 371
},
{
"epoch": 1.2277227722772277,
"grad_norm": 0.31310768619122714,
"learning_rate": 2.950827973000034e-05,
"loss": 0.5608875751495361,
"step": 372
},
{
"epoch": 1.231023102310231,
"grad_norm": 0.31986895424613543,
"learning_rate": 2.944063386812899e-05,
"loss": 0.5866271257400513,
"step": 373
},
{
"epoch": 1.2343234323432344,
"grad_norm": 0.3359900469491975,
"learning_rate": 2.9372848756720867e-05,
"loss": 0.5342913269996643,
"step": 374
},
{
"epoch": 1.2376237623762376,
"grad_norm": 0.2956484140793021,
"learning_rate": 2.9304925395607696e-05,
"loss": 0.5539537668228149,
"step": 375
},
{
"epoch": 1.2409240924092408,
"grad_norm": 0.3239136306261367,
"learning_rate": 2.9236864786660423e-05,
"loss": 0.5614147186279297,
"step": 376
},
{
"epoch": 1.2442244224422443,
"grad_norm": 0.3311932744032855,
"learning_rate": 2.9168667933774356e-05,
"loss": 0.46689367294311523,
"step": 377
},
{
"epoch": 1.2475247524752475,
"grad_norm": 0.3291299090174619,
"learning_rate": 2.910033584285444e-05,
"loss": 0.5383083820343018,
"step": 378
},
{
"epoch": 1.2508250825082508,
"grad_norm": 0.3013900588246958,
"learning_rate": 2.903186952180037e-05,
"loss": 0.5349752902984619,
"step": 379
},
{
"epoch": 1.2541254125412542,
"grad_norm": 0.3219145450840317,
"learning_rate": 2.8963269980491743e-05,
"loss": 0.5792303681373596,
"step": 380
},
{
"epoch": 1.2574257425742574,
"grad_norm": 0.2840550960191948,
"learning_rate": 2.8894538230773147e-05,
"loss": 0.524924099445343,
"step": 381
},
{
"epoch": 1.2607260726072607,
"grad_norm": 0.3172399675943548,
"learning_rate": 2.882567528643925e-05,
"loss": 0.5137406587600708,
"step": 382
},
{
"epoch": 1.2640264026402641,
"grad_norm": 0.2893676822687234,
"learning_rate": 2.8756682163219857e-05,
"loss": 0.5196574926376343,
"step": 383
},
{
"epoch": 1.2673267326732673,
"grad_norm": 0.31363904787626334,
"learning_rate": 2.8687559878764903e-05,
"loss": 0.585644006729126,
"step": 384
},
{
"epoch": 1.2706270627062706,
"grad_norm": 0.3310272877884813,
"learning_rate": 2.8618309452629445e-05,
"loss": 0.5973786115646362,
"step": 385
},
{
"epoch": 1.273927392739274,
"grad_norm": 0.3201222210217655,
"learning_rate": 2.854893190625865e-05,
"loss": 0.5909825563430786,
"step": 386
},
{
"epoch": 1.2772277227722773,
"grad_norm": 0.3507731714316878,
"learning_rate": 2.84794282629727e-05,
"loss": 0.5903690457344055,
"step": 387
},
{
"epoch": 1.2805280528052805,
"grad_norm": 0.31011243056320775,
"learning_rate": 2.840979954795171e-05,
"loss": 0.5316457152366638,
"step": 388
},
{
"epoch": 1.283828382838284,
"grad_norm": 0.32950464198309637,
"learning_rate": 2.8340046788220613e-05,
"loss": 0.5080389976501465,
"step": 389
},
{
"epoch": 1.2871287128712872,
"grad_norm": 0.37769184930606736,
"learning_rate": 2.8270171012633994e-05,
"loss": 0.6137889623641968,
"step": 390
},
{
"epoch": 1.2904290429042904,
"grad_norm": 0.34430823745531935,
"learning_rate": 2.8200173251860928e-05,
"loss": 0.5433805584907532,
"step": 391
},
{
"epoch": 1.2937293729372938,
"grad_norm": 0.356563736773021,
"learning_rate": 2.8130054538369775e-05,
"loss": 0.4965590834617615,
"step": 392
},
{
"epoch": 1.297029702970297,
"grad_norm": 0.29380923244218154,
"learning_rate": 2.805981590641295e-05,
"loss": 0.5361340045928955,
"step": 393
},
{
"epoch": 1.3003300330033003,
"grad_norm": 0.31403525376793245,
"learning_rate": 2.7989458392011678e-05,
"loss": 0.47011327743530273,
"step": 394
},
{
"epoch": 1.3036303630363038,
"grad_norm": 0.30710914438533876,
"learning_rate": 2.7918983032940666e-05,
"loss": 0.5893687605857849,
"step": 395
},
{
"epoch": 1.306930693069307,
"grad_norm": 0.3126943781985397,
"learning_rate": 2.7848390868712886e-05,
"loss": 0.5219327211380005,
"step": 396
},
{
"epoch": 1.3102310231023102,
"grad_norm": 0.35585146532127665,
"learning_rate": 2.7777682940564142e-05,
"loss": 0.5652155876159668,
"step": 397
},
{
"epoch": 1.3135313531353137,
"grad_norm": 0.41906023992763497,
"learning_rate": 2.7706860291437784e-05,
"loss": 0.5361950397491455,
"step": 398
},
{
"epoch": 1.316831683168317,
"grad_norm": 0.29071400108766793,
"learning_rate": 2.763592396596929e-05,
"loss": 0.5355206727981567,
"step": 399
},
{
"epoch": 1.3201320132013201,
"grad_norm": 0.298123677847084,
"learning_rate": 2.756487501047086e-05,
"loss": 0.5082858800888062,
"step": 400
},
{
"epoch": 1.3234323432343233,
"grad_norm": 0.3144050740212562,
"learning_rate": 2.7493714472916013e-05,
"loss": 0.5282934904098511,
"step": 401
},
{
"epoch": 1.3267326732673268,
"grad_norm": 0.29396121691648713,
"learning_rate": 2.7422443402924074e-05,
"loss": 0.5502887964248657,
"step": 402
},
{
"epoch": 1.33003300330033,
"grad_norm": 0.2854429234726643,
"learning_rate": 2.7351062851744747e-05,
"loss": 0.5374204516410828,
"step": 403
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.30308752538818784,
"learning_rate": 2.7279573872242574e-05,
"loss": 0.5602293014526367,
"step": 404
},
{
"epoch": 1.3366336633663367,
"grad_norm": 0.30975657746221447,
"learning_rate": 2.7207977518881418e-05,
"loss": 0.5321286916732788,
"step": 405
},
{
"epoch": 1.33993399339934,
"grad_norm": 0.28965457921713383,
"learning_rate": 2.713627484770892e-05,
"loss": 0.5523560047149658,
"step": 406
},
{
"epoch": 1.3432343234323432,
"grad_norm": 0.30598816879566076,
"learning_rate": 2.706446691634089e-05,
"loss": 0.47019705176353455,
"step": 407
},
{
"epoch": 1.3465346534653464,
"grad_norm": 0.2977261513860205,
"learning_rate": 2.6992554783945748e-05,
"loss": 0.540359616279602,
"step": 408
},
{
"epoch": 1.3498349834983498,
"grad_norm": 0.2845048826043699,
"learning_rate": 2.6920539511228874e-05,
"loss": 0.561464786529541,
"step": 409
},
{
"epoch": 1.353135313531353,
"grad_norm": 0.2939741197740927,
"learning_rate": 2.6848422160416956e-05,
"loss": 0.5429259538650513,
"step": 410
},
{
"epoch": 1.3564356435643563,
"grad_norm": 0.2968609589915083,
"learning_rate": 2.677620379524237e-05,
"loss": 0.5452640652656555,
"step": 411
},
{
"epoch": 1.3597359735973598,
"grad_norm": 0.28949363661635646,
"learning_rate": 2.670388548092741e-05,
"loss": 0.49627864360809326,
"step": 412
},
{
"epoch": 1.363036303630363,
"grad_norm": 0.328169978832012,
"learning_rate": 2.663146828416867e-05,
"loss": 0.5331633687019348,
"step": 413
},
{
"epoch": 1.3663366336633662,
"grad_norm": 0.2926434963884909,
"learning_rate": 2.6558953273121216e-05,
"loss": 0.5447151064872742,
"step": 414
},
{
"epoch": 1.3696369636963697,
"grad_norm": 0.2863360845432002,
"learning_rate": 2.648634151738292e-05,
"loss": 0.5467007160186768,
"step": 415
},
{
"epoch": 1.372937293729373,
"grad_norm": 0.33044933855099695,
"learning_rate": 2.6413634087978602e-05,
"loss": 0.5804279446601868,
"step": 416
},
{
"epoch": 1.3762376237623761,
"grad_norm": 0.29168904019746145,
"learning_rate": 2.63408320573443e-05,
"loss": 0.5323517322540283,
"step": 417
},
{
"epoch": 1.3795379537953796,
"grad_norm": 0.3046417110987717,
"learning_rate": 2.6267936499311402e-05,
"loss": 0.5452409982681274,
"step": 418
},
{
"epoch": 1.3828382838283828,
"grad_norm": 0.2878853361033164,
"learning_rate": 2.619494848909084e-05,
"loss": 0.4622665047645569,
"step": 419
},
{
"epoch": 1.386138613861386,
"grad_norm": 0.3129938954769346,
"learning_rate": 2.6121869103257206e-05,
"loss": 0.531772255897522,
"step": 420
},
{
"epoch": 1.3894389438943895,
"grad_norm": 0.3044320552061303,
"learning_rate": 2.6048699419732897e-05,
"loss": 0.519554853439331,
"step": 421
},
{
"epoch": 1.3927392739273927,
"grad_norm": 0.32616258357306027,
"learning_rate": 2.5975440517772187e-05,
"loss": 0.545585572719574,
"step": 422
},
{
"epoch": 1.396039603960396,
"grad_norm": 0.297995845019565,
"learning_rate": 2.5902093477945345e-05,
"loss": 0.5641547441482544,
"step": 423
},
{
"epoch": 1.3993399339933994,
"grad_norm": 0.28406971495281874,
"learning_rate": 2.5828659382122655e-05,
"loss": 0.5578028559684753,
"step": 424
},
{
"epoch": 1.4026402640264026,
"grad_norm": 0.35618435421860006,
"learning_rate": 2.5755139313458484e-05,
"loss": 0.5931404232978821,
"step": 425
},
{
"epoch": 1.4059405940594059,
"grad_norm": 0.3227282264542969,
"learning_rate": 2.5681534356375314e-05,
"loss": 0.5486891865730286,
"step": 426
},
{
"epoch": 1.4092409240924093,
"grad_norm": 0.31220449886262164,
"learning_rate": 2.5607845596547706e-05,
"loss": 0.5007671117782593,
"step": 427
},
{
"epoch": 1.4125412541254125,
"grad_norm": 0.2970377848116104,
"learning_rate": 2.5534074120886346e-05,
"loss": 0.5044519901275635,
"step": 428
},
{
"epoch": 1.4158415841584158,
"grad_norm": 0.30667327850480125,
"learning_rate": 2.5460221017521952e-05,
"loss": 0.5227789878845215,
"step": 429
},
{
"epoch": 1.4191419141914192,
"grad_norm": 0.2902458759439887,
"learning_rate": 2.538628737578926e-05,
"loss": 0.5530189871788025,
"step": 430
},
{
"epoch": 1.4224422442244224,
"grad_norm": 0.3114416510328153,
"learning_rate": 2.5312274286210966e-05,
"loss": 0.508142352104187,
"step": 431
},
{
"epoch": 1.4257425742574257,
"grad_norm": 0.30284970816559353,
"learning_rate": 2.523818284048159e-05,
"loss": 0.5497263669967651,
"step": 432
},
{
"epoch": 1.4290429042904291,
"grad_norm": 0.3619418905679721,
"learning_rate": 2.5164014131451443e-05,
"loss": 0.5477034449577332,
"step": 433
},
{
"epoch": 1.4323432343234324,
"grad_norm": 0.28668741491270383,
"learning_rate": 2.508976925311045e-05,
"loss": 0.5091728568077087,
"step": 434
},
{
"epoch": 1.4356435643564356,
"grad_norm": 0.2922234358135184,
"learning_rate": 2.501544930057203e-05,
"loss": 0.5022713541984558,
"step": 435
},
{
"epoch": 1.438943894389439,
"grad_norm": 0.29994035273286174,
"learning_rate": 2.494105537005697e-05,
"loss": 0.5401599407196045,
"step": 436
},
{
"epoch": 1.4422442244224423,
"grad_norm": 0.27863085551634303,
"learning_rate": 2.4866588558877208e-05,
"loss": 0.5632063150405884,
"step": 437
},
{
"epoch": 1.4455445544554455,
"grad_norm": 0.2968792338733857,
"learning_rate": 2.479204996541969e-05,
"loss": 0.552355170249939,
"step": 438
},
{
"epoch": 1.448844884488449,
"grad_norm": 0.3222205976590156,
"learning_rate": 2.4717440689130154e-05,
"loss": 0.5604996681213379,
"step": 439
},
{
"epoch": 1.4521452145214522,
"grad_norm": 0.2781451863798608,
"learning_rate": 2.4642761830496893e-05,
"loss": 0.4961245656013489,
"step": 440
},
{
"epoch": 1.4554455445544554,
"grad_norm": 0.3327533816855903,
"learning_rate": 2.4568014491034565e-05,
"loss": 0.5403590202331543,
"step": 441
},
{
"epoch": 1.4587458745874589,
"grad_norm": 0.2944499869326328,
"learning_rate": 2.4493199773267902e-05,
"loss": 0.4753378629684448,
"step": 442
},
{
"epoch": 1.462046204620462,
"grad_norm": 0.30936599048377306,
"learning_rate": 2.4418318780715477e-05,
"loss": 0.5125438570976257,
"step": 443
},
{
"epoch": 1.4653465346534653,
"grad_norm": 0.3047486735791836,
"learning_rate": 2.434337261787342e-05,
"loss": 0.5670269727706909,
"step": 444
},
{
"epoch": 1.4686468646864688,
"grad_norm": 0.3348418102837006,
"learning_rate": 2.426836239019911e-05,
"loss": 0.5538198947906494,
"step": 445
},
{
"epoch": 1.471947194719472,
"grad_norm": 0.2790312641462961,
"learning_rate": 2.4193289204094893e-05,
"loss": 0.5012328028678894,
"step": 446
},
{
"epoch": 1.4752475247524752,
"grad_norm": 0.30485310749783334,
"learning_rate": 2.4118154166891762e-05,
"loss": 0.538119912147522,
"step": 447
},
{
"epoch": 1.4785478547854787,
"grad_norm": 0.32398781026753815,
"learning_rate": 2.4042958386833003e-05,
"loss": 0.5252339839935303,
"step": 448
},
{
"epoch": 1.481848184818482,
"grad_norm": 0.326928536480608,
"learning_rate": 2.3967702973057853e-05,
"loss": 0.5367081761360168,
"step": 449
},
{
"epoch": 1.4851485148514851,
"grad_norm": 0.3044938562463835,
"learning_rate": 2.3892389035585167e-05,
"loss": 0.5091884136199951,
"step": 450
},
{
"epoch": 1.4884488448844886,
"grad_norm": 0.2897824690201277,
"learning_rate": 2.3817017685297016e-05,
"loss": 0.5079891681671143,
"step": 451
},
{
"epoch": 1.4917491749174918,
"grad_norm": 0.2966882318097961,
"learning_rate": 2.3741590033922313e-05,
"loss": 0.511939287185669,
"step": 452
},
{
"epoch": 1.495049504950495,
"grad_norm": 0.28797637565211376,
"learning_rate": 2.3666107194020404e-05,
"loss": 0.5070478916168213,
"step": 453
},
{
"epoch": 1.4983498349834983,
"grad_norm": 0.29050652670321586,
"learning_rate": 2.3590570278964682e-05,
"loss": 0.547492504119873,
"step": 454
},
{
"epoch": 1.5016501650165015,
"grad_norm": 0.311874965448668,
"learning_rate": 2.3514980402926132e-05,
"loss": 0.5386558771133423,
"step": 455
},
{
"epoch": 1.504950495049505,
"grad_norm": 0.26980126113979913,
"learning_rate": 2.3439338680856943e-05,
"loss": 0.48668172955513,
"step": 456
},
{
"epoch": 1.5082508250825084,
"grad_norm": 0.31689121328788056,
"learning_rate": 2.3363646228474002e-05,
"loss": 0.5497942566871643,
"step": 457
},
{
"epoch": 1.5115511551155114,
"grad_norm": 0.3648919358675907,
"learning_rate": 2.328790416224248e-05,
"loss": 0.5267748832702637,
"step": 458
},
{
"epoch": 1.5148514851485149,
"grad_norm": 0.3191029117024018,
"learning_rate": 2.3212113599359368e-05,
"loss": 0.5578982830047607,
"step": 459
},
{
"epoch": 1.5181518151815183,
"grad_norm": 0.30610891906133464,
"learning_rate": 2.3136275657736956e-05,
"loss": 0.5136545896530151,
"step": 460
},
{
"epoch": 1.5214521452145213,
"grad_norm": 0.28466532575384307,
"learning_rate": 2.3060391455986403e-05,
"loss": 0.5718669891357422,
"step": 461
},
{
"epoch": 1.5247524752475248,
"grad_norm": 0.3064265170567389,
"learning_rate": 2.2984462113401184e-05,
"loss": 0.5427108407020569,
"step": 462
},
{
"epoch": 1.528052805280528,
"grad_norm": 0.28495826208338726,
"learning_rate": 2.2908488749940596e-05,
"loss": 0.5293564200401306,
"step": 463
},
{
"epoch": 1.5313531353135312,
"grad_norm": 0.3073240786964915,
"learning_rate": 2.2832472486213275e-05,
"loss": 0.550743579864502,
"step": 464
},
{
"epoch": 1.5346534653465347,
"grad_norm": 0.30789089349395116,
"learning_rate": 2.2756414443460602e-05,
"loss": 0.5957387685775757,
"step": 465
},
{
"epoch": 1.537953795379538,
"grad_norm": 0.2840660845057486,
"learning_rate": 2.2680315743540234e-05,
"loss": 0.4994407892227173,
"step": 466
},
{
"epoch": 1.5412541254125411,
"grad_norm": 0.2912314912557071,
"learning_rate": 2.260417750890949e-05,
"loss": 0.5120857954025269,
"step": 467
},
{
"epoch": 1.5445544554455446,
"grad_norm": 0.3024618438133355,
"learning_rate": 2.2528000862608845e-05,
"loss": 0.5727359056472778,
"step": 468
},
{
"epoch": 1.5478547854785478,
"grad_norm": 0.30379584493476613,
"learning_rate": 2.2451786928245344e-05,
"loss": 0.584964394569397,
"step": 469
},
{
"epoch": 1.551155115511551,
"grad_norm": 0.2782374360382863,
"learning_rate": 2.237553682997603e-05,
"loss": 0.5507112741470337,
"step": 470
},
{
"epoch": 1.5544554455445545,
"grad_norm": 0.26333814455393634,
"learning_rate": 2.2299251692491364e-05,
"loss": 0.49136701226234436,
"step": 471
},
{
"epoch": 1.5577557755775577,
"grad_norm": 0.31673569076077385,
"learning_rate": 2.2222932640998635e-05,
"loss": 0.5374805927276611,
"step": 472
},
{
"epoch": 1.561056105610561,
"grad_norm": 0.29370656251116817,
"learning_rate": 2.2146580801205362e-05,
"loss": 0.523996114730835,
"step": 473
},
{
"epoch": 1.5643564356435644,
"grad_norm": 0.27277397989040114,
"learning_rate": 2.207019729930271e-05,
"loss": 0.48198428750038147,
"step": 474
},
{
"epoch": 1.5676567656765676,
"grad_norm": 0.2861287068823064,
"learning_rate": 2.199378326194883e-05,
"loss": 0.5148699879646301,
"step": 475
},
{
"epoch": 1.5709570957095709,
"grad_norm": 0.2981231032466442,
"learning_rate": 2.1917339816252303e-05,
"loss": 0.5297671556472778,
"step": 476
},
{
"epoch": 1.5742574257425743,
"grad_norm": 0.2775943923870632,
"learning_rate": 2.1840868089755465e-05,
"loss": 0.5082278847694397,
"step": 477
},
{
"epoch": 1.5775577557755776,
"grad_norm": 0.2988631140370514,
"learning_rate": 2.176436921041779e-05,
"loss": 0.4755392372608185,
"step": 478
},
{
"epoch": 1.5808580858085808,
"grad_norm": 0.28707182004966697,
"learning_rate": 2.1687844306599275e-05,
"loss": 0.5249454975128174,
"step": 479
},
{
"epoch": 1.5841584158415842,
"grad_norm": 0.3023499942723386,
"learning_rate": 2.161129450704376e-05,
"loss": 0.5626166462898254,
"step": 480
},
{
"epoch": 1.5874587458745875,
"grad_norm": 0.28182475866947054,
"learning_rate": 2.1534720940862318e-05,
"loss": 0.5590533018112183,
"step": 481
},
{
"epoch": 1.5907590759075907,
"grad_norm": 0.2724331542693392,
"learning_rate": 2.1458124737516557e-05,
"loss": 0.5146170854568481,
"step": 482
},
{
"epoch": 1.5940594059405941,
"grad_norm": 0.28834268248771533,
"learning_rate": 2.1381507026802007e-05,
"loss": 0.5633066296577454,
"step": 483
},
{
"epoch": 1.5973597359735974,
"grad_norm": 0.29376551657635425,
"learning_rate": 2.130486893883141e-05,
"loss": 0.5273865461349487,
"step": 484
},
{
"epoch": 1.6006600660066006,
"grad_norm": 0.277893471974935,
"learning_rate": 2.1228211604018088e-05,
"loss": 0.5040723085403442,
"step": 485
},
{
"epoch": 1.603960396039604,
"grad_norm": 0.2901419412347278,
"learning_rate": 2.1151536153059254e-05,
"loss": 0.5254411697387695,
"step": 486
},
{
"epoch": 1.6072607260726073,
"grad_norm": 0.29340041503520936,
"learning_rate": 2.1074843716919323e-05,
"loss": 0.5789728760719299,
"step": 487
},
{
"epoch": 1.6105610561056105,
"grad_norm": 0.2858502686555999,
"learning_rate": 2.0998135426813245e-05,
"loss": 0.5521235466003418,
"step": 488
},
{
"epoch": 1.613861386138614,
"grad_norm": 0.2770947277408911,
"learning_rate": 2.092141241418984e-05,
"loss": 0.4702959954738617,
"step": 489
},
{
"epoch": 1.6171617161716172,
"grad_norm": 0.29713285242144816,
"learning_rate": 2.0844675810715046e-05,
"loss": 0.4960707128047943,
"step": 490
},
{
"epoch": 1.6204620462046204,
"grad_norm": 0.2800759957297699,
"learning_rate": 2.076792674825529e-05,
"loss": 0.5334826111793518,
"step": 491
},
{
"epoch": 1.6237623762376239,
"grad_norm": 0.4465546145157964,
"learning_rate": 2.0691166358860775e-05,
"loss": 0.5604894161224365,
"step": 492
},
{
"epoch": 1.627062706270627,
"grad_norm": 0.2895889767199155,
"learning_rate": 2.061439577474875e-05,
"loss": 0.5565654635429382,
"step": 493
},
{
"epoch": 1.6303630363036303,
"grad_norm": 0.2663082120203026,
"learning_rate": 2.0537616128286875e-05,
"loss": 0.541640043258667,
"step": 494
},
{
"epoch": 1.6336633663366338,
"grad_norm": 0.27975047407467746,
"learning_rate": 2.0460828551976436e-05,
"loss": 0.5247132182121277,
"step": 495
},
{
"epoch": 1.636963696369637,
"grad_norm": 0.30554958978585,
"learning_rate": 2.0384034178435727e-05,
"loss": 0.533937394618988,
"step": 496
},
{
"epoch": 1.6402640264026402,
"grad_norm": 0.29094539458240765,
"learning_rate": 2.0307234140383264e-05,
"loss": 0.5857927799224854,
"step": 497
},
{
"epoch": 1.6435643564356437,
"grad_norm": 0.2718482098386275,
"learning_rate": 2.0230429570621134e-05,
"loss": 0.5191807746887207,
"step": 498
},
{
"epoch": 1.6468646864686467,
"grad_norm": 0.28523897670587156,
"learning_rate": 2.0153621602018276e-05,
"loss": 0.5255881547927856,
"step": 499
},
{
"epoch": 1.6501650165016502,
"grad_norm": 0.27057309315143646,
"learning_rate": 2.0076811367493736e-05,
"loss": 0.5134017467498779,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 909,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 699790582349824.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}