<!-- Export metadata (from the original platform viewer):
  Model: Mathieu-Thomas-JOSSET/joke-finetome-model-gguf-phi4-20260112-081758
  Source: Original Platform
  4430 lines, 243 KiB, HTML
-->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Training Report</title>
<style>
/* Dark single-page dashboard theme for the training report. */
body { margin: 0; font-family: system-ui, Segoe UI, Arial; background: #0b0f17; color: #e6e6e6; }
header { padding: 14px 18px; border-bottom: 1px solid rgba(255,255,255,0.10); }
.wrap { padding: 14px 18px; display: grid; gap: 14px; }
details { background: rgba(255,255,255,0.04); padding: 10px 12px; border-radius: 10px; }
summary { cursor: pointer; font-weight: 600; }
pre { margin: 8px 0 0; white-space: pre-wrap; word-wrap: break-word; }
.muted { opacity: .8; font-size: 12px; }
</style>
</head>
<body>
<header>
<div style="font-size:18px;font-weight:700;">Training Report</div>
<div class="muted">Single-file HTML (dashboard + run payload)</div>
</header>

<div class="wrap">
<!-- Free-text notes section; the <pre> is intentionally empty until filled in. -->
<details open><summary>Notes</summary><pre></pre></details>

<!-- Container for the Plotly dashboard; the generated Plotly payload follows. -->
<div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
<script charset="utf-8" src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script> <div id="fbea6609-4680-44c8-9218-05ec099740b1" class="plotly-graph-div" style="height:900px; width:100%;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("fbea6609-4680-44c8-9218-05ec099740b1")) { Plotly.newPlot( "fbea6609-4680-44c8-9218-05ec099740b1", [{"mode":"lines","name":"train_loss (raw)","opacity":0.35,"x":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413
,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600],"y":[2.2041,2.6901,2.6774,2.4478,1.9988,2.0358,2.5824,2.3479,2.6387,2.6083,2.3015,2.7042,2.5386,2.5886,2.7168,2.1886,2.4191,2.2736,2.7336,2.5288,2.0214,2.3708,2.5341,2.4852,2.4959,2.243,2.1622,2.1827,2.3346,1.9959,2.5963,2.3312,2.2653,2.6019,2.2858,2.1479,2.3149,2.4299,2.1583,2.1631,2.0198,2.3086,2.1388,1.8651,1.8407,1.9392,2.4167,2.2915,1.7683,2.0858,2.1761,2.145,1.829,2.345,1.9989,2.3336,2.0979,2.4041,2.1076,2.2672,2.1081,2.034,2.1147,1.9639,1.7557,1.8861,2.272,1.7173,1.8545,2.0492,1.9203,2.1317,2.282,2.2023,2.3545,2.1984,1.9843,2.2899,1.8448,1.857,2.1387,1.9587,2.5776,2.1138,2.0632,2.1885,2.2515,2.3173,1.9536,1.9134,2.2808,2.2055,1.9373,1.7517,1.9707,1.9629,1.8158,2.5133,2.263,2.1035,1.8114,2.3733,1.929,1.9718,2.1026,1.7999,2.0349,2.2571,2.0794,2.1671,2.1292,2.2534,2.0333,2.1495,1.5522,2.2221,2.4227,2.3886,2.0651,1.9465,2.588,1.7602,2.2738,1.8145,1.7572,2.0206,2.1292,2.1207,2.1515,1.7264,1.6036,2.132,2.061,1.8765,1.8337,2.0664,1.9651,1.9243,2.0289,1.7206,1.7212,1.559,1.9442,1.8913,2.1248,2.1745,2.0572,1.8028,2.2202,1.8407,2.3419,2.0321,1.7054,1.9304,2.1154,2.0367,1.5693,1.726,1.7872,1.9831,2.0571,1.8674,1.7982,1.7992,2.2013,1.7908,2.1308,1.735,2.1536,1.8616,1.9501,1.7965,1.8903,1.9153,2.0043,2.0106,2.0356,1.4849,1.6636,1.4097,1
.7611,2.0967,2.3108,1.6263,1.6228,1.9384,1.5646,1.8545,1.8436,2.2574,1.8046,1.6635,1.9118,1.5529,1.7932,1.7632,1.8788,2.0881,1.9197,1.4541,1.8766,1.8877,1.6045,1.705,1.6478,1.6703,2.0232,1.581,1.6617,2.173,1.5917,1.531,2.193,1.9884,1.8659,2.191,2.1538,1.9939,2.1667,1.9743,1.5679,2.1841,2.1165,1.7816,1.702,1.7222,1.7151,2.0545,1.4732,1.7168,1.497,1.8177,1.9449,1.4673,1.865,1.7449,2.0348,1.8765,1.7559,1.9031,1.6102,2.0716,2.0298,1.9911,1.3477,1.4686,1.617,1.6593,1.2608,1.8523,1.7768,1.7455,1.7103,2.0054,1.7175,1.7225,1.6882,2.0721,1.6198,1.9632,1.9262,2.0807,2.0099,1.7435,1.4648,2.106,1.4486,1.4996,1.8247,1.6092,1.6071,1.9553,1.8105,1.5718,1.9999,2.0393,1.6101,1.6718,2.2007,1.7025,1.7632,1.4975,1.2189,2.1889,1.782,1.7055,2.0713,1.6159,2.0388,1.7656,1.9818,1.6421,1.653,1.8727,1.3509,1.7836,1.6315,1.9805,1.8716,1.4669,1.3418,1.4561,1.8321,1.8932,2.0071,1.9824,2.2225,1.671,1.7876,1.1102,1.7577,1.3948,1.5507,1.5299,1.6187,1.6467,1.48,1.2413,1.2792,2.0635,1.9011,1.9017,1.7304,1.9703,1.6825,1.7945,1.7701,1.8,1.3064,2.0357,1.6015,1.5858,1.9489,2.1722,1.5027,1.481,1.8746,2.0807,1.7448,1.9092,1.7032,1.9141,1.8258,1.7998,2.0583,1.494,1.7167,1.7753,1.8852,1.8729,2.1419,1.83,2.3556,2.1396,1.4222,1.9362,1.9471,2.0193,2.1048,2.0755,1.4388,1.7819,1.5135,1.7713,1.6624,1.9059,1.155,1.4513,1.8148,1.6325,1.694,1.4905,1.1557,0.999,1.4159,0.9789,1.3929,1.5721,1.4603,1.2115,1.1689,1.1904,1.3799,1.5787,1.4084,1.3685,1.5465,1.1914,1.6443,1.5371,1.5124,1.6248,1.4621,1.4315,1.4308,1.2975,1.4624,1.5891,1.3152,1.3129,1.2283,1.1884,1.3971,1.5501,1.1685,1.3617,1.3164,1.4339,1.3699,1.4938,1.0902,1.0282,1.3465,1.4312,1.1921,1.3644,1.3318,1.3955,1.5949,1.2428,1.2438,1.4979,1.4914,1.1937,1.4428,1.0756,1.1709,1.2953,1.1653,1.542,1.0957,1.1408,1.1708,1.101,1.3898,1.4717,1.6173,1.6172,1.4479,1.3459,1.2195,1.3043,1.42,1.4764,1.4508,1.3499,1.297,1.1607,1.5328,1.6258,1.5923,1.2816,1.2029,1.2117,1.4347,1.3394,1.4262,1.3738,1.4021,1.3887,1.2997,1.5967,1.2643,1.3686,0.9961,1.0603,1.6641,1.2886,1.2922,1.1046,1.
6142,1.5575,1.3655,1.3344,1.2505,1.1988,1.4075,1.3235,1.4297,1.0187,1.1512,0.9209,1.6726,1.6101,1.0596,1.3009,1.1306,1.6064,1.3423,1.0908,1.6632,1.3113,1.0863,1.1982,1.3941,1.3251,0.8109,1.1339,1.1736,1.3546,1.4636,1.5519,1.5206,1.0665,1.2801,1.565,1.3502,1.5859,1.4159,1.5672,1.2551,0.9671,1.4462,1.2006,0.9354,1.5554,0.9281,1.1678,0.8378,1.6602,1.5575,1.3553,1.3618,1.0911,1.2465,0.9937,1.3931,1.029,1.383,1.4833,1.2739,1.4294,1.0204,1.1488,1.49,1.113,1.5004,1.1722,1.5033,1.279,0.6426,0.8426,1.0651,1.0804,1.0182,0.8165,0.9941,0.9579,0.6023,1.1392,0.663,1.1801,0.9107,0.9474,0.7972,0.7298,0.5749,0.7301,0.5884,0.8034,0.6528,0.8483,0.7009,0.8732,0.8319,0.726,0.4993,1.0212,0.8306,0.626,0.75,1.2685,0.7429,1.0086,1.1215,0.6729,1.2036,0.6877,0.5637,0.7498,0.6792,0.6752,0.8586,0.8345,0.6932,1.2071,0.7349,0.7923,0.6857,0.9153,0.7542,0.9487,0.6893,0.7126,0.8014,0.7078,0.6612,0.8968,0.8395,0.9248],"type":"scatter","xaxis":"x","yaxis":"y"},{"mode":"lines","name":"train_loss (EMA span=25)","x":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,27
9,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600],"y":[2.2041,2.2414846153846155,2.2750165680473375,2.2883076012744654,2.266037785791814,2.248327186884752,2.274025095585925,2.279707780540854,2.3073225666530957,2.3304746769105495,2.3282458556097385,2.35716540517822,2.371121912472203,2.3878509961281877,2.4131547656567887,2.3958813221447284,2.3976673742874417,2.388123730111485,2.4146988277952173,2.423475841041739,2.392546930192375,2.3908740894083462,2.4018914671461657,2.40829981582723,2.4150382915328277,2.4018045767995333,2.383373455507262,2.367937035852857,2.3653726484795605,2.3369516755195945,2.356901546633472,2.3549245045847433,2.348030311924379,2.3675587494686576,2.3612696148941454,2.344856567594596,2.3425522162411654,2.349271276530307,2.33458117833566
8,2.3213903184636937,2.298191063197256,2.298991750643621,2.2866693082864193,2.254240899956695,2.222430061498488,2.2006431336909125,2.2172628926377658,2.2229734393579377,2.1879985594073275,2.18013713176061,2.1798265831636403,2.177147615227976,2.1503670294412087,2.1653387964072697,2.152535812068249,2.1664638265245375,2.16118968602265,2.1798750947901384,2.174315472113974,2.1814604357975145,2.1758173253515523,2.16490830032451,2.161046123376471,2.1458810369628964,2.115867111042674,2.0981927178855453,2.1115625088174266,2.0812346235237786,2.063793498637334,2.0626709218190777,2.0517193124483795,2.0578716730292737,2.0751123135654836,2.084895981752754,2.1056347523871577,2.1127705406650685,2.1028881913831405,2.117273715122899,2.096314198574984,2.077905414069216,2.0825819206792766,2.0730525421654864,2.1118638850758336,2.112012816993077,2.108257984916687,2.1144304476154034,2.1249742593372956,2.1397685470805805,2.1254478896128437,2.109136513488779,2.1223413970665654,2.1287382126768297,2.1140121963170735,2.0861420273696063,2.077261871418098,2.068464804385937,2.0490290502024036,2.084742200186834,2.0984543386340007,2.0988424664313854,2.076731507475125,2.099544468438577,2.0864256631740714,2.0776083044683737,2.0795307425861913,2.0580206854641765,2.0562421711977015,2.071692773413263,2.0722856369968583,2.0795790495355617,2.083396045725134,2.0964732729770468,2.091613790440351,2.0960665757910935,2.0542306853456247,2.0671437095498075,2.0944941934305916,2.117117717012854,2.113116354165711,2.1002997115375797,2.1378151183423815,2.1087678015468136,2.121462586043213,2.0978500794245045,2.0716462271610814,2.067719594302537,2.072448856279265,2.0761604827193216,2.081955830202451,2.0546053817253394,2.0199126600541595,2.0285347631269164,2.031032089040231,2.0191450052679056,2.0048800048626823,2.0096123121809377,2.0061882881670194,1.999889189077249,2.002120789917461,1.9804653445391946,1.9605218564977183,1.9296355598440478,1.9307559013945057,1.927720832056467,1.9428807680521234,1.960697632048114,1.96812
08911213361,1.955403899496618,1.9757728303045705,1.9653826125888345,1.9943454885435394,1.997249681732498,1.9747997062146139,1.9713843441981052,1.982462471567482,1.986634589139214,1.9545319284361977,1.9369525493257211,1.9254331224545118,1.9298690361118571,1.939656033334022,1.9340978769237127,1.9236441940834272,1.9140715637693175,1.9361660588639855,1.9249840543359866,1.9408160501562954,1.924984046298119,1.9425698888905716,1.9363414358989892,1.9373997869836825,1.9265613418310916,1.9237720078440848,1.9231203149330014,1.9293649060920015,1.93561375946954,1.943305008741114,1.9080430849917975,1.8892397707616595,1.8523520960876858,1.845332704080941,1.864668649920869,1.8989864460808021,1.8780105656130481,1.8583789836428137,1.8645344464395206,1.841462565944173,1.8424654454869291,1.8425527189110116,1.8744640482255492,1.8690898906697377,1.8532752836951427,1.8577771849493625,1.8343250937994116,1.8311616250456109,1.82593380773441,1.8300004379086863,1.849854250377249,1.85522700034823,1.82437107724452,1.8283886866872492,1.8329510954036148,1.8153779342187213,1.8068873238942045,1.7946498374408042,1.7850844653299731,1.8034010449199753,1.7862932722338236,1.7767091743696835,1.8071930840335542,1.7906166929540501,1.770646178111431,1.8031349336413212,1.817386092591989,1.8211179316233745,1.8495703984215766,1.8729726754660707,1.8822747773532962,1.904153640633812,1.9095495144312113,1.8832687825518877,1.906409645432512,1.9225704419377032,1.9117265617886492,1.8955937493433686,1.8822557686246482,1.8693976325765984,1.8836362762245524,1.85206425497651,1.8416593122860094,1.8151470574947781,1.8153434376874875,1.8253093270961425,1.797770148088747,1.8029416751588434,1.7984769309158557,1.8166556285377131,1.82125904172712,1.8162314231327261,1.8229136213532857,1.8065510350953407,1.8269394170110838,1.8425440772410004,1.8539714559147698,1.8150274977674798,1.788379228708443,1.775196211115486,1.7662811179527564,1.7273979550333138,1.7370058046461359,1.7400668965964332,1.740484827627477,1.738162917809979,1.7587
196164399805,1.7555488767138283,1.7530066554281494,1.7480215280875226,1.7729506413115594,1.761169822749132,1.7767106056145834,1.788209789798077,1.8107090367366865,1.8260314185261721,1.8196828478703129,1.7923841672649043,1.8165084620906808,1.7882078111606285,1.7660072103021187,1.7705220402788788,1.758112652565119,1.7464962946754945,1.762558118161995,1.7662459552264569,1.7512885740551911,1.7704125298970996,1.7910961814434767,1.777173398255517,1.769067752235862,1.8022702328331035,1.7945955995382494,1.7921805534199227,1.7695128185414672,1.7271579863459698,1.7626766027808953,1.7641630179515957,1.7596504781091653,1.7836235182546143,1.7707217091581056,1.7913431161459439,1.7893628764424099,1.804165732100686,1.7916991373237103,1.7810299729141943,1.7880815134592563,1.7544521662700827,1.7566943073262302,1.7470639759934432,1.765020593224717,1.773219009130508,1.7496560084281612,1.7182824693183028,1.6981145870630487,1.7084211572889683,1.7226349144205861,1.7445168440805412,1.762815548382038,1.798175890814189,1.7883931299823284,1.788332119983688,1.7361681107541735,1.7378244099269295,1.7114379168556275,1.699073461712887,1.6860601185042035,1.6808785709269571,1.678249450086422,1.6629994923874667,1.6305610698961233,1.6035332952887293,1.6389153494972888,1.659083399535959,1.6777462149562699,1.68179650611348,1.7039890825662896,1.7023360762150366,1.70942560881388,1.7140928696743507,1.7207011104686314,1.6888317942787368,1.7155139639496033,1.706743659030403,1.697440300643449,1.716783354440107,1.7518154040985603,1.7326526807063636,1.7132947821904896,1.7257028758681445,1.7530103469552103,1.7523787818048095,1.7644419524352088,1.7597310330171159,1.771605568938876,1.7757743713281935,1.7776224966106402,1.7992130737944372,1.775735145041019,1.7711939800378638,1.771509827727259,1.7802552255943929,1.7873817467025166,1.8146523815715538,1.8158329676045113,1.8573535085580106,1.8790647771304714,1.8439213327358197,1.8510196917561412,1.8584104846979765,1.8707866012596708,1.8887876319320038,1.903150121783388
2,1.8674308816462046,1.860851583058035,1.8341322305151093,1.8292989820139471,1.816460598782105,1.823340552721943,1.7719297409741013,1.7472659147453244,1.7524608443802996,1.7432330871202766,1.739445926572563,1.7202962399131352,1.6768657599198171,1.624722239925985,1.6086589907009092,1.560215991416224,1.5473455305380528,1.5492497204966642,1.542407434304613,1.5169530162811815,1.4901797073364753,1.4671197298490544,1.4604105198606656,1.4695097106406145,1.4648089636682595,1.457400581847624,1.464254383243961,1.4432655845328872,1.4587297703380497,1.464758249542815,1.4684229995779832,1.4804519996104462,1.4790403073327196,1.4753833606148181,1.4719538713367553,1.4585343427723896,1.4588317010206673,1.468852339403693,1.4570329286803319,1.4459457803203064,1.4292037972187446,1.4106804282019183,1.409635779878694,1.4204407198880253,1.4010606645120234,1.3980329210880216,1.3917534656197124,1.3949955067258886,1.3930650831315894,1.400813922890698,1.3769205442067982,1.3500958869601216,1.3498192802708815,1.3560793356346599,1.3434655405858398,1.3450758836176984,1.3440546618009523,1.3480119955085714,1.3670033804694506,1.3574492742794928,1.3487070224118396,1.3601834053032367,1.37027698951068,1.3566941441637048,1.3633176715357276,1.3411855429560564,1.32808665503636,1.3255646046489478,1.3132365581374903,1.330833745973068,1.3127465347443703,1.2995198782255726,1.2896183491312978,1.2751092453519672,1.2839316110941237,1.2983753333176526,1.3229079999855256,1.3455458461404852,1.3534192425912173,1.3528408393149698,1.3425838516753568,1.3396389400080218,1.3458205600074047,1.3558651323145277,1.3631678144441794,1.3621472133330887,1.3571358892305436,1.3420254362128097,1.356700402657978,1.3774003716842875,1.3939311123239577,1.3852902575298074,1.3712602377198224,1.3589863732798362,1.3648104984121565,1.3628558446881445,1.3677284720198257,1.3681955126336853,1.370803550123402,1.3721802001139096,1.3666048001051476,1.3843044308662902,1.3750733207996526,1.3745753730458332,1.3454618828115383,1.323526353364497,1.349
7243261826126,1.3450224549377963,1.3409591891733503,1.3227777130830927,1.345194812076701,1.3615259803784934,1.3618316741955325,1.3597215454112608,1.351319888071933,1.3395875889894766,1.3448116206056708,1.3431722651744653,1.3498282447764294,1.3243568413320885,1.3110370843065433,1.2810265393598863,1.311147574793741,1.3341439151942225,1.3130251524869747,1.3120924484495151,1.2981314908764754,1.3218444531167466,1.3234179567231508,1.305524267744447,1.3330377856102589,1.3313656482556235,1.3125144445436525,1.3037210257326024,1.3106732545224025,1.3117830041745253,1.2732535423149465,1.2625340390599507,1.2556929591322623,1.2633011930451654,1.2787087935801527,1.2997235017662947,1.316714001630426,1.2974667707357779,1.2961308652945642,1.3168131064257516,1.319381329008386,1.3398827652385104,1.345730244835548,1.3627663798481982,1.354484350629106,1.3246855544268672,1.3340328194709543,1.3237687564347271,1.293894236708979,1.3140100646544424,1.2843246750656392,1.2753612385221285,1.241702681712734,1.2738947831194467,1.2957105690333355,1.3002943714153867,1.3050255736142033,1.2885697602592645,1.2853336248547058,1.2629002690966515,1.272915633012294,1.2541528920113483,1.2640642080104754,1.2809284997019774,1.2803878458787485,1.291850319272691,1.270969525482484,1.261571869676139,1.2791432643164362,1.266363013215172,1.2843658583524666,1.2757377154022769,1.2932425065251787,1.2921469291001648,1.2421817807078444,1.211444720653395,1.2001874344492875,1.190973016414727,1.177682784382825,1.149899493276454,1.137914916870573,1.1240676155728366,1.083931645144157,1.0881830570561448,1.055476668051826,1.0650630782016854,1.0531889952630944,1.0450513802428565,1.0259858894549445,1.003202359496872,0.9702560241509588,0.9517824838316543,0.9238299850753733,0.9145661400695754,0.8944302831411466,0.8908817998225969,0.8762678152208586,0.8760318294346388,0.872637073324282,0.8613572984531834,0.8335067370337079,0.8479446803388074,0.8466104741588992,0.8296404376851378,0.8235142501708965,0.8577439232346737,0.8489097752935
45,0.8611936387325031,0.881217204983849,0.865192804600476,0.8912241273235163,0.8755684252217073,0.8515785463584991,0.8437494274078453,0.8310917791457034,0.8191001038268033,0.8221385573785878,0.8230894375802349,0.8130979423817554,0.8434057929677743,0.8350591935087148,0.8317700247772752,0.8205338690251772,0.8278235714078559,0.8221602197610978,0.8318940490102441,0.8209252760094562,0.8125925624702672,0.8117315961264007,0.8037368579628313,0.7927724842733829,0.8007746008677381,0.803753477724066,0.8130647486683686],"type":"scatter","xaxis":"x","yaxis":"y"},{"mode":"lines+markers","name":"eval_loss","x":[50,100,150,200,250,300,350,400,450,500,550,600],"y":[2.315699815750122,2.2380564212799072,2.264693021774292,2.3124067783355713,2.335228443145752,2.3116097450256348,2.3213632106781006,2.531832218170166,2.571645498275757,2.6166257858276367,3.0347440242767334,2.9689695835113525],"type":"scatter","xaxis":"x2","yaxis":"y2"},{"mode":"lines","name":"eval_loss (EMA span=25)","x":[50,100,150,200,250,300,350,400,450,500,550,600],"y":[2.315699815750122,2.309727246944721,2.3062630757777653,2.3067356682822124,2.3089274201947925,2.3091337528740885,2.3100744803974744,2.327132767918451,2.3459414394843976,2.366763312280032,2.418146443972086,2.460517454705876],"type":"scatter","xaxis":"x2","yaxis":"y2"},{"mode":"lines+markers","name":"blended","x":[50,100,150,200,250,300,350,400,450,500,550,600],"y":[2.2140278027579883,2.133645955143516,2.0402102148851986,1.9463800025172828,1.8865614642710398,1.8901444425775296,1.9208771126964126,1.7174173897827714,1.6759616533321549,1.6016323412407831,1.574823298861292,1.3520409573791146],"type":"scatter","xaxis":"x3","yaxis":"y3"},{"line":{"dash":"dash"},"mode":"lines","name":"blended (EMA 
span=25)","x":[50,100,150,200,250,300,350,400,450,500,550,600],"y":[2.2140278027579883,2.2078445837107212,2.1949496322626043,2.1758288915129644,2.1535775509558936,2.1333134656960198,2.116972207772973,2.086237221773727,2.054677562662837,2.0198279302457562,1.9855968047546437,1.9368617395719108],"type":"scatter","xaxis":"x3","yaxis":"y3"},{"cells":{"values":[["model","dataset","examples_total","examples_train","examples_eval","world_size","effective_batch_size","steps_per_epoch_approx","max_steps","eval_steps","save_steps","learning_rate","warmup_steps","lr_scheduler_type","weight_decay","lora_r","lora_alpha","lora_dropout","best_eval_loss","best_step","best_checkpoint","LR_AUTO_ENABLED","LR_AUTO_USE_N","LR_AUTO_N_REF","LR_AUTO_BASE","LR_AUTO_MULT","LR_AUTO_FINAL","best_blended","best_blended_step"],["unsloth\u002fPhi-4-unsloth-bnb-4bit","Mathieu-Thomas-JOSSET\u002fmichael_abab_conversations_infini_instruct.jsonl","2872","1436","1436","1","8","179.5","2000","50","50","9.95267419777795e-06","10","linear","0.009206070410847844","32","64","0.0","2.2380564212799072","100","outputs\u002fcontinue_r1_from_350_20260112_073729\u002fcheckpoint-100","True","train","1436","1e-05","0.5","5e-06","1.3520409573791146","600"]]},"header":{"values":["Key","Value"]},"type":"table","domain":{"x":[0.766,1.0],"y":[0.0,1.0]}}], 
{"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(17,17,17)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"#f2f5fa"},"error_y":{"color":"#f2f5fa"},"marker":{"line":{"color":"rgb(17,17,17)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"#A2B1C6","gridcolor":"#506784","linecolor":"#506784","minorgridcolor":"#506784","startlinecolor":"#A2B1C6"},"baxis":{"endlinecolor":"#A2B1C6","gridcolor":"#506784","linecolor":"#506784","minorgridcolor":"#506784","startlinecolor":"#A2B1C6"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorsca
le":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattergeo"}],"scattergl":[{"marker":{"line":{"color":"#283442"}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolar"}],"scatter":[{"marker":{"line":{"color":"#283442"}},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333
,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"#506784"},"line":{"color":"rgb(17,17,17)"}},"header":{"fill":{"color":"#2a3f5f"},"line":{"color":"rgb(17,17,17)"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"#f2f5fa","arrowhead":0,"arrowwidth":1},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]],"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]},"colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#f2f5fa"},"geo":{"bgcolor":"rgb(17,17,17)","lakecolor":"rgb(17,17,17)","landcolor":"rgb(17,17,17)","showlakes":true,"showland":true,"subunitcolor":"#506784"},"hoverlabel":{"align":"left"},"hovermode":"closest","mapbox":{"style":"dark"},"paper_bgcolor":"rgb(17,17,17)","plot_bgcolor":"rgb(17,17,17)","polar":{"angularaxis":{"gridcolor":"#506784","linecolor":"#506784","ticks":""},"bgcolor":"rgb(17,17,17)","radialaxis":{"gridcolor":"#506784","linecolor":"#506784","ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(17,17,17)","gridco
lor":"#506784","gridwidth":2,"linecolor":"#506784","showbackground":true,"ticks":"","zerolinecolor":"#C8D4E3"},"yaxis":{"backgroundcolor":"rgb(17,17,17)","gridcolor":"#506784","gridwidth":2,"linecolor":"#506784","showbackground":true,"ticks":"","zerolinecolor":"#C8D4E3"},"zaxis":{"backgroundcolor":"rgb(17,17,17)","gridcolor":"#506784","gridwidth":2,"linecolor":"#506784","showbackground":true,"ticks":"","zerolinecolor":"#C8D4E3"}},"shapedefaults":{"line":{"color":"#f2f5fa"}},"sliderdefaults":{"bgcolor":"#C8D4E3","bordercolor":"rgb(17,17,17)","borderwidth":1,"tickwidth":0},"ternary":{"aaxis":{"gridcolor":"#506784","linecolor":"#506784","ticks":""},"baxis":{"gridcolor":"#506784","linecolor":"#506784","ticks":""},"bgcolor":"rgb(17,17,17)","caxis":{"gridcolor":"#506784","linecolor":"#506784","ticks":""}},"title":{"x":0.05},"updatemenudefaults":{"bgcolor":"#506784","borderwidth":0},"xaxis":{"automargin":true,"gridcolor":"#283442","linecolor":"#506784","ticks":"","title":{"standoff":15},"zerolinecolor":"#283442","zerolinewidth":2},"yaxis":{"automargin":true,"gridcolor":"#283442","linecolor":"#506784","ticks":"","title":{"standoff":15},"zerolinecolor":"#283442","zerolinewidth":2}}},"xaxis":{"anchor":"y","domain":[0.0,0.666],"matches":"x3","showticklabels":false},"yaxis":{"anchor":"x","domain":[0.7133333333333334,1.0],"title":{"text":"loss"}},"xaxis2":{"anchor":"y2","domain":[0.0,0.666],"matches":"x3","showticklabels":false},"yaxis2":{"anchor":"x2","domain":[0.3566666666666667,0.6433333333333333],"title":{"text":"eval_loss"}},"xaxis3":{"anchor":"y3","domain":[0.0,0.666],"title":{"text":"step"}},"yaxis3":{"anchor":"x3","domain":[0.0,0.2866666666666667],"title":{"text":"blended"}},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Training loss","x":0.333,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Run 
spec","x":0.883,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Validation (eval_loss)","x":0.333,"xanchor":"center","xref":"paper","y":0.6433333333333333,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Blended objective (0.75*trainEMA + 0.25*eval)","x":0.333,"xanchor":"center","xref":"paper","y":0.2866666666666667,"yanchor":"bottom","yref":"paper"}],"legend":{"orientation":"h","y":1.02,"x":0},"margin":{"l":70,"r":40,"t":90,"b":70},"hovermode":"x unified","height":900,"hoversubplots":"axis"}, {"responsive": true} ) }; </script> </div>
|
|
|
|
<details><summary>Run meta (quick)</summary><pre>{
|
|
"model": "unsloth/Phi-4-unsloth-bnb-4bit",
|
|
"dataset": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
|
|
"examples_total": 2872,
|
|
"examples_train": 1436,
|
|
"examples_eval": 1436,
|
|
"world_size": 1,
|
|
"effective_batch_size": 8,
|
|
"steps_per_epoch_approx": 179.5,
|
|
"max_steps": 2000,
|
|
"eval_steps": 50,
|
|
"save_steps": 50,
|
|
"learning_rate": 9.95267419777795e-06,
|
|
"warmup_steps": 10,
|
|
"lr_scheduler_type": "linear",
|
|
"weight_decay": 0.009206070410847844,
|
|
"lora_r": 32,
|
|
"lora_alpha": 64,
|
|
"lora_dropout": 0.0,
|
|
"best_checkpoint": "outputs/continue_r1_from_350_20260112_073729/checkpoint-100",
|
|
"LR_AUTO_ENABLED": true,
|
|
"LR_AUTO_USE_N": "train",
|
|
"LR_AUTO_N_REF": 1436,
|
|
"LR_AUTO_BASE": 1e-05,
|
|
"LR_AUTO_MULT": 0.5,
|
|
"LR_AUTO_FINAL": 5e-06,
|
|
"best_step": 100,
|
|
"best_eval_loss": 2.2380564212799072,
|
|
"best_blended": 1.3520409573791146,
|
|
"best_blended_step": 600
|
|
}</pre></details>
|
|
<details><summary>config_snapshot</summary><pre>{
|
|
"MODEL_NAME": "unsloth/Phi-4-unsloth-bnb-4bit",
|
|
"CHAT_TEMPLATE": "phi-4",
|
|
"MAX_SEQ_LENGTH": 2048,
|
|
"LOAD_IN_4BIT": true,
|
|
"DATASET_NAME": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
|
|
"DATASET_SPLIT": "train",
|
|
"PER_DEVICE_TRAIN_BATCH_SIZE": 2,
|
|
"GRADIENT_ACCUMULATION_STEPS": 4,
|
|
"WARMUP_STEPS": 10,
|
|
"MAX_STEPS": 2000,
|
|
"LEARNING_RATE": 9.95267419777795e-06,
|
|
"WEIGHT_DECAY": 0.009206070410847844,
|
|
"LR_SCHEDULER_TYPE": "linear",
|
|
"SEED": 3407,
|
|
"PLOTLY_DARK_MODE": true,
|
|
"PLOTLY_BASE_COLOR": "#00CC96",
|
|
"PLOTLY_EMA_SPAN": 25,
|
|
"LR_AUTO_ENABLED": true,
|
|
"LR_AUTO_USE_N": "train",
|
|
"LR_AUTO_N_REF": 1436,
|
|
"LR_AUTO_BASE": 1e-05,
|
|
"LR_AUTO_MULT": 0.5,
|
|
"LR_AUTO_FINAL": 5e-06
|
|
}</pre></details>
|
|
<details><summary>run_manifest</summary><pre>{
|
|
"model_name": "unsloth/Phi-4-unsloth-bnb-4bit",
|
|
"dataset": {
|
|
"name": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl",
|
|
"split": "train"
|
|
},
|
|
"training": {
|
|
"max_steps": 2000,
|
|
"learning_rate": 9.95267419777795e-06,
|
|
"per_device_train_batch_size": 2,
|
|
"gradient_accumulation_steps": 4,
|
|
"max_seq_length": 2048,
|
|
"seed": 3407,
|
|
"optimizer": "adamw_8bit",
|
|
"lr_scheduler_type": "linear"
|
|
},
|
|
"auto_lr": {
|
|
"enabled": true,
|
|
"use_n": "train",
|
|
"n_ref": 1436,
|
|
"base": 1e-05,
|
|
"mult": 0.5,
|
|
"final": 5e-06
|
|
},
|
|
"best": {
|
|
"checkpoint": "/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100",
|
|
"metric": 2.2380564212799072,
|
|
"metric_name": "eval_loss"
|
|
},
|
|
"plotly": {
|
|
"html": "training_loss_step.html"
|
|
}
|
|
}</pre></details>
|
|
<details><summary>trainer.state.log_history</summary><pre>[
|
|
{
|
|
"loss": 2.2041,
|
|
"grad_norm": 4.026190757751465,
|
|
"learning_rate": 0.0,
|
|
"epoch": 0.005571030640668524,
|
|
"step": 1
|
|
},
|
|
{
|
|
"loss": 2.6901,
|
|
"grad_norm": 1.616629719734192,
|
|
"learning_rate": 1.4636285584967574e-07,
|
|
"epoch": 0.011142061281337047,
|
|
"step": 2
|
|
},
|
|
{
|
|
"loss": 2.6774,
|
|
"grad_norm": 13.836981773376465,
|
|
"learning_rate": 2.927257116993515e-07,
|
|
"epoch": 0.016713091922005572,
|
|
"step": 3
|
|
},
|
|
{
|
|
"loss": 2.4478,
|
|
"grad_norm": 1.857710361480713,
|
|
"learning_rate": 4.3908856754902726e-07,
|
|
"epoch": 0.022284122562674095,
|
|
"step": 4
|
|
},
|
|
{
|
|
"loss": 1.9988,
|
|
"grad_norm": 1.4818029403686523,
|
|
"learning_rate": 5.85451423398703e-07,
|
|
"epoch": 0.027855153203342618,
|
|
"step": 5
|
|
},
|
|
{
|
|
"loss": 2.0358,
|
|
"grad_norm": 1.726440191268921,
|
|
"learning_rate": 7.318142792483787e-07,
|
|
"epoch": 0.033426183844011144,
|
|
"step": 6
|
|
},
|
|
{
|
|
"loss": 2.5824,
|
|
"grad_norm": 2.0604233741760254,
|
|
"learning_rate": 8.781771350980545e-07,
|
|
"epoch": 0.03899721448467967,
|
|
"step": 7
|
|
},
|
|
{
|
|
"loss": 2.3479,
|
|
"grad_norm": 1.7288694381713867,
|
|
"learning_rate": 1.0245399909477302e-06,
|
|
"epoch": 0.04456824512534819,
|
|
"step": 8
|
|
},
|
|
{
|
|
"loss": 2.6387,
|
|
"grad_norm": 1.9069620370864868,
|
|
"learning_rate": 1.170902846797406e-06,
|
|
"epoch": 0.05013927576601671,
|
|
"step": 9
|
|
},
|
|
{
|
|
"loss": 2.6083,
|
|
"grad_norm": 1.4719465970993042,
|
|
"learning_rate": 1.3172657026470817e-06,
|
|
"epoch": 0.055710306406685235,
|
|
"step": 10
|
|
},
|
|
{
|
|
"loss": 2.3015,
|
|
"grad_norm": 1.6306267976760864,
|
|
"learning_rate": 1.4636285584967574e-06,
|
|
"epoch": 0.06128133704735376,
|
|
"step": 11
|
|
},
|
|
{
|
|
"loss": 2.7042,
|
|
"grad_norm": 1.4724116325378418,
|
|
"learning_rate": 1.6099914143464333e-06,
|
|
"epoch": 0.06685236768802229,
|
|
"step": 12
|
|
},
|
|
{
|
|
"loss": 2.5386,
|
|
"grad_norm": 1.5470020771026611,
|
|
"learning_rate": 1.756354270196109e-06,
|
|
"epoch": 0.07242339832869081,
|
|
"step": 13
|
|
},
|
|
{
|
|
"loss": 2.5886,
|
|
"grad_norm": 2.022662401199341,
|
|
"learning_rate": 1.9027171260457846e-06,
|
|
"epoch": 0.07799442896935933,
|
|
"step": 14
|
|
},
|
|
{
|
|
"loss": 2.7168,
|
|
"grad_norm": 1.8387386798858643,
|
|
"learning_rate": 2.0490799818954605e-06,
|
|
"epoch": 0.08356545961002786,
|
|
"step": 15
|
|
},
|
|
{
|
|
"loss": 2.1886,
|
|
"grad_norm": 1.9359395503997803,
|
|
"learning_rate": 2.195442837745136e-06,
|
|
"epoch": 0.08913649025069638,
|
|
"step": 16
|
|
},
|
|
{
|
|
"loss": 2.4191,
|
|
"grad_norm": 1.5662318468093872,
|
|
"learning_rate": 2.341805693594812e-06,
|
|
"epoch": 0.0947075208913649,
|
|
"step": 17
|
|
},
|
|
{
|
|
"loss": 2.2736,
|
|
"grad_norm": 1.7207640409469604,
|
|
"learning_rate": 2.4881685494444876e-06,
|
|
"epoch": 0.10027855153203342,
|
|
"step": 18
|
|
},
|
|
{
|
|
"loss": 2.7336,
|
|
"grad_norm": 1.6225577592849731,
|
|
"learning_rate": 2.6345314052941634e-06,
|
|
"epoch": 0.10584958217270195,
|
|
"step": 19
|
|
},
|
|
{
|
|
"loss": 2.5288,
|
|
"grad_norm": 1.6348892450332642,
|
|
"learning_rate": 2.780894261143839e-06,
|
|
"epoch": 0.11142061281337047,
|
|
"step": 20
|
|
},
|
|
{
|
|
"loss": 2.0214,
|
|
"grad_norm": 1.5059679746627808,
|
|
"learning_rate": 2.927257116993515e-06,
|
|
"epoch": 0.116991643454039,
|
|
"step": 21
|
|
},
|
|
{
|
|
"loss": 2.3708,
|
|
"grad_norm": 1.3699105978012085,
|
|
"learning_rate": 3.073619972843191e-06,
|
|
"epoch": 0.12256267409470752,
|
|
"step": 22
|
|
},
|
|
{
|
|
"loss": 2.5341,
|
|
"grad_norm": 2.241403341293335,
|
|
"learning_rate": 3.2199828286928667e-06,
|
|
"epoch": 0.12813370473537605,
|
|
"step": 23
|
|
},
|
|
{
|
|
"loss": 2.4852,
|
|
"grad_norm": 1.7692517042160034,
|
|
"learning_rate": 3.3663456845425424e-06,
|
|
"epoch": 0.13370473537604458,
|
|
"step": 24
|
|
},
|
|
{
|
|
"loss": 2.4959,
|
|
"grad_norm": 1.9559876918792725,
|
|
"learning_rate": 3.512708540392218e-06,
|
|
"epoch": 0.1392757660167131,
|
|
"step": 25
|
|
},
|
|
{
|
|
"loss": 2.243,
|
|
"grad_norm": 1.7536145448684692,
|
|
"learning_rate": 3.659071396241894e-06,
|
|
"epoch": 0.14484679665738162,
|
|
"step": 26
|
|
},
|
|
{
|
|
"loss": 2.1622,
|
|
"grad_norm": 1.8103671073913574,
|
|
"learning_rate": 3.805434252091569e-06,
|
|
"epoch": 0.15041782729805014,
|
|
"step": 27
|
|
},
|
|
{
|
|
"loss": 2.1827,
|
|
"grad_norm": 1.5473895072937012,
|
|
"learning_rate": 3.951797107941245e-06,
|
|
"epoch": 0.15598885793871867,
|
|
"step": 28
|
|
},
|
|
{
|
|
"loss": 2.3346,
|
|
"grad_norm": 1.7137048244476318,
|
|
"learning_rate": 4.098159963790921e-06,
|
|
"epoch": 0.1615598885793872,
|
|
"step": 29
|
|
},
|
|
{
|
|
"loss": 1.9959,
|
|
"grad_norm": 1.7803088426589966,
|
|
"learning_rate": 4.244522819640596e-06,
|
|
"epoch": 0.1671309192200557,
|
|
"step": 30
|
|
},
|
|
{
|
|
"loss": 2.5963,
|
|
"grad_norm": 1.3565785884857178,
|
|
"learning_rate": 4.390885675490272e-06,
|
|
"epoch": 0.17270194986072424,
|
|
"step": 31
|
|
},
|
|
{
|
|
"loss": 2.3312,
|
|
"grad_norm": 1.4962913990020752,
|
|
"learning_rate": 4.537248531339948e-06,
|
|
"epoch": 0.17827298050139276,
|
|
"step": 32
|
|
},
|
|
{
|
|
"loss": 2.2653,
|
|
"grad_norm": 1.4864503145217896,
|
|
"learning_rate": 4.683611387189624e-06,
|
|
"epoch": 0.18384401114206128,
|
|
"step": 33
|
|
},
|
|
{
|
|
"loss": 2.6019,
|
|
"grad_norm": 2.2222468852996826,
|
|
"learning_rate": 4.829974243039299e-06,
|
|
"epoch": 0.1894150417827298,
|
|
"step": 34
|
|
},
|
|
{
|
|
"loss": 2.2858,
|
|
"grad_norm": 1.6111881732940674,
|
|
"learning_rate": 4.976337098888975e-06,
|
|
"epoch": 0.19498607242339833,
|
|
"step": 35
|
|
},
|
|
{
|
|
"loss": 2.1479,
|
|
"grad_norm": 2.307185173034668,
|
|
"learning_rate": 5.1226999547386506e-06,
|
|
"epoch": 0.20055710306406685,
|
|
"step": 36
|
|
},
|
|
{
|
|
"loss": 2.3149,
|
|
"grad_norm": 1.9714685678482056,
|
|
"learning_rate": 5.269062810588327e-06,
|
|
"epoch": 0.20612813370473537,
|
|
"step": 37
|
|
},
|
|
{
|
|
"loss": 2.4299,
|
|
"grad_norm": 1.6915550231933594,
|
|
"learning_rate": 5.415425666438002e-06,
|
|
"epoch": 0.2116991643454039,
|
|
"step": 38
|
|
},
|
|
{
|
|
"loss": 2.1583,
|
|
"grad_norm": 1.9084649085998535,
|
|
"learning_rate": 5.561788522287678e-06,
|
|
"epoch": 0.21727019498607242,
|
|
"step": 39
|
|
},
|
|
{
|
|
"loss": 2.1631,
|
|
"grad_norm": 1.882629632949829,
|
|
"learning_rate": 5.7081513781373534e-06,
|
|
"epoch": 0.22284122562674094,
|
|
"step": 40
|
|
},
|
|
{
|
|
"loss": 2.0198,
|
|
"grad_norm": 1.335666537284851,
|
|
"learning_rate": 5.85451423398703e-06,
|
|
"epoch": 0.22841225626740946,
|
|
"step": 41
|
|
},
|
|
{
|
|
"loss": 2.3086,
|
|
"grad_norm": 2.620265007019043,
|
|
"learning_rate": 6.000877089836705e-06,
|
|
"epoch": 0.233983286908078,
|
|
"step": 42
|
|
},
|
|
{
|
|
"loss": 2.1388,
|
|
"grad_norm": 2.0138227939605713,
|
|
"learning_rate": 6.147239945686382e-06,
|
|
"epoch": 0.2395543175487465,
|
|
"step": 43
|
|
},
|
|
{
|
|
"loss": 1.8651,
|
|
"grad_norm": 1.6108520030975342,
|
|
"learning_rate": 6.293602801536056e-06,
|
|
"epoch": 0.24512534818941503,
|
|
"step": 44
|
|
},
|
|
{
|
|
"loss": 1.8407,
|
|
"grad_norm": 1.5935970544815063,
|
|
"learning_rate": 6.439965657385733e-06,
|
|
"epoch": 0.25069637883008355,
|
|
"step": 45
|
|
},
|
|
{
|
|
"loss": 1.9392,
|
|
"grad_norm": 1.3794289827346802,
|
|
"learning_rate": 6.586328513235409e-06,
|
|
"epoch": 0.2562674094707521,
|
|
"step": 46
|
|
},
|
|
{
|
|
"loss": 2.4167,
|
|
"grad_norm": 1.23729407787323,
|
|
"learning_rate": 6.732691369085085e-06,
|
|
"epoch": 0.2618384401114206,
|
|
"step": 47
|
|
},
|
|
{
|
|
"loss": 2.2915,
|
|
"grad_norm": 1.4265947341918945,
|
|
"learning_rate": 6.87905422493476e-06,
|
|
"epoch": 0.26740947075208915,
|
|
"step": 48
|
|
},
|
|
{
|
|
"loss": 1.7683,
|
|
"grad_norm": 1.696736216545105,
|
|
"learning_rate": 7.025417080784436e-06,
|
|
"epoch": 0.27298050139275765,
|
|
"step": 49
|
|
},
|
|
{
|
|
"loss": 2.0858,
|
|
"grad_norm": 1.5071961879730225,
|
|
"learning_rate": 7.1717799366341115e-06,
|
|
"epoch": 0.2785515320334262,
|
|
"step": 50
|
|
},
|
|
{
|
|
"eval_loss": 2.315699815750122,
|
|
"eval_runtime": 35.9132,
|
|
"eval_samples_per_second": 39.985,
|
|
"eval_steps_per_second": 2.005,
|
|
"epoch": 0.2785515320334262,
|
|
"step": 50
|
|
},
|
|
{
|
|
"loss": 2.1761,
|
|
"grad_norm": 1.5999361276626587,
|
|
"learning_rate": 7.318142792483788e-06,
|
|
"epoch": 0.2841225626740947,
|
|
"step": 51
|
|
},
|
|
{
|
|
"loss": 2.145,
|
|
"grad_norm": 2.0915520191192627,
|
|
"learning_rate": 7.464505648333463e-06,
|
|
"epoch": 0.28969359331476324,
|
|
"step": 52
|
|
},
|
|
{
|
|
"loss": 1.829,
|
|
"grad_norm": 4.090714931488037,
|
|
"learning_rate": 7.610868504183138e-06,
|
|
"epoch": 0.29526462395543174,
|
|
"step": 53
|
|
},
|
|
{
|
|
"loss": 2.345,
|
|
"grad_norm": 1.6347575187683105,
|
|
"learning_rate": 7.757231360032815e-06,
|
|
"epoch": 0.3008356545961003,
|
|
"step": 54
|
|
},
|
|
{
|
|
"loss": 1.9989,
|
|
"grad_norm": 1.5609041452407837,
|
|
"learning_rate": 7.90359421588249e-06,
|
|
"epoch": 0.3064066852367688,
|
|
"step": 55
|
|
},
|
|
{
|
|
"loss": 2.3336,
|
|
"grad_norm": 1.6561325788497925,
|
|
"learning_rate": 8.049957071732166e-06,
|
|
"epoch": 0.31197771587743733,
|
|
"step": 56
|
|
},
|
|
{
|
|
"loss": 2.0979,
|
|
"grad_norm": 1.6579258441925049,
|
|
"learning_rate": 8.196319927581842e-06,
|
|
"epoch": 0.31754874651810583,
|
|
"step": 57
|
|
},
|
|
{
|
|
"loss": 2.4041,
|
|
"grad_norm": 1.8354761600494385,
|
|
"learning_rate": 8.342682783431518e-06,
|
|
"epoch": 0.3231197771587744,
|
|
"step": 58
|
|
},
|
|
{
|
|
"loss": 2.1076,
|
|
"grad_norm": 1.7043126821517944,
|
|
"learning_rate": 8.489045639281193e-06,
|
|
"epoch": 0.3286908077994429,
|
|
"step": 59
|
|
},
|
|
{
|
|
"loss": 2.2672,
|
|
"grad_norm": 1.5663846731185913,
|
|
"learning_rate": 8.635408495130869e-06,
|
|
"epoch": 0.3342618384401114,
|
|
"step": 60
|
|
},
|
|
{
|
|
"loss": 2.1081,
|
|
"grad_norm": 1.8134770393371582,
|
|
"learning_rate": 8.781771350980545e-06,
|
|
"epoch": 0.3398328690807799,
|
|
"step": 61
|
|
},
|
|
{
|
|
"loss": 2.034,
|
|
"grad_norm": 1.3617796897888184,
|
|
"learning_rate": 8.928134206830221e-06,
|
|
"epoch": 0.34540389972144847,
|
|
"step": 62
|
|
},
|
|
{
|
|
"loss": 2.1147,
|
|
"grad_norm": 1.7525910139083862,
|
|
"learning_rate": 9.074497062679895e-06,
|
|
"epoch": 0.35097493036211697,
|
|
"step": 63
|
|
},
|
|
{
|
|
"loss": 1.9639,
|
|
"grad_norm": 1.7186285257339478,
|
|
"learning_rate": 9.220859918529572e-06,
|
|
"epoch": 0.3565459610027855,
|
|
"step": 64
|
|
},
|
|
{
|
|
"loss": 1.7557,
|
|
"grad_norm": 1.9141530990600586,
|
|
"learning_rate": 9.367222774379248e-06,
|
|
"epoch": 0.362116991643454,
|
|
"step": 65
|
|
},
|
|
{
|
|
"loss": 1.8861,
|
|
"grad_norm": 1.696165680885315,
|
|
"learning_rate": 9.513585630228924e-06,
|
|
"epoch": 0.36768802228412256,
|
|
"step": 66
|
|
},
|
|
{
|
|
"loss": 2.272,
|
|
"grad_norm": 1.24228036403656,
|
|
"learning_rate": 9.659948486078598e-06,
|
|
"epoch": 0.3732590529247911,
|
|
"step": 67
|
|
},
|
|
{
|
|
"loss": 1.7173,
|
|
"grad_norm": 1.9760662317276,
|
|
"learning_rate": 9.806311341928276e-06,
|
|
"epoch": 0.3788300835654596,
|
|
"step": 68
|
|
},
|
|
{
|
|
"loss": 1.8545,
|
|
"grad_norm": 1.3207972049713135,
|
|
"learning_rate": 9.95267419777795e-06,
|
|
"epoch": 0.38440111420612816,
|
|
"step": 69
|
|
},
|
|
{
|
|
"loss": 2.0492,
|
|
"grad_norm": 1.5849637985229492,
|
|
"learning_rate": 1.0099037053627625e-05,
|
|
"epoch": 0.38997214484679665,
|
|
"step": 70
|
|
},
|
|
{
|
|
"loss": 1.9203,
|
|
"grad_norm": 2.7006468772888184,
|
|
"learning_rate": 1.0245399909477301e-05,
|
|
"epoch": 0.3955431754874652,
|
|
"step": 71
|
|
},
|
|
{
|
|
"loss": 2.1317,
|
|
"grad_norm": 1.9178322553634644,
|
|
"learning_rate": 1.0391762765326979e-05,
|
|
"epoch": 0.4011142061281337,
|
|
"step": 72
|
|
},
|
|
{
|
|
"loss": 2.282,
|
|
"grad_norm": 1.5044149160385132,
|
|
"learning_rate": 1.0538125621176653e-05,
|
|
"epoch": 0.40668523676880225,
|
|
"step": 73
|
|
},
|
|
{
|
|
"loss": 2.2023,
|
|
"grad_norm": 1.9386659860610962,
|
|
"learning_rate": 1.068448847702633e-05,
|
|
"epoch": 0.41225626740947074,
|
|
"step": 74
|
|
},
|
|
{
|
|
"loss": 2.3545,
|
|
"grad_norm": 1.3408238887786865,
|
|
"learning_rate": 1.0830851332876004e-05,
|
|
"epoch": 0.4178272980501393,
|
|
"step": 75
|
|
},
|
|
{
|
|
"loss": 2.1984,
|
|
"grad_norm": 2.221109390258789,
|
|
"learning_rate": 1.0977214188725682e-05,
|
|
"epoch": 0.4233983286908078,
|
|
"step": 76
|
|
},
|
|
{
|
|
"loss": 1.9843,
|
|
"grad_norm": 1.7843296527862549,
|
|
"learning_rate": 1.1123577044575356e-05,
|
|
"epoch": 0.42896935933147634,
|
|
"step": 77
|
|
},
|
|
{
|
|
"loss": 2.2899,
|
|
"grad_norm": 1.6259101629257202,
|
|
"learning_rate": 1.1269939900425032e-05,
|
|
"epoch": 0.43454038997214484,
|
|
"step": 78
|
|
},
|
|
{
|
|
"loss": 1.8448,
|
|
"grad_norm": 1.718583345413208,
|
|
"learning_rate": 1.1416302756274707e-05,
|
|
"epoch": 0.4401114206128134,
|
|
"step": 79
|
|
},
|
|
{
|
|
"loss": 1.857,
|
|
"grad_norm": 1.8396937847137451,
|
|
"learning_rate": 1.1562665612124385e-05,
|
|
"epoch": 0.4456824512534819,
|
|
"step": 80
|
|
},
|
|
{
|
|
"loss": 2.1387,
|
|
"grad_norm": 1.808605670928955,
|
|
"learning_rate": 1.170902846797406e-05,
|
|
"epoch": 0.45125348189415043,
|
|
"step": 81
|
|
},
|
|
{
|
|
"loss": 1.9587,
|
|
"grad_norm": 2.590714931488037,
|
|
"learning_rate": 1.1855391323823735e-05,
|
|
"epoch": 0.4568245125348189,
|
|
"step": 82
|
|
},
|
|
{
|
|
"loss": 2.5776,
|
|
"grad_norm": 1.550307273864746,
|
|
"learning_rate": 1.200175417967341e-05,
|
|
"epoch": 0.4623955431754875,
|
|
"step": 83
|
|
},
|
|
{
|
|
"loss": 2.1138,
|
|
"grad_norm": 1.7622662782669067,
|
|
"learning_rate": 1.2148117035523088e-05,
|
|
"epoch": 0.467966573816156,
|
|
"step": 84
|
|
},
|
|
{
|
|
"loss": 2.0632,
|
|
"grad_norm": 2.1933865547180176,
|
|
"learning_rate": 1.2294479891372764e-05,
|
|
"epoch": 0.4735376044568245,
|
|
"step": 85
|
|
},
|
|
{
|
|
"loss": 2.1885,
|
|
"grad_norm": 1.6188870668411255,
|
|
"learning_rate": 1.2440842747222438e-05,
|
|
"epoch": 0.479108635097493,
|
|
"step": 86
|
|
},
|
|
{
|
|
"loss": 2.2515,
|
|
"grad_norm": 1.6533507108688354,
|
|
"learning_rate": 1.2587205603072113e-05,
|
|
"epoch": 0.48467966573816157,
|
|
"step": 87
|
|
},
|
|
{
|
|
"loss": 2.3173,
|
|
"grad_norm": 1.295457363128662,
|
|
"learning_rate": 1.2733568458921789e-05,
|
|
"epoch": 0.49025069637883006,
|
|
"step": 88
|
|
},
|
|
{
|
|
"loss": 1.9536,
|
|
"grad_norm": 1.5764713287353516,
|
|
"learning_rate": 1.2879931314771467e-05,
|
|
"epoch": 0.4958217270194986,
|
|
"step": 89
|
|
},
|
|
{
|
|
"loss": 1.9134,
|
|
"grad_norm": 1.8399816751480103,
|
|
"learning_rate": 1.3026294170621141e-05,
|
|
"epoch": 0.5013927576601671,
|
|
"step": 90
|
|
},
|
|
{
|
|
"loss": 2.2808,
|
|
"grad_norm": 1.7519652843475342,
|
|
"learning_rate": 1.3172657026470817e-05,
|
|
"epoch": 0.5069637883008357,
|
|
"step": 91
|
|
},
|
|
{
|
|
"loss": 2.2055,
|
|
"grad_norm": 1.4549530744552612,
|
|
"learning_rate": 1.3319019882320492e-05,
|
|
"epoch": 0.5125348189415042,
|
|
"step": 92
|
|
},
|
|
{
|
|
"loss": 1.9373,
|
|
"grad_norm": 2.0461559295654297,
|
|
"learning_rate": 1.346538273817017e-05,
|
|
"epoch": 0.5181058495821727,
|
|
"step": 93
|
|
},
|
|
{
|
|
"loss": 1.7517,
|
|
"grad_norm": 1.5427114963531494,
|
|
"learning_rate": 1.3611745594019844e-05,
|
|
"epoch": 0.5236768802228412,
|
|
"step": 94
|
|
},
|
|
{
|
|
"loss": 1.9707,
|
|
"grad_norm": 1.5442962646484375,
|
|
"learning_rate": 1.375810844986952e-05,
|
|
"epoch": 0.5292479108635098,
|
|
"step": 95
|
|
},
|
|
{
|
|
"loss": 1.9629,
|
|
"grad_norm": 1.939523458480835,
|
|
"learning_rate": 1.3904471305719195e-05,
|
|
"epoch": 0.5348189415041783,
|
|
"step": 96
|
|
},
|
|
{
|
|
"loss": 1.8158,
|
|
"grad_norm": 1.9389022588729858,
|
|
"learning_rate": 1.4050834161568872e-05,
|
|
"epoch": 0.5403899721448467,
|
|
"step": 97
|
|
},
|
|
{
|
|
"loss": 2.5133,
|
|
"grad_norm": 1.9970468282699585,
|
|
"learning_rate": 1.4197197017418547e-05,
|
|
"epoch": 0.5459610027855153,
|
|
"step": 98
|
|
},
|
|
{
|
|
"loss": 2.263,
|
|
"grad_norm": 1.5786551237106323,
|
|
"learning_rate": 1.4343559873268223e-05,
|
|
"epoch": 0.5515320334261838,
|
|
"step": 99
|
|
},
|
|
{
|
|
"loss": 2.1035,
|
|
"grad_norm": 2.2139763832092285,
|
|
"learning_rate": 1.4489922729117897e-05,
|
|
"epoch": 0.5571030640668524,
|
|
"step": 100
|
|
},
|
|
{
|
|
"eval_loss": 2.2380564212799072,
|
|
"eval_runtime": 35.9418,
|
|
"eval_samples_per_second": 39.954,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 0.5571030640668524,
|
|
"step": 100
|
|
},
|
|
{
|
|
"loss": 1.8114,
|
|
"grad_norm": 2.2652857303619385,
|
|
"learning_rate": 1.4636285584967575e-05,
|
|
"epoch": 0.5626740947075209,
|
|
"step": 101
|
|
},
|
|
{
|
|
"loss": 2.3733,
|
|
"grad_norm": 1.688621997833252,
|
|
"learning_rate": 1.4782648440817251e-05,
|
|
"epoch": 0.5682451253481894,
|
|
"step": 102
|
|
},
|
|
{
|
|
"loss": 1.929,
|
|
"grad_norm": 2.500704765319824,
|
|
"learning_rate": 1.4929011296666926e-05,
|
|
"epoch": 0.5738161559888579,
|
|
"step": 103
|
|
},
|
|
{
|
|
"loss": 1.9718,
|
|
"grad_norm": 1.492704153060913,
|
|
"learning_rate": 1.50753741525166e-05,
|
|
"epoch": 0.5793871866295265,
|
|
"step": 104
|
|
},
|
|
{
|
|
"loss": 2.1026,
|
|
"grad_norm": 1.6980139017105103,
|
|
"learning_rate": 1.5221737008366276e-05,
|
|
"epoch": 0.584958217270195,
|
|
"step": 105
|
|
},
|
|
{
|
|
"loss": 1.7999,
|
|
"grad_norm": 1.7127199172973633,
|
|
"learning_rate": 1.5368099864215953e-05,
|
|
"epoch": 0.5905292479108635,
|
|
"step": 106
|
|
},
|
|
{
|
|
"loss": 2.0349,
|
|
"grad_norm": 1.8260376453399658,
|
|
"learning_rate": 1.551446272006563e-05,
|
|
"epoch": 0.596100278551532,
|
|
"step": 107
|
|
},
|
|
{
|
|
"loss": 2.2571,
|
|
"grad_norm": 1.8122572898864746,
|
|
"learning_rate": 1.5660825575915305e-05,
|
|
"epoch": 0.6016713091922006,
|
|
"step": 108
|
|
},
|
|
{
|
|
"loss": 2.0794,
|
|
"grad_norm": 2.299410343170166,
|
|
"learning_rate": 1.580718843176498e-05,
|
|
"epoch": 0.6072423398328691,
|
|
"step": 109
|
|
},
|
|
{
|
|
"loss": 2.1671,
|
|
"grad_norm": 1.4942196607589722,
|
|
"learning_rate": 1.5953551287614657e-05,
|
|
"epoch": 0.6128133704735376,
|
|
"step": 110
|
|
},
|
|
{
|
|
"loss": 2.1292,
|
|
"grad_norm": 1.6794716119766235,
|
|
"learning_rate": 1.609991414346433e-05,
|
|
"epoch": 0.6183844011142061,
|
|
"step": 111
|
|
},
|
|
{
|
|
"loss": 2.2534,
|
|
"grad_norm": 1.8196300268173218,
|
|
"learning_rate": 1.6246276999314006e-05,
|
|
"epoch": 0.6239554317548747,
|
|
"step": 112
|
|
},
|
|
{
|
|
"loss": 2.0333,
|
|
"grad_norm": 1.5703504085540771,
|
|
"learning_rate": 1.6392639855163684e-05,
|
|
"epoch": 0.6295264623955432,
|
|
"step": 113
|
|
},
|
|
{
|
|
"loss": 2.1495,
|
|
"grad_norm": 1.766376256942749,
|
|
"learning_rate": 1.6539002711013358e-05,
|
|
"epoch": 0.6350974930362117,
|
|
"step": 114
|
|
},
|
|
{
|
|
"loss": 1.5522,
|
|
"grad_norm": 2.6598968505859375,
|
|
"learning_rate": 1.6685365566863036e-05,
|
|
"epoch": 0.6406685236768802,
|
|
"step": 115
|
|
},
|
|
{
|
|
"loss": 2.2221,
|
|
"grad_norm": 10.731008529663086,
|
|
"learning_rate": 1.683172842271271e-05,
|
|
"epoch": 0.6462395543175488,
|
|
"step": 116
|
|
},
|
|
{
|
|
"loss": 2.4227,
|
|
"grad_norm": 2.2150168418884277,
|
|
"learning_rate": 1.6978091278562385e-05,
|
|
"epoch": 0.6518105849582173,
|
|
"step": 117
|
|
},
|
|
{
|
|
"loss": 2.3886,
|
|
"grad_norm": 2.283031940460205,
|
|
"learning_rate": 1.7124454134412063e-05,
|
|
"epoch": 0.6573816155988857,
|
|
"step": 118
|
|
},
|
|
{
|
|
"loss": 2.0651,
|
|
"grad_norm": 2.6018834114074707,
|
|
"learning_rate": 1.7270816990261737e-05,
|
|
"epoch": 0.6629526462395543,
|
|
"step": 119
|
|
},
|
|
{
|
|
"loss": 1.9465,
|
|
"grad_norm": 1.8486937284469604,
|
|
"learning_rate": 1.7417179846111412e-05,
|
|
"epoch": 0.6685236768802229,
|
|
"step": 120
|
|
},
|
|
{
|
|
"loss": 2.588,
|
|
"grad_norm": 2.0970637798309326,
|
|
"learning_rate": 1.756354270196109e-05,
|
|
"epoch": 0.6740947075208914,
|
|
"step": 121
|
|
},
|
|
{
|
|
"loss": 1.7602,
|
|
"grad_norm": 1.5886075496673584,
|
|
"learning_rate": 1.7709905557810764e-05,
|
|
"epoch": 0.6796657381615598,
|
|
"step": 122
|
|
},
|
|
{
|
|
"loss": 2.2738,
|
|
"grad_norm": 2.4414422512054443,
|
|
"learning_rate": 1.7856268413660442e-05,
|
|
"epoch": 0.6852367688022284,
|
|
"step": 123
|
|
},
|
|
{
|
|
"loss": 1.8145,
|
|
"grad_norm": 1.7890093326568604,
|
|
"learning_rate": 1.8002631269510116e-05,
|
|
"epoch": 0.6908077994428969,
|
|
"step": 124
|
|
},
|
|
{
|
|
"loss": 1.7572,
|
|
"grad_norm": 1.7805349826812744,
|
|
"learning_rate": 1.814899412535979e-05,
|
|
"epoch": 0.6963788300835655,
|
|
"step": 125
|
|
},
|
|
{
|
|
"loss": 2.0206,
|
|
"grad_norm": 1.9520258903503418,
|
|
"learning_rate": 1.829535698120947e-05,
|
|
"epoch": 0.7019498607242339,
|
|
"step": 126
|
|
},
|
|
{
|
|
"loss": 2.1292,
|
|
"grad_norm": 1.6244016885757446,
|
|
"learning_rate": 1.8441719837059143e-05,
|
|
"epoch": 0.7075208913649025,
|
|
"step": 127
|
|
},
|
|
{
|
|
"loss": 2.1207,
|
|
"grad_norm": 1.6681342124938965,
|
|
"learning_rate": 1.858808269290882e-05,
|
|
"epoch": 0.713091922005571,
|
|
"step": 128
|
|
},
|
|
{
|
|
"loss": 2.1515,
|
|
"grad_norm": 2.1032838821411133,
|
|
"learning_rate": 1.8734445548758495e-05,
|
|
"epoch": 0.7186629526462396,
|
|
"step": 129
|
|
},
|
|
{
|
|
"loss": 1.7264,
|
|
"grad_norm": 2.093341588973999,
|
|
"learning_rate": 1.888080840460817e-05,
|
|
"epoch": 0.724233983286908,
|
|
"step": 130
|
|
},
|
|
{
|
|
"loss": 1.6036,
|
|
"grad_norm": 1.9431419372558594,
|
|
"learning_rate": 1.9027171260457848e-05,
|
|
"epoch": 0.7298050139275766,
|
|
"step": 131
|
|
},
|
|
{
|
|
"loss": 2.132,
|
|
"grad_norm": 3.0380795001983643,
|
|
"learning_rate": 1.9173534116307522e-05,
|
|
"epoch": 0.7353760445682451,
|
|
"step": 132
|
|
},
|
|
{
|
|
"loss": 2.061,
|
|
"grad_norm": 3.623516321182251,
|
|
"learning_rate": 1.9319896972157197e-05,
|
|
"epoch": 0.7409470752089137,
|
|
"step": 133
|
|
},
|
|
{
|
|
"loss": 1.8765,
|
|
"grad_norm": 2.320667266845703,
|
|
"learning_rate": 1.9466259828006874e-05,
|
|
"epoch": 0.7465181058495822,
|
|
"step": 134
|
|
},
|
|
{
|
|
"loss": 1.8337,
|
|
"grad_norm": 1.9040995836257935,
|
|
"learning_rate": 1.9612622683856552e-05,
|
|
"epoch": 0.7520891364902507,
|
|
"step": 135
|
|
},
|
|
{
|
|
"loss": 2.0664,
|
|
"grad_norm": 1.8677185773849487,
|
|
"learning_rate": 1.9758985539706227e-05,
|
|
"epoch": 0.7576601671309192,
|
|
"step": 136
|
|
},
|
|
{
|
|
"loss": 1.9651,
|
|
"grad_norm": 2.414144992828369,
|
|
"learning_rate": 1.99053483955559e-05,
|
|
"epoch": 0.7632311977715878,
|
|
"step": 137
|
|
},
|
|
{
|
|
"loss": 1.9243,
|
|
"grad_norm": 2.5697357654571533,
|
|
"learning_rate": 1.989220087349807e-05,
|
|
"epoch": 0.7688022284122563,
|
|
"step": 138
|
|
},
|
|
{
|
|
"loss": 2.0289,
|
|
"grad_norm": 2.384612560272217,
|
|
"learning_rate": 1.987905335144024e-05,
|
|
"epoch": 0.7743732590529248,
|
|
"step": 139
|
|
},
|
|
{
|
|
"loss": 1.7206,
|
|
"grad_norm": 2.284605026245117,
|
|
"learning_rate": 1.986590582938241e-05,
|
|
"epoch": 0.7799442896935933,
|
|
"step": 140
|
|
},
|
|
{
|
|
"loss": 1.7212,
|
|
"grad_norm": 2.3488142490386963,
|
|
"learning_rate": 1.985275830732458e-05,
|
|
"epoch": 0.7855153203342619,
|
|
"step": 141
|
|
},
|
|
{
|
|
"loss": 1.559,
|
|
"grad_norm": 1.849543809890747,
|
|
"learning_rate": 1.983961078526675e-05,
|
|
"epoch": 0.7910863509749304,
|
|
"step": 142
|
|
},
|
|
{
|
|
"loss": 1.9442,
|
|
"grad_norm": 2.343719720840454,
|
|
"learning_rate": 1.9826463263208915e-05,
|
|
"epoch": 0.7966573816155988,
|
|
"step": 143
|
|
},
|
|
{
|
|
"loss": 1.8913,
|
|
"grad_norm": 2.6115176677703857,
|
|
"learning_rate": 1.9813315741151084e-05,
|
|
"epoch": 0.8022284122562674,
|
|
"step": 144
|
|
},
|
|
{
|
|
"loss": 2.1248,
|
|
"grad_norm": 2.703418731689453,
|
|
"learning_rate": 1.9800168219093254e-05,
|
|
"epoch": 0.807799442896936,
|
|
"step": 145
|
|
},
|
|
{
|
|
"loss": 2.1745,
|
|
"grad_norm": 2.379194736480713,
|
|
"learning_rate": 1.9787020697035423e-05,
|
|
"epoch": 0.8133704735376045,
|
|
"step": 146
|
|
},
|
|
{
|
|
"loss": 2.0572,
|
|
"grad_norm": 2.4916770458221436,
|
|
"learning_rate": 1.9773873174977593e-05,
|
|
"epoch": 0.8189415041782729,
|
|
"step": 147
|
|
},
|
|
{
|
|
"loss": 1.8028,
|
|
"grad_norm": 3.7550413608551025,
|
|
"learning_rate": 1.9760725652919762e-05,
|
|
"epoch": 0.8245125348189415,
|
|
"step": 148
|
|
},
|
|
{
|
|
"loss": 2.2202,
|
|
"grad_norm": 1.704113483428955,
|
|
"learning_rate": 1.974757813086193e-05,
|
|
"epoch": 0.83008356545961,
|
|
"step": 149
|
|
},
|
|
{
|
|
"loss": 1.8407,
|
|
"grad_norm": 2.14805269241333,
|
|
"learning_rate": 1.9734430608804098e-05,
|
|
"epoch": 0.8356545961002786,
|
|
"step": 150
|
|
},
|
|
{
|
|
"eval_loss": 2.264693021774292,
|
|
"eval_runtime": 35.9452,
|
|
"eval_samples_per_second": 39.95,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 0.8356545961002786,
|
|
"step": 150
|
|
},
|
|
{
|
|
"loss": 2.3419,
|
|
"grad_norm": 2.3600826263427734,
|
|
"learning_rate": 1.972128308674627e-05,
|
|
"epoch": 0.841225626740947,
|
|
"step": 151
|
|
},
|
|
{
|
|
"loss": 2.0321,
|
|
"grad_norm": 2.7362117767333984,
|
|
"learning_rate": 1.970813556468844e-05,
|
|
"epoch": 0.8467966573816156,
|
|
"step": 152
|
|
},
|
|
{
|
|
"loss": 1.7054,
|
|
"grad_norm": 2.982322931289673,
|
|
"learning_rate": 1.9694988042630607e-05,
|
|
"epoch": 0.8523676880222841,
|
|
"step": 153
|
|
},
|
|
{
|
|
"loss": 1.9304,
|
|
"grad_norm": 2.8210840225219727,
|
|
"learning_rate": 1.9681840520572776e-05,
|
|
"epoch": 0.8579387186629527,
|
|
"step": 154
|
|
},
|
|
{
|
|
"loss": 2.1154,
|
|
"grad_norm": 2.412022113800049,
|
|
"learning_rate": 1.9668692998514946e-05,
|
|
"epoch": 0.8635097493036211,
|
|
"step": 155
|
|
},
|
|
{
|
|
"loss": 2.0367,
|
|
"grad_norm": 2.439105987548828,
|
|
"learning_rate": 1.9655545476457115e-05,
|
|
"epoch": 0.8690807799442897,
|
|
"step": 156
|
|
},
|
|
{
|
|
"loss": 1.5693,
|
|
"grad_norm": 2.276296854019165,
|
|
"learning_rate": 1.9642397954399285e-05,
|
|
"epoch": 0.8746518105849582,
|
|
"step": 157
|
|
},
|
|
{
|
|
"loss": 1.726,
|
|
"grad_norm": 2.12568998336792,
|
|
"learning_rate": 1.9629250432341454e-05,
|
|
"epoch": 0.8802228412256268,
|
|
"step": 158
|
|
},
|
|
{
|
|
"loss": 1.7872,
|
|
"grad_norm": 2.1106767654418945,
|
|
"learning_rate": 1.9616102910283624e-05,
|
|
"epoch": 0.8857938718662952,
|
|
"step": 159
|
|
},
|
|
{
|
|
"loss": 1.9831,
|
|
"grad_norm": 1.9893423318862915,
|
|
"learning_rate": 1.960295538822579e-05,
|
|
"epoch": 0.8913649025069638,
|
|
"step": 160
|
|
},
|
|
{
|
|
"loss": 2.0571,
|
|
"grad_norm": 2.222038984298706,
|
|
"learning_rate": 1.958980786616796e-05,
|
|
"epoch": 0.8969359331476323,
|
|
"step": 161
|
|
},
|
|
{
|
|
"loss": 1.8674,
|
|
"grad_norm": 2.5205395221710205,
|
|
"learning_rate": 1.957666034411013e-05,
|
|
"epoch": 0.9025069637883009,
|
|
"step": 162
|
|
},
|
|
{
|
|
"loss": 1.7982,
|
|
"grad_norm": 2.212405204772949,
|
|
"learning_rate": 1.95635128220523e-05,
|
|
"epoch": 0.9080779944289693,
|
|
"step": 163
|
|
},
|
|
{
|
|
"loss": 1.7992,
|
|
"grad_norm": 2.304945468902588,
|
|
"learning_rate": 1.9550365299994468e-05,
|
|
"epoch": 0.9136490250696379,
|
|
"step": 164
|
|
},
|
|
{
|
|
"loss": 2.2013,
|
|
"grad_norm": 2.8349928855895996,
|
|
"learning_rate": 1.9537217777936638e-05,
|
|
"epoch": 0.9192200557103064,
|
|
"step": 165
|
|
},
|
|
{
|
|
"loss": 1.7908,
|
|
"grad_norm": 2.2040209770202637,
|
|
"learning_rate": 1.9524070255878807e-05,
|
|
"epoch": 0.924791086350975,
|
|
"step": 166
|
|
},
|
|
{
|
|
"loss": 2.1308,
|
|
"grad_norm": 2.550541400909424,
|
|
"learning_rate": 1.9510922733820977e-05,
|
|
"epoch": 0.9303621169916435,
|
|
"step": 167
|
|
},
|
|
{
|
|
"loss": 1.735,
|
|
"grad_norm": 2.9808292388916016,
|
|
"learning_rate": 1.9497775211763146e-05,
|
|
"epoch": 0.935933147632312,
|
|
"step": 168
|
|
},
|
|
{
|
|
"loss": 2.1536,
|
|
"grad_norm": 2.4572677612304688,
|
|
"learning_rate": 1.9484627689705316e-05,
|
|
"epoch": 0.9415041782729805,
|
|
"step": 169
|
|
},
|
|
{
|
|
"loss": 1.8616,
|
|
"grad_norm": 2.414435863494873,
|
|
"learning_rate": 1.9471480167647482e-05,
|
|
"epoch": 0.947075208913649,
|
|
"step": 170
|
|
},
|
|
{
|
|
"loss": 1.9501,
|
|
"grad_norm": 2.490251064300537,
|
|
"learning_rate": 1.945833264558965e-05,
|
|
"epoch": 0.9526462395543176,
|
|
"step": 171
|
|
},
|
|
{
|
|
"loss": 1.7965,
|
|
"grad_norm": 3.2512645721435547,
|
|
"learning_rate": 1.944518512353182e-05,
|
|
"epoch": 0.958217270194986,
|
|
"step": 172
|
|
},
|
|
{
|
|
"loss": 1.8903,
|
|
"grad_norm": 2.0697317123413086,
|
|
"learning_rate": 1.943203760147399e-05,
|
|
"epoch": 0.9637883008356546,
|
|
"step": 173
|
|
},
|
|
{
|
|
"loss": 1.9153,
|
|
"grad_norm": 2.869088888168335,
|
|
"learning_rate": 1.941889007941616e-05,
|
|
"epoch": 0.9693593314763231,
|
|
"step": 174
|
|
},
|
|
{
|
|
"loss": 2.0043,
|
|
"grad_norm": 2.5188841819763184,
|
|
"learning_rate": 1.940574255735833e-05,
|
|
"epoch": 0.9749303621169917,
|
|
"step": 175
|
|
},
|
|
{
|
|
"loss": 2.0106,
|
|
"grad_norm": 2.2558531761169434,
|
|
"learning_rate": 1.93925950353005e-05,
|
|
"epoch": 0.9805013927576601,
|
|
"step": 176
|
|
},
|
|
{
|
|
"loss": 2.0356,
|
|
"grad_norm": 2.78887677192688,
|
|
"learning_rate": 1.9379447513242665e-05,
|
|
"epoch": 0.9860724233983287,
|
|
"step": 177
|
|
},
|
|
{
|
|
"loss": 1.4849,
|
|
"grad_norm": 2.9200024604797363,
|
|
"learning_rate": 1.9366299991184838e-05,
|
|
"epoch": 0.9916434540389972,
|
|
"step": 178
|
|
},
|
|
{
|
|
"loss": 1.6636,
|
|
"grad_norm": 2.443997621536255,
|
|
"learning_rate": 1.9353152469127007e-05,
|
|
"epoch": 0.9972144846796658,
|
|
"step": 179
|
|
},
|
|
{
|
|
"loss": 1.4097,
|
|
"grad_norm": 3.399275779724121,
|
|
"learning_rate": 1.9340004947069174e-05,
|
|
"epoch": 1.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"loss": 1.7611,
|
|
"grad_norm": 2.312861442565918,
|
|
"learning_rate": 1.9326857425011343e-05,
|
|
"epoch": 1.0055710306406684,
|
|
"step": 181
|
|
},
|
|
{
|
|
"loss": 2.0967,
|
|
"grad_norm": 2.799191951751709,
|
|
"learning_rate": 1.9313709902953513e-05,
|
|
"epoch": 1.011142061281337,
|
|
"step": 182
|
|
},
|
|
{
|
|
"loss": 2.3108,
|
|
"grad_norm": 2.4845213890075684,
|
|
"learning_rate": 1.9300562380895682e-05,
|
|
"epoch": 1.0167130919220055,
|
|
"step": 183
|
|
},
|
|
{
|
|
"loss": 1.6263,
|
|
"grad_norm": 2.72027325630188,
|
|
"learning_rate": 1.928741485883785e-05,
|
|
"epoch": 1.0222841225626742,
|
|
"step": 184
|
|
},
|
|
{
|
|
"loss": 1.6228,
|
|
"grad_norm": 3.2783589363098145,
|
|
"learning_rate": 1.927426733678002e-05,
|
|
"epoch": 1.0278551532033426,
|
|
"step": 185
|
|
},
|
|
{
|
|
"loss": 1.9384,
|
|
"grad_norm": 2.455291986465454,
|
|
"learning_rate": 1.926111981472219e-05,
|
|
"epoch": 1.033426183844011,
|
|
"step": 186
|
|
},
|
|
{
|
|
"loss": 1.5646,
|
|
"grad_norm": 2.2230939865112305,
|
|
"learning_rate": 1.9247972292664357e-05,
|
|
"epoch": 1.0389972144846797,
|
|
"step": 187
|
|
},
|
|
{
|
|
"loss": 1.8545,
|
|
"grad_norm": 2.596928119659424,
|
|
"learning_rate": 1.9234824770606526e-05,
|
|
"epoch": 1.0445682451253482,
|
|
"step": 188
|
|
},
|
|
{
|
|
"loss": 1.8436,
|
|
"grad_norm": 2.5703697204589844,
|
|
"learning_rate": 1.9221677248548696e-05,
|
|
"epoch": 1.0501392757660166,
|
|
"step": 189
|
|
},
|
|
{
|
|
"loss": 2.2574,
|
|
"grad_norm": 3.021871566772461,
|
|
"learning_rate": 1.920852972649087e-05,
|
|
"epoch": 1.0557103064066853,
|
|
"step": 190
|
|
},
|
|
{
|
|
"loss": 1.8046,
|
|
"grad_norm": 2.35603404045105,
|
|
"learning_rate": 1.9195382204433035e-05,
|
|
"epoch": 1.0612813370473537,
|
|
"step": 191
|
|
},
|
|
{
|
|
"loss": 1.6635,
|
|
"grad_norm": 2.453967809677124,
|
|
"learning_rate": 1.9182234682375204e-05,
|
|
"epoch": 1.0668523676880224,
|
|
"step": 192
|
|
},
|
|
{
|
|
"loss": 1.9118,
|
|
"grad_norm": 3.2305331230163574,
|
|
"learning_rate": 1.9169087160317374e-05,
|
|
"epoch": 1.0724233983286908,
|
|
"step": 193
|
|
},
|
|
{
|
|
"loss": 1.5529,
|
|
"grad_norm": 2.248871326446533,
|
|
"learning_rate": 1.9155939638259543e-05,
|
|
"epoch": 1.0779944289693593,
|
|
"step": 194
|
|
},
|
|
{
|
|
"loss": 1.7932,
|
|
"grad_norm": 3.0331363677978516,
|
|
"learning_rate": 1.9142792116201713e-05,
|
|
"epoch": 1.083565459610028,
|
|
"step": 195
|
|
},
|
|
{
|
|
"loss": 1.7632,
|
|
"grad_norm": 3.543948173522949,
|
|
"learning_rate": 1.9129644594143882e-05,
|
|
"epoch": 1.0891364902506964,
|
|
"step": 196
|
|
},
|
|
{
|
|
"loss": 1.8788,
|
|
"grad_norm": 3.4173591136932373,
|
|
"learning_rate": 1.911649707208605e-05,
|
|
"epoch": 1.0947075208913648,
|
|
"step": 197
|
|
},
|
|
{
|
|
"loss": 2.0881,
|
|
"grad_norm": 3.4639406204223633,
|
|
"learning_rate": 1.9103349550028218e-05,
|
|
"epoch": 1.1002785515320335,
|
|
"step": 198
|
|
},
|
|
{
|
|
"loss": 1.9197,
|
|
"grad_norm": 3.6082725524902344,
|
|
"learning_rate": 1.9090202027970388e-05,
|
|
"epoch": 1.105849582172702,
|
|
"step": 199
|
|
},
|
|
{
|
|
"loss": 1.4541,
|
|
"grad_norm": 2.834181070327759,
|
|
"learning_rate": 1.9077054505912557e-05,
|
|
"epoch": 1.1114206128133706,
|
|
"step": 200
|
|
},
|
|
{
|
|
"eval_loss": 2.3124067783355713,
|
|
"eval_runtime": 35.9422,
|
|
"eval_samples_per_second": 39.953,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 1.1114206128133706,
|
|
"step": 200
|
|
},
|
|
{
|
|
"loss": 1.8766,
|
|
"grad_norm": 2.44728422164917,
|
|
"learning_rate": 1.9063906983854727e-05,
|
|
"epoch": 1.116991643454039,
|
|
"step": 201
|
|
},
|
|
{
|
|
"loss": 1.8877,
|
|
"grad_norm": 3.1577866077423096,
|
|
"learning_rate": 1.9050759461796896e-05,
|
|
"epoch": 1.1225626740947074,
|
|
"step": 202
|
|
},
|
|
{
|
|
"loss": 1.6045,
|
|
"grad_norm": 3.5458521842956543,
|
|
"learning_rate": 1.9037611939739066e-05,
|
|
"epoch": 1.128133704735376,
|
|
"step": 203
|
|
},
|
|
{
|
|
"loss": 1.705,
|
|
"grad_norm": 2.496349811553955,
|
|
"learning_rate": 1.9024464417681232e-05,
|
|
"epoch": 1.1337047353760445,
|
|
"step": 204
|
|
},
|
|
{
|
|
"loss": 1.6478,
|
|
"grad_norm": 3.2897088527679443,
|
|
"learning_rate": 1.9011316895623405e-05,
|
|
"epoch": 1.1392757660167132,
|
|
"step": 205
|
|
},
|
|
{
|
|
"loss": 1.6703,
|
|
"grad_norm": 3.1694509983062744,
|
|
"learning_rate": 1.8998169373565574e-05,
|
|
"epoch": 1.1448467966573816,
|
|
"step": 206
|
|
},
|
|
{
|
|
"loss": 2.0232,
|
|
"grad_norm": 2.8644907474517822,
|
|
"learning_rate": 1.8985021851507744e-05,
|
|
"epoch": 1.15041782729805,
|
|
"step": 207
|
|
},
|
|
{
|
|
"loss": 1.581,
|
|
"grad_norm": 2.930053472518921,
|
|
"learning_rate": 1.897187432944991e-05,
|
|
"epoch": 1.1559888579387188,
|
|
"step": 208
|
|
},
|
|
{
|
|
"loss": 1.6617,
|
|
"grad_norm": 2.9067940711975098,
|
|
"learning_rate": 1.895872680739208e-05,
|
|
"epoch": 1.1615598885793872,
|
|
"step": 209
|
|
},
|
|
{
|
|
"loss": 2.173,
|
|
"grad_norm": 3.746903419494629,
|
|
"learning_rate": 1.894557928533425e-05,
|
|
"epoch": 1.1671309192200556,
|
|
"step": 210
|
|
},
|
|
{
|
|
"loss": 1.5917,
|
|
"grad_norm": 4.83465576171875,
|
|
"learning_rate": 1.893243176327642e-05,
|
|
"epoch": 1.1727019498607243,
|
|
"step": 211
|
|
},
|
|
{
|
|
"loss": 1.531,
|
|
"grad_norm": 3.0352439880371094,
|
|
"learning_rate": 1.8919284241218588e-05,
|
|
"epoch": 1.1782729805013927,
|
|
"step": 212
|
|
},
|
|
{
|
|
"loss": 2.193,
|
|
"grad_norm": 2.738152027130127,
|
|
"learning_rate": 1.8906136719160758e-05,
|
|
"epoch": 1.1838440111420612,
|
|
"step": 213
|
|
},
|
|
{
|
|
"loss": 1.9884,
|
|
"grad_norm": 3.005979061126709,
|
|
"learning_rate": 1.8892989197102927e-05,
|
|
"epoch": 1.1894150417827298,
|
|
"step": 214
|
|
},
|
|
{
|
|
"loss": 1.8659,
|
|
"grad_norm": 3.930433750152588,
|
|
"learning_rate": 1.8879841675045093e-05,
|
|
"epoch": 1.1949860724233983,
|
|
"step": 215
|
|
},
|
|
{
|
|
"loss": 2.191,
|
|
"grad_norm": 3.3943190574645996,
|
|
"learning_rate": 1.8866694152987263e-05,
|
|
"epoch": 1.200557103064067,
|
|
"step": 216
|
|
},
|
|
{
|
|
"loss": 2.1538,
|
|
"grad_norm": 3.4692654609680176,
|
|
"learning_rate": 1.8853546630929436e-05,
|
|
"epoch": 1.2061281337047354,
|
|
"step": 217
|
|
},
|
|
{
|
|
"loss": 1.9939,
|
|
"grad_norm": 2.889341354370117,
|
|
"learning_rate": 1.8840399108871602e-05,
|
|
"epoch": 1.2116991643454038,
|
|
"step": 218
|
|
},
|
|
{
|
|
"loss": 2.1667,
|
|
"grad_norm": 3.123650550842285,
|
|
"learning_rate": 1.882725158681377e-05,
|
|
"epoch": 1.2172701949860725,
|
|
"step": 219
|
|
},
|
|
{
|
|
"loss": 1.9743,
|
|
"grad_norm": 2.6485071182250977,
|
|
"learning_rate": 1.881410406475594e-05,
|
|
"epoch": 1.222841225626741,
|
|
"step": 220
|
|
},
|
|
{
|
|
"loss": 1.5679,
|
|
"grad_norm": 3.791811227798462,
|
|
"learning_rate": 1.880095654269811e-05,
|
|
"epoch": 1.2284122562674096,
|
|
"step": 221
|
|
},
|
|
{
|
|
"loss": 2.1841,
|
|
"grad_norm": 3.286864757537842,
|
|
"learning_rate": 1.878780902064028e-05,
|
|
"epoch": 1.233983286908078,
|
|
"step": 222
|
|
},
|
|
{
|
|
"loss": 2.1165,
|
|
"grad_norm": 2.930072784423828,
|
|
"learning_rate": 1.877466149858245e-05,
|
|
"epoch": 1.2395543175487465,
|
|
"step": 223
|
|
},
|
|
{
|
|
"loss": 1.7816,
|
|
"grad_norm": 2.936857223510742,
|
|
"learning_rate": 1.876151397652462e-05,
|
|
"epoch": 1.2451253481894151,
|
|
"step": 224
|
|
},
|
|
{
|
|
"loss": 1.702,
|
|
"grad_norm": 2.3516695499420166,
|
|
"learning_rate": 1.8748366454466785e-05,
|
|
"epoch": 1.2506963788300836,
|
|
"step": 225
|
|
},
|
|
{
|
|
"loss": 1.7222,
|
|
"grad_norm": 3.2817559242248535,
|
|
"learning_rate": 1.8735218932408955e-05,
|
|
"epoch": 1.2562674094707522,
|
|
"step": 226
|
|
},
|
|
{
|
|
"loss": 1.7151,
|
|
"grad_norm": 2.987518548965454,
|
|
"learning_rate": 1.8722071410351124e-05,
|
|
"epoch": 1.2618384401114207,
|
|
"step": 227
|
|
},
|
|
{
|
|
"loss": 2.0545,
|
|
"grad_norm": 3.132258415222168,
|
|
"learning_rate": 1.8708923888293294e-05,
|
|
"epoch": 1.267409470752089,
|
|
"step": 228
|
|
},
|
|
{
|
|
"loss": 1.4732,
|
|
"grad_norm": 3.2233877182006836,
|
|
"learning_rate": 1.8695776366235463e-05,
|
|
"epoch": 1.2729805013927575,
|
|
"step": 229
|
|
},
|
|
{
|
|
"loss": 1.7168,
|
|
"grad_norm": 3.2920405864715576,
|
|
"learning_rate": 1.8682628844177633e-05,
|
|
"epoch": 1.2785515320334262,
|
|
"step": 230
|
|
},
|
|
{
|
|
"loss": 1.497,
|
|
"grad_norm": 2.536219596862793,
|
|
"learning_rate": 1.8669481322119802e-05,
|
|
"epoch": 1.2841225626740946,
|
|
"step": 231
|
|
},
|
|
{
|
|
"loss": 1.8177,
|
|
"grad_norm": 4.246109485626221,
|
|
"learning_rate": 1.865633380006197e-05,
|
|
"epoch": 1.2896935933147633,
|
|
"step": 232
|
|
},
|
|
{
|
|
"loss": 1.9449,
|
|
"grad_norm": 2.6518428325653076,
|
|
"learning_rate": 1.864318627800414e-05,
|
|
"epoch": 1.2952646239554317,
|
|
"step": 233
|
|
},
|
|
{
|
|
"loss": 1.4673,
|
|
"grad_norm": 3.7276058197021484,
|
|
"learning_rate": 1.863003875594631e-05,
|
|
"epoch": 1.3008356545961002,
|
|
"step": 234
|
|
},
|
|
{
|
|
"loss": 1.865,
|
|
"grad_norm": 3.2901997566223145,
|
|
"learning_rate": 1.8616891233888477e-05,
|
|
"epoch": 1.3064066852367688,
|
|
"step": 235
|
|
},
|
|
{
|
|
"loss": 1.7449,
|
|
"grad_norm": 2.6417624950408936,
|
|
"learning_rate": 1.8603743711830646e-05,
|
|
"epoch": 1.3119777158774373,
|
|
"step": 236
|
|
},
|
|
{
|
|
"loss": 2.0348,
|
|
"grad_norm": 3.81978702545166,
|
|
"learning_rate": 1.8590596189772816e-05,
|
|
"epoch": 1.317548746518106,
|
|
"step": 237
|
|
},
|
|
{
|
|
"loss": 1.8765,
|
|
"grad_norm": 2.615661382675171,
|
|
"learning_rate": 1.8577448667714985e-05,
|
|
"epoch": 1.3231197771587744,
|
|
"step": 238
|
|
},
|
|
{
|
|
"loss": 1.7559,
|
|
"grad_norm": 3.2889416217803955,
|
|
"learning_rate": 1.8564301145657155e-05,
|
|
"epoch": 1.3286908077994428,
|
|
"step": 239
|
|
},
|
|
{
|
|
"loss": 1.9031,
|
|
"grad_norm": 4.006824970245361,
|
|
"learning_rate": 1.8551153623599324e-05,
|
|
"epoch": 1.3342618384401115,
|
|
"step": 240
|
|
},
|
|
{
|
|
"loss": 1.6102,
|
|
"grad_norm": 3.3491599559783936,
|
|
"learning_rate": 1.8538006101541494e-05,
|
|
"epoch": 1.33983286908078,
|
|
"step": 241
|
|
},
|
|
{
|
|
"loss": 2.0716,
|
|
"grad_norm": 3.2669260501861572,
|
|
"learning_rate": 1.852485857948366e-05,
|
|
"epoch": 1.3454038997214486,
|
|
"step": 242
|
|
},
|
|
{
|
|
"loss": 2.0298,
|
|
"grad_norm": 4.218564510345459,
|
|
"learning_rate": 1.851171105742583e-05,
|
|
"epoch": 1.350974930362117,
|
|
"step": 243
|
|
},
|
|
{
|
|
"loss": 1.9911,
|
|
"grad_norm": 3.5515315532684326,
|
|
"learning_rate": 1.8498563535368003e-05,
|
|
"epoch": 1.3565459610027855,
|
|
"step": 244
|
|
},
|
|
{
|
|
"loss": 1.3477,
|
|
"grad_norm": 4.0060343742370605,
|
|
"learning_rate": 1.848541601331017e-05,
|
|
"epoch": 1.362116991643454,
|
|
"step": 245
|
|
},
|
|
{
|
|
"loss": 1.4686,
|
|
"grad_norm": 3.574927568435669,
|
|
"learning_rate": 1.8472268491252338e-05,
|
|
"epoch": 1.3676880222841226,
|
|
"step": 246
|
|
},
|
|
{
|
|
"loss": 1.617,
|
|
"grad_norm": 3.4316840171813965,
|
|
"learning_rate": 1.8459120969194508e-05,
|
|
"epoch": 1.3732590529247912,
|
|
"step": 247
|
|
},
|
|
{
|
|
"loss": 1.6593,
|
|
"grad_norm": 3.2629754543304443,
|
|
"learning_rate": 1.8445973447136677e-05,
|
|
"epoch": 1.3788300835654597,
|
|
"step": 248
|
|
},
|
|
{
|
|
"loss": 1.2608,
|
|
"grad_norm": 3.133815050125122,
|
|
"learning_rate": 1.8432825925078847e-05,
|
|
"epoch": 1.384401114206128,
|
|
"step": 249
|
|
},
|
|
{
|
|
"loss": 1.8523,
|
|
"grad_norm": 3.742141008377075,
|
|
"learning_rate": 1.8419678403021016e-05,
|
|
"epoch": 1.3899721448467965,
|
|
"step": 250
|
|
},
|
|
{
|
|
"eval_loss": 2.335228443145752,
|
|
"eval_runtime": 35.9668,
|
|
"eval_samples_per_second": 39.926,
|
|
"eval_steps_per_second": 2.002,
|
|
"epoch": 1.3899721448467965,
|
|
"step": 250
|
|
},
|
|
{
|
|
"loss": 1.7768,
|
|
"grad_norm": 3.9163429737091064,
|
|
"learning_rate": 1.8406530880963186e-05,
|
|
"epoch": 1.3955431754874652,
|
|
"step": 251
|
|
},
|
|
{
|
|
"loss": 1.7455,
|
|
"grad_norm": 3.3456947803497314,
|
|
"learning_rate": 1.8393383358905352e-05,
|
|
"epoch": 1.4011142061281336,
|
|
"step": 252
|
|
},
|
|
{
|
|
"loss": 1.7103,
|
|
"grad_norm": 4.220420837402344,
|
|
"learning_rate": 1.838023583684752e-05,
|
|
"epoch": 1.4066852367688023,
|
|
"step": 253
|
|
},
|
|
{
|
|
"loss": 2.0054,
|
|
"grad_norm": 4.233839511871338,
|
|
"learning_rate": 1.836708831478969e-05,
|
|
"epoch": 1.4122562674094707,
|
|
"step": 254
|
|
},
|
|
{
|
|
"loss": 1.7175,
|
|
"grad_norm": 3.703934669494629,
|
|
"learning_rate": 1.8353940792731864e-05,
|
|
"epoch": 1.4178272980501392,
|
|
"step": 255
|
|
},
|
|
{
|
|
"loss": 1.7225,
|
|
"grad_norm": 4.210822105407715,
|
|
"learning_rate": 1.834079327067403e-05,
|
|
"epoch": 1.4233983286908078,
|
|
"step": 256
|
|
},
|
|
{
|
|
"loss": 1.6882,
|
|
"grad_norm": 3.8861896991729736,
|
|
"learning_rate": 1.83276457486162e-05,
|
|
"epoch": 1.4289693593314763,
|
|
"step": 257
|
|
},
|
|
{
|
|
"loss": 2.0721,
|
|
"grad_norm": 4.4140424728393555,
|
|
"learning_rate": 1.831449822655837e-05,
|
|
"epoch": 1.434540389972145,
|
|
"step": 258
|
|
},
|
|
{
|
|
"loss": 1.6198,
|
|
"grad_norm": 3.1098673343658447,
|
|
"learning_rate": 1.830135070450054e-05,
|
|
"epoch": 1.4401114206128134,
|
|
"step": 259
|
|
},
|
|
{
|
|
"loss": 1.9632,
|
|
"grad_norm": 2.9485561847686768,
|
|
"learning_rate": 1.8288203182442708e-05,
|
|
"epoch": 1.4456824512534818,
|
|
"step": 260
|
|
},
|
|
{
|
|
"loss": 1.9262,
|
|
"grad_norm": 3.842655658721924,
|
|
"learning_rate": 1.8275055660384878e-05,
|
|
"epoch": 1.4512534818941505,
|
|
"step": 261
|
|
},
|
|
{
|
|
"loss": 2.0807,
|
|
"grad_norm": 4.122529983520508,
|
|
"learning_rate": 1.8261908138327047e-05,
|
|
"epoch": 1.456824512534819,
|
|
"step": 262
|
|
},
|
|
{
|
|
"loss": 2.0099,
|
|
"grad_norm": 3.6181795597076416,
|
|
"learning_rate": 1.8248760616269213e-05,
|
|
"epoch": 1.4623955431754876,
|
|
"step": 263
|
|
},
|
|
{
|
|
"loss": 1.7435,
|
|
"grad_norm": 3.9433975219726562,
|
|
"learning_rate": 1.8235613094211383e-05,
|
|
"epoch": 1.467966573816156,
|
|
"step": 264
|
|
},
|
|
{
|
|
"loss": 1.4648,
|
|
"grad_norm": 5.496665000915527,
|
|
"learning_rate": 1.8222465572153552e-05,
|
|
"epoch": 1.4735376044568245,
|
|
"step": 265
|
|
},
|
|
{
|
|
"loss": 2.106,
|
|
"grad_norm": 3.3920114040374756,
|
|
"learning_rate": 1.8209318050095722e-05,
|
|
"epoch": 1.479108635097493,
|
|
"step": 266
|
|
},
|
|
{
|
|
"loss": 1.4486,
|
|
"grad_norm": 4.195888519287109,
|
|
"learning_rate": 1.819617052803789e-05,
|
|
"epoch": 1.4846796657381616,
|
|
"step": 267
|
|
},
|
|
{
|
|
"loss": 1.4996,
|
|
"grad_norm": 3.5301265716552734,
|
|
"learning_rate": 1.818302300598006e-05,
|
|
"epoch": 1.49025069637883,
|
|
"step": 268
|
|
},
|
|
{
|
|
"loss": 1.8247,
|
|
"grad_norm": 3.3157520294189453,
|
|
"learning_rate": 1.8169875483922227e-05,
|
|
"epoch": 1.4958217270194987,
|
|
"step": 269
|
|
},
|
|
{
|
|
"loss": 1.6092,
|
|
"grad_norm": 4.3797383308410645,
|
|
"learning_rate": 1.8156727961864397e-05,
|
|
"epoch": 1.501392757660167,
|
|
"step": 270
|
|
},
|
|
{
|
|
"loss": 1.6071,
|
|
"grad_norm": 3.3917229175567627,
|
|
"learning_rate": 1.814358043980657e-05,
|
|
"epoch": 1.5069637883008355,
|
|
"step": 271
|
|
},
|
|
{
|
|
"loss": 1.9553,
|
|
"grad_norm": 3.171808958053589,
|
|
"learning_rate": 1.813043291774874e-05,
|
|
"epoch": 1.5125348189415042,
|
|
"step": 272
|
|
},
|
|
{
|
|
"loss": 1.8105,
|
|
"grad_norm": 3.1904940605163574,
|
|
"learning_rate": 1.8117285395690905e-05,
|
|
"epoch": 1.5181058495821727,
|
|
"step": 273
|
|
},
|
|
{
|
|
"loss": 1.5718,
|
|
"grad_norm": 3.7544777393341064,
|
|
"learning_rate": 1.8104137873633075e-05,
|
|
"epoch": 1.5236768802228413,
|
|
"step": 274
|
|
},
|
|
{
|
|
"loss": 1.9999,
|
|
"grad_norm": 4.143693923950195,
|
|
"learning_rate": 1.8090990351575244e-05,
|
|
"epoch": 1.5292479108635098,
|
|
"step": 275
|
|
},
|
|
{
|
|
"loss": 2.0393,
|
|
"grad_norm": 3.505359411239624,
|
|
"learning_rate": 1.8077842829517414e-05,
|
|
"epoch": 1.5348189415041782,
|
|
"step": 276
|
|
},
|
|
{
|
|
"loss": 1.6101,
|
|
"grad_norm": 4.118677139282227,
|
|
"learning_rate": 1.8064695307459583e-05,
|
|
"epoch": 1.5403899721448466,
|
|
"step": 277
|
|
},
|
|
{
|
|
"loss": 1.6718,
|
|
"grad_norm": 4.947996139526367,
|
|
"learning_rate": 1.8051547785401753e-05,
|
|
"epoch": 1.5459610027855153,
|
|
"step": 278
|
|
},
|
|
{
|
|
"loss": 2.2007,
|
|
"grad_norm": 4.226828575134277,
|
|
"learning_rate": 1.8038400263343922e-05,
|
|
"epoch": 1.551532033426184,
|
|
"step": 279
|
|
},
|
|
{
|
|
"loss": 1.7025,
|
|
"grad_norm": 4.085235118865967,
|
|
"learning_rate": 1.802525274128609e-05,
|
|
"epoch": 1.5571030640668524,
|
|
"step": 280
|
|
},
|
|
{
|
|
"loss": 1.7632,
|
|
"grad_norm": 3.5451292991638184,
|
|
"learning_rate": 1.8012105219228258e-05,
|
|
"epoch": 1.5626740947075208,
|
|
"step": 281
|
|
},
|
|
{
|
|
"loss": 1.4975,
|
|
"grad_norm": 5.2698540687561035,
|
|
"learning_rate": 1.799895769717043e-05,
|
|
"epoch": 1.5682451253481893,
|
|
"step": 282
|
|
},
|
|
{
|
|
"loss": 1.2189,
|
|
"grad_norm": 3.662693738937378,
|
|
"learning_rate": 1.7985810175112597e-05,
|
|
"epoch": 1.573816155988858,
|
|
"step": 283
|
|
},
|
|
{
|
|
"loss": 2.1889,
|
|
"grad_norm": 3.9369843006134033,
|
|
"learning_rate": 1.7972662653054766e-05,
|
|
"epoch": 1.5793871866295266,
|
|
"step": 284
|
|
},
|
|
{
|
|
"loss": 1.782,
|
|
"grad_norm": 5.153691291809082,
|
|
"learning_rate": 1.7959515130996936e-05,
|
|
"epoch": 1.584958217270195,
|
|
"step": 285
|
|
},
|
|
{
|
|
"loss": 1.7055,
|
|
"grad_norm": 3.5153331756591797,
|
|
"learning_rate": 1.7946367608939105e-05,
|
|
"epoch": 1.5905292479108635,
|
|
"step": 286
|
|
},
|
|
{
|
|
"loss": 2.0713,
|
|
"grad_norm": 3.8740577697753906,
|
|
"learning_rate": 1.7933220086881275e-05,
|
|
"epoch": 1.596100278551532,
|
|
"step": 287
|
|
},
|
|
{
|
|
"loss": 1.6159,
|
|
"grad_norm": 2.977501153945923,
|
|
"learning_rate": 1.7920072564823445e-05,
|
|
"epoch": 1.6016713091922006,
|
|
"step": 288
|
|
},
|
|
{
|
|
"loss": 2.0388,
|
|
"grad_norm": 4.873539447784424,
|
|
"learning_rate": 1.7906925042765614e-05,
|
|
"epoch": 1.6072423398328692,
|
|
"step": 289
|
|
},
|
|
{
|
|
"loss": 1.7656,
|
|
"grad_norm": 3.6297993659973145,
|
|
"learning_rate": 1.789377752070778e-05,
|
|
"epoch": 1.6128133704735377,
|
|
"step": 290
|
|
},
|
|
{
|
|
"loss": 1.9818,
|
|
"grad_norm": 2.868178367614746,
|
|
"learning_rate": 1.788062999864995e-05,
|
|
"epoch": 1.6183844011142061,
|
|
"step": 291
|
|
},
|
|
{
|
|
"loss": 1.6421,
|
|
"grad_norm": 4.532885551452637,
|
|
"learning_rate": 1.786748247659212e-05,
|
|
"epoch": 1.6239554317548746,
|
|
"step": 292
|
|
},
|
|
{
|
|
"loss": 1.653,
|
|
"grad_norm": 5.63344669342041,
|
|
"learning_rate": 1.785433495453429e-05,
|
|
"epoch": 1.6295264623955432,
|
|
"step": 293
|
|
},
|
|
{
|
|
"loss": 1.8727,
|
|
"grad_norm": 4.235146999359131,
|
|
"learning_rate": 1.7841187432476458e-05,
|
|
"epoch": 1.6350974930362117,
|
|
"step": 294
|
|
},
|
|
{
|
|
"loss": 1.3509,
|
|
"grad_norm": 4.512764930725098,
|
|
"learning_rate": 1.7828039910418628e-05,
|
|
"epoch": 1.6406685236768803,
|
|
"step": 295
|
|
},
|
|
{
|
|
"loss": 1.7836,
|
|
"grad_norm": 3.72898268699646,
|
|
"learning_rate": 1.7814892388360797e-05,
|
|
"epoch": 1.6462395543175488,
|
|
"step": 296
|
|
},
|
|
{
|
|
"loss": 1.6315,
|
|
"grad_norm": 3.1936659812927246,
|
|
"learning_rate": 1.7801744866302963e-05,
|
|
"epoch": 1.6518105849582172,
|
|
"step": 297
|
|
},
|
|
{
|
|
"loss": 1.9805,
|
|
"grad_norm": 3.1188321113586426,
|
|
"learning_rate": 1.7788597344245136e-05,
|
|
"epoch": 1.6573816155988856,
|
|
"step": 298
|
|
},
|
|
{
|
|
"loss": 1.8716,
|
|
"grad_norm": 4.88875150680542,
|
|
"learning_rate": 1.7775449822187306e-05,
|
|
"epoch": 1.6629526462395543,
|
|
"step": 299
|
|
},
|
|
{
|
|
"loss": 1.4669,
|
|
"grad_norm": 4.494915962219238,
|
|
"learning_rate": 1.7762302300129472e-05,
|
|
"epoch": 1.668523676880223,
|
|
"step": 300
|
|
},
|
|
{
|
|
"eval_loss": 2.3116097450256348,
|
|
"eval_runtime": 35.9294,
|
|
"eval_samples_per_second": 39.967,
|
|
"eval_steps_per_second": 2.004,
|
|
"epoch": 1.668523676880223,
|
|
"step": 300
|
|
},
|
|
{
|
|
"loss": 1.3418,
|
|
"grad_norm": 4.365106582641602,
|
|
"learning_rate": 1.774915477807164e-05,
|
|
"epoch": 1.6740947075208914,
|
|
"step": 301
|
|
},
|
|
{
|
|
"loss": 1.4561,
|
|
"grad_norm": 4.683363914489746,
|
|
"learning_rate": 1.773600725601381e-05,
|
|
"epoch": 1.6796657381615598,
|
|
"step": 302
|
|
},
|
|
{
|
|
"loss": 1.8321,
|
|
"grad_norm": 4.195693492889404,
|
|
"learning_rate": 1.772285973395598e-05,
|
|
"epoch": 1.6852367688022283,
|
|
"step": 303
|
|
},
|
|
{
|
|
"loss": 1.8932,
|
|
"grad_norm": 4.681265830993652,
|
|
"learning_rate": 1.770971221189815e-05,
|
|
"epoch": 1.690807799442897,
|
|
"step": 304
|
|
},
|
|
{
|
|
"loss": 2.0071,
|
|
"grad_norm": 5.034351348876953,
|
|
"learning_rate": 1.769656468984032e-05,
|
|
"epoch": 1.6963788300835656,
|
|
"step": 305
|
|
},
|
|
{
|
|
"loss": 1.9824,
|
|
"grad_norm": 3.9581334590911865,
|
|
"learning_rate": 1.768341716778249e-05,
|
|
"epoch": 1.701949860724234,
|
|
"step": 306
|
|
},
|
|
{
|
|
"loss": 2.2225,
|
|
"grad_norm": 3.9467825889587402,
|
|
"learning_rate": 1.7670269645724655e-05,
|
|
"epoch": 1.7075208913649025,
|
|
"step": 307
|
|
},
|
|
{
|
|
"loss": 1.671,
|
|
"grad_norm": 3.7253997325897217,
|
|
"learning_rate": 1.7657122123666825e-05,
|
|
"epoch": 1.713091922005571,
|
|
"step": 308
|
|
},
|
|
{
|
|
"loss": 1.7876,
|
|
"grad_norm": 4.8212480545043945,
|
|
"learning_rate": 1.7643974601608998e-05,
|
|
"epoch": 1.7186629526462396,
|
|
"step": 309
|
|
},
|
|
{
|
|
"loss": 1.1102,
|
|
"grad_norm": 4.235992431640625,
|
|
"learning_rate": 1.7630827079551164e-05,
|
|
"epoch": 1.724233983286908,
|
|
"step": 310
|
|
},
|
|
{
|
|
"loss": 1.7577,
|
|
"grad_norm": 3.5870513916015625,
|
|
"learning_rate": 1.7617679557493333e-05,
|
|
"epoch": 1.7298050139275767,
|
|
"step": 311
|
|
},
|
|
{
|
|
"loss": 1.3948,
|
|
"grad_norm": 4.27365779876709,
|
|
"learning_rate": 1.7604532035435503e-05,
|
|
"epoch": 1.7353760445682451,
|
|
"step": 312
|
|
},
|
|
{
|
|
"loss": 1.5507,
|
|
"grad_norm": 4.927708625793457,
|
|
"learning_rate": 1.7591384513377672e-05,
|
|
"epoch": 1.7409470752089136,
|
|
"step": 313
|
|
},
|
|
{
|
|
"loss": 1.5299,
|
|
"grad_norm": 4.702437877655029,
|
|
"learning_rate": 1.7578236991319842e-05,
|
|
"epoch": 1.7465181058495822,
|
|
"step": 314
|
|
},
|
|
{
|
|
"loss": 1.6187,
|
|
"grad_norm": 4.205385684967041,
|
|
"learning_rate": 1.756508946926201e-05,
|
|
"epoch": 1.7520891364902507,
|
|
"step": 315
|
|
},
|
|
{
|
|
"loss": 1.6467,
|
|
"grad_norm": 3.724274158477783,
|
|
"learning_rate": 1.755194194720418e-05,
|
|
"epoch": 1.7576601671309193,
|
|
"step": 316
|
|
},
|
|
{
|
|
"loss": 1.48,
|
|
"grad_norm": 5.0788187980651855,
|
|
"learning_rate": 1.7538794425146347e-05,
|
|
"epoch": 1.7632311977715878,
|
|
"step": 317
|
|
},
|
|
{
|
|
"loss": 1.2413,
|
|
"grad_norm": 4.211026191711426,
|
|
"learning_rate": 1.7525646903088517e-05,
|
|
"epoch": 1.7688022284122562,
|
|
"step": 318
|
|
},
|
|
{
|
|
"loss": 1.2792,
|
|
"grad_norm": 4.383068561553955,
|
|
"learning_rate": 1.7512499381030686e-05,
|
|
"epoch": 1.7743732590529246,
|
|
"step": 319
|
|
},
|
|
{
|
|
"loss": 2.0635,
|
|
"grad_norm": 5.2455668449401855,
|
|
"learning_rate": 1.7499351858972856e-05,
|
|
"epoch": 1.7799442896935933,
|
|
"step": 320
|
|
},
|
|
{
|
|
"loss": 1.9011,
|
|
"grad_norm": 4.73854398727417,
|
|
"learning_rate": 1.7486204336915025e-05,
|
|
"epoch": 1.785515320334262,
|
|
"step": 321
|
|
},
|
|
{
|
|
"loss": 1.9017,
|
|
"grad_norm": 5.136256217956543,
|
|
"learning_rate": 1.7473056814857195e-05,
|
|
"epoch": 1.7910863509749304,
|
|
"step": 322
|
|
},
|
|
{
|
|
"loss": 1.7304,
|
|
"grad_norm": 5.707761764526367,
|
|
"learning_rate": 1.7459909292799364e-05,
|
|
"epoch": 1.7966573816155988,
|
|
"step": 323
|
|
},
|
|
{
|
|
"loss": 1.9703,
|
|
"grad_norm": 4.81571102142334,
|
|
"learning_rate": 1.744676177074153e-05,
|
|
"epoch": 1.8022284122562673,
|
|
"step": 324
|
|
},
|
|
{
|
|
"loss": 1.6825,
|
|
"grad_norm": 6.157602310180664,
|
|
"learning_rate": 1.7433614248683703e-05,
|
|
"epoch": 1.807799442896936,
|
|
"step": 325
|
|
},
|
|
{
|
|
"loss": 1.7945,
|
|
"grad_norm": 5.200462818145752,
|
|
"learning_rate": 1.7420466726625873e-05,
|
|
"epoch": 1.8133704735376046,
|
|
"step": 326
|
|
},
|
|
{
|
|
"loss": 1.7701,
|
|
"grad_norm": 5.342528820037842,
|
|
"learning_rate": 1.7407319204568042e-05,
|
|
"epoch": 1.818941504178273,
|
|
"step": 327
|
|
},
|
|
{
|
|
"loss": 1.8,
|
|
"grad_norm": 4.419646739959717,
|
|
"learning_rate": 1.739417168251021e-05,
|
|
"epoch": 1.8245125348189415,
|
|
"step": 328
|
|
},
|
|
{
|
|
"loss": 1.3064,
|
|
"grad_norm": 5.106484889984131,
|
|
"learning_rate": 1.7381024160452378e-05,
|
|
"epoch": 1.83008356545961,
|
|
"step": 329
|
|
},
|
|
{
|
|
"loss": 2.0357,
|
|
"grad_norm": 4.221576690673828,
|
|
"learning_rate": 1.7367876638394547e-05,
|
|
"epoch": 1.8356545961002786,
|
|
"step": 330
|
|
},
|
|
{
|
|
"loss": 1.6015,
|
|
"grad_norm": 6.323553562164307,
|
|
"learning_rate": 1.7354729116336717e-05,
|
|
"epoch": 1.841225626740947,
|
|
"step": 331
|
|
},
|
|
{
|
|
"loss": 1.5858,
|
|
"grad_norm": 4.978970527648926,
|
|
"learning_rate": 1.7341581594278887e-05,
|
|
"epoch": 1.8467966573816157,
|
|
"step": 332
|
|
},
|
|
{
|
|
"loss": 1.9489,
|
|
"grad_norm": 3.1882030963897705,
|
|
"learning_rate": 1.7328434072221056e-05,
|
|
"epoch": 1.8523676880222841,
|
|
"step": 333
|
|
},
|
|
{
|
|
"loss": 2.1722,
|
|
"grad_norm": 4.047868251800537,
|
|
"learning_rate": 1.7315286550163222e-05,
|
|
"epoch": 1.8579387186629526,
|
|
"step": 334
|
|
},
|
|
{
|
|
"loss": 1.5027,
|
|
"grad_norm": 4.2307448387146,
|
|
"learning_rate": 1.730213902810539e-05,
|
|
"epoch": 1.863509749303621,
|
|
"step": 335
|
|
},
|
|
{
|
|
"loss": 1.481,
|
|
"grad_norm": 6.048774242401123,
|
|
"learning_rate": 1.7288991506047565e-05,
|
|
"epoch": 1.8690807799442897,
|
|
"step": 336
|
|
},
|
|
{
|
|
"loss": 1.8746,
|
|
"grad_norm": 5.389241695404053,
|
|
"learning_rate": 1.7275843983989734e-05,
|
|
"epoch": 1.8746518105849583,
|
|
"step": 337
|
|
},
|
|
{
|
|
"loss": 2.0807,
|
|
"grad_norm": 4.036198139190674,
|
|
"learning_rate": 1.72626964619319e-05,
|
|
"epoch": 1.8802228412256268,
|
|
"step": 338
|
|
},
|
|
{
|
|
"loss": 1.7448,
|
|
"grad_norm": 5.005743503570557,
|
|
"learning_rate": 1.724954893987407e-05,
|
|
"epoch": 1.8857938718662952,
|
|
"step": 339
|
|
},
|
|
{
|
|
"loss": 1.9092,
|
|
"grad_norm": 4.462837219238281,
|
|
"learning_rate": 1.723640141781624e-05,
|
|
"epoch": 1.8913649025069637,
|
|
"step": 340
|
|
},
|
|
{
|
|
"loss": 1.7032,
|
|
"grad_norm": 4.945067405700684,
|
|
"learning_rate": 1.722325389575841e-05,
|
|
"epoch": 1.8969359331476323,
|
|
"step": 341
|
|
},
|
|
{
|
|
"loss": 1.9141,
|
|
"grad_norm": 3.7232062816619873,
|
|
"learning_rate": 1.721010637370058e-05,
|
|
"epoch": 1.902506963788301,
|
|
"step": 342
|
|
},
|
|
{
|
|
"loss": 1.8258,
|
|
"grad_norm": 3.8830628395080566,
|
|
"learning_rate": 1.7196958851642748e-05,
|
|
"epoch": 1.9080779944289694,
|
|
"step": 343
|
|
},
|
|
{
|
|
"loss": 1.7998,
|
|
"grad_norm": 4.693456649780273,
|
|
"learning_rate": 1.7183811329584917e-05,
|
|
"epoch": 1.9136490250696379,
|
|
"step": 344
|
|
},
|
|
{
|
|
"loss": 2.0583,
|
|
"grad_norm": 4.737421989440918,
|
|
"learning_rate": 1.7170663807527083e-05,
|
|
"epoch": 1.9192200557103063,
|
|
"step": 345
|
|
},
|
|
{
|
|
"loss": 1.494,
|
|
"grad_norm": 2.78582501411438,
|
|
"learning_rate": 1.7157516285469253e-05,
|
|
"epoch": 1.924791086350975,
|
|
"step": 346
|
|
},
|
|
{
|
|
"loss": 1.7167,
|
|
"grad_norm": 4.305075168609619,
|
|
"learning_rate": 1.7144368763411423e-05,
|
|
"epoch": 1.9303621169916436,
|
|
"step": 347
|
|
},
|
|
{
|
|
"loss": 1.7753,
|
|
"grad_norm": 3.9957072734832764,
|
|
"learning_rate": 1.7131221241353592e-05,
|
|
"epoch": 1.935933147632312,
|
|
"step": 348
|
|
},
|
|
{
|
|
"loss": 1.8852,
|
|
"grad_norm": 4.9537434577941895,
|
|
"learning_rate": 1.711807371929576e-05,
|
|
"epoch": 1.9415041782729805,
|
|
"step": 349
|
|
},
|
|
{
|
|
"loss": 1.8729,
|
|
"grad_norm": 3.9404208660125732,
|
|
"learning_rate": 1.710492619723793e-05,
|
|
"epoch": 1.947075208913649,
|
|
"step": 350
|
|
},
|
|
{
|
|
"eval_loss": 2.3213632106781006,
|
|
"eval_runtime": 35.9387,
|
|
"eval_samples_per_second": 39.957,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 1.947075208913649,
|
|
"step": 350
|
|
},
|
|
{
|
|
"loss": 2.1419,
|
|
"grad_norm": 3.202141046524048,
|
|
"learning_rate": 1.70917786751801e-05,
|
|
"epoch": 1.9526462395543176,
|
|
"step": 351
|
|
},
|
|
{
|
|
"loss": 1.83,
|
|
"grad_norm": 4.432948112487793,
|
|
"learning_rate": 1.707863115312227e-05,
|
|
"epoch": 1.958217270194986,
|
|
"step": 352
|
|
},
|
|
{
|
|
"loss": 2.3556,
|
|
"grad_norm": 5.213648796081543,
|
|
"learning_rate": 1.706548363106444e-05,
|
|
"epoch": 1.9637883008356547,
|
|
"step": 353
|
|
},
|
|
{
|
|
"loss": 2.1396,
|
|
"grad_norm": 4.155479431152344,
|
|
"learning_rate": 1.705233610900661e-05,
|
|
"epoch": 1.9693593314763231,
|
|
"step": 354
|
|
},
|
|
{
|
|
"loss": 1.4222,
|
|
"grad_norm": 5.146358013153076,
|
|
"learning_rate": 1.7039188586948775e-05,
|
|
"epoch": 1.9749303621169916,
|
|
"step": 355
|
|
},
|
|
{
|
|
"loss": 1.9362,
|
|
"grad_norm": 3.264761447906494,
|
|
"learning_rate": 1.7026041064890945e-05,
|
|
"epoch": 1.98050139275766,
|
|
"step": 356
|
|
},
|
|
{
|
|
"loss": 1.9471,
|
|
"grad_norm": 3.308243989944458,
|
|
"learning_rate": 1.7012893542833114e-05,
|
|
"epoch": 1.9860724233983287,
|
|
"step": 357
|
|
},
|
|
{
|
|
"loss": 2.0193,
|
|
"grad_norm": 4.1630859375,
|
|
"learning_rate": 1.6999746020775284e-05,
|
|
"epoch": 1.9916434540389973,
|
|
"step": 358
|
|
},
|
|
{
|
|
"loss": 2.1048,
|
|
"grad_norm": 4.196152210235596,
|
|
"learning_rate": 1.6986598498717453e-05,
|
|
"epoch": 1.9972144846796658,
|
|
"step": 359
|
|
},
|
|
{
|
|
"loss": 2.0755,
|
|
"grad_norm": 4.194087028503418,
|
|
"learning_rate": 1.6973450976659623e-05,
|
|
"epoch": 2.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"loss": 1.4388,
|
|
"grad_norm": 4.208454132080078,
|
|
"learning_rate": 1.6960303454601792e-05,
|
|
"epoch": 2.0055710306406684,
|
|
"step": 361
|
|
},
|
|
{
|
|
"loss": 1.7819,
|
|
"grad_norm": 3.549447774887085,
|
|
"learning_rate": 1.694715593254396e-05,
|
|
"epoch": 2.011142061281337,
|
|
"step": 362
|
|
},
|
|
{
|
|
"loss": 1.5135,
|
|
"grad_norm": 3.6767420768737793,
|
|
"learning_rate": 1.693400841048613e-05,
|
|
"epoch": 2.0167130919220058,
|
|
"step": 363
|
|
},
|
|
{
|
|
"loss": 1.7713,
|
|
"grad_norm": 3.816209554672241,
|
|
"learning_rate": 1.69208608884283e-05,
|
|
"epoch": 2.022284122562674,
|
|
"step": 364
|
|
},
|
|
{
|
|
"loss": 1.6624,
|
|
"grad_norm": 3.2220561504364014,
|
|
"learning_rate": 1.6907713366370467e-05,
|
|
"epoch": 2.0278551532033426,
|
|
"step": 365
|
|
},
|
|
{
|
|
"loss": 1.9059,
|
|
"grad_norm": 3.4210987091064453,
|
|
"learning_rate": 1.6894565844312637e-05,
|
|
"epoch": 2.033426183844011,
|
|
"step": 366
|
|
},
|
|
{
|
|
"loss": 1.155,
|
|
"grad_norm": 4.348776817321777,
|
|
"learning_rate": 1.6881418322254806e-05,
|
|
"epoch": 2.0389972144846795,
|
|
"step": 367
|
|
},
|
|
{
|
|
"loss": 1.4513,
|
|
"grad_norm": 4.143118858337402,
|
|
"learning_rate": 1.6868270800196976e-05,
|
|
"epoch": 2.0445682451253484,
|
|
"step": 368
|
|
},
|
|
{
|
|
"loss": 1.8148,
|
|
"grad_norm": 4.118925094604492,
|
|
"learning_rate": 1.6855123278139145e-05,
|
|
"epoch": 2.050139275766017,
|
|
"step": 369
|
|
},
|
|
{
|
|
"loss": 1.6325,
|
|
"grad_norm": 4.060324668884277,
|
|
"learning_rate": 1.6841975756081315e-05,
|
|
"epoch": 2.0557103064066853,
|
|
"step": 370
|
|
},
|
|
{
|
|
"loss": 1.694,
|
|
"grad_norm": 4.604481220245361,
|
|
"learning_rate": 1.6828828234023484e-05,
|
|
"epoch": 2.0612813370473537,
|
|
"step": 371
|
|
},
|
|
{
|
|
"loss": 1.4905,
|
|
"grad_norm": 5.273688316345215,
|
|
"learning_rate": 1.681568071196565e-05,
|
|
"epoch": 2.066852367688022,
|
|
"step": 372
|
|
},
|
|
{
|
|
"loss": 1.1557,
|
|
"grad_norm": 6.0254387855529785,
|
|
"learning_rate": 1.680253318990782e-05,
|
|
"epoch": 2.0724233983286906,
|
|
"step": 373
|
|
},
|
|
{
|
|
"loss": 0.999,
|
|
"grad_norm": 5.017882823944092,
|
|
"learning_rate": 1.678938566784999e-05,
|
|
"epoch": 2.0779944289693595,
|
|
"step": 374
|
|
},
|
|
{
|
|
"loss": 1.4159,
|
|
"grad_norm": 6.874935626983643,
|
|
"learning_rate": 1.6776238145792162e-05,
|
|
"epoch": 2.083565459610028,
|
|
"step": 375
|
|
},
|
|
{
|
|
"loss": 0.9789,
|
|
"grad_norm": 6.245709419250488,
|
|
"learning_rate": 1.676309062373433e-05,
|
|
"epoch": 2.0891364902506964,
|
|
"step": 376
|
|
},
|
|
{
|
|
"loss": 1.3929,
|
|
"grad_norm": 6.976832866668701,
|
|
"learning_rate": 1.6749943101676498e-05,
|
|
"epoch": 2.094707520891365,
|
|
"step": 377
|
|
},
|
|
{
|
|
"loss": 1.5721,
|
|
"grad_norm": 7.426636695861816,
|
|
"learning_rate": 1.6736795579618668e-05,
|
|
"epoch": 2.1002785515320332,
|
|
"step": 378
|
|
},
|
|
{
|
|
"loss": 1.4603,
|
|
"grad_norm": 8.876333236694336,
|
|
"learning_rate": 1.6723648057560837e-05,
|
|
"epoch": 2.105849582172702,
|
|
"step": 379
|
|
},
|
|
{
|
|
"loss": 1.2115,
|
|
"grad_norm": 5.889682769775391,
|
|
"learning_rate": 1.6710500535503007e-05,
|
|
"epoch": 2.1114206128133706,
|
|
"step": 380
|
|
},
|
|
{
|
|
"loss": 1.1689,
|
|
"grad_norm": 6.435322284698486,
|
|
"learning_rate": 1.6697353013445176e-05,
|
|
"epoch": 2.116991643454039,
|
|
"step": 381
|
|
},
|
|
{
|
|
"loss": 1.1904,
|
|
"grad_norm": 6.061446666717529,
|
|
"learning_rate": 1.6684205491387342e-05,
|
|
"epoch": 2.1225626740947074,
|
|
"step": 382
|
|
},
|
|
{
|
|
"loss": 1.3799,
|
|
"grad_norm": 7.56770658493042,
|
|
"learning_rate": 1.6671057969329512e-05,
|
|
"epoch": 2.128133704735376,
|
|
"step": 383
|
|
},
|
|
{
|
|
"loss": 1.5787,
|
|
"grad_norm": 8.942233085632324,
|
|
"learning_rate": 1.665791044727168e-05,
|
|
"epoch": 2.1337047353760448,
|
|
"step": 384
|
|
},
|
|
{
|
|
"loss": 1.4084,
|
|
"grad_norm": 7.448763847351074,
|
|
"learning_rate": 1.664476292521385e-05,
|
|
"epoch": 2.139275766016713,
|
|
"step": 385
|
|
},
|
|
{
|
|
"loss": 1.3685,
|
|
"grad_norm": 5.792154312133789,
|
|
"learning_rate": 1.663161540315602e-05,
|
|
"epoch": 2.1448467966573816,
|
|
"step": 386
|
|
},
|
|
{
|
|
"loss": 1.5465,
|
|
"grad_norm": 7.226157188415527,
|
|
"learning_rate": 1.661846788109819e-05,
|
|
"epoch": 2.15041782729805,
|
|
"step": 387
|
|
},
|
|
{
|
|
"loss": 1.1914,
|
|
"grad_norm": 5.6042022705078125,
|
|
"learning_rate": 1.660532035904036e-05,
|
|
"epoch": 2.1559888579387185,
|
|
"step": 388
|
|
},
|
|
{
|
|
"loss": 1.6443,
|
|
"grad_norm": 5.619427680969238,
|
|
"learning_rate": 1.6592172836982525e-05,
|
|
"epoch": 2.1615598885793874,
|
|
"step": 389
|
|
},
|
|
{
|
|
"loss": 1.5371,
|
|
"grad_norm": 4.770148754119873,
|
|
"learning_rate": 1.65790253149247e-05,
|
|
"epoch": 2.167130919220056,
|
|
"step": 390
|
|
},
|
|
{
|
|
"loss": 1.5124,
|
|
"grad_norm": 7.61703634262085,
|
|
"learning_rate": 1.6565877792866868e-05,
|
|
"epoch": 2.1727019498607243,
|
|
"step": 391
|
|
},
|
|
{
|
|
"loss": 1.6248,
|
|
"grad_norm": 4.498234272003174,
|
|
"learning_rate": 1.6552730270809037e-05,
|
|
"epoch": 2.1782729805013927,
|
|
"step": 392
|
|
},
|
|
{
|
|
"loss": 1.4621,
|
|
"grad_norm": 4.0563063621521,
|
|
"learning_rate": 1.6539582748751204e-05,
|
|
"epoch": 2.183844011142061,
|
|
"step": 393
|
|
},
|
|
{
|
|
"loss": 1.4315,
|
|
"grad_norm": 6.069952964782715,
|
|
"learning_rate": 1.6526435226693373e-05,
|
|
"epoch": 2.1894150417827296,
|
|
"step": 394
|
|
},
|
|
{
|
|
"loss": 1.4308,
|
|
"grad_norm": 6.728673458099365,
|
|
"learning_rate": 1.6513287704635543e-05,
|
|
"epoch": 2.1949860724233985,
|
|
"step": 395
|
|
},
|
|
{
|
|
"loss": 1.2975,
|
|
"grad_norm": 14.551620483398438,
|
|
"learning_rate": 1.6500140182577712e-05,
|
|
"epoch": 2.200557103064067,
|
|
"step": 396
|
|
},
|
|
{
|
|
"loss": 1.4624,
|
|
"grad_norm": 6.782831192016602,
|
|
"learning_rate": 1.648699266051988e-05,
|
|
"epoch": 2.2061281337047354,
|
|
"step": 397
|
|
},
|
|
{
|
|
"loss": 1.5891,
|
|
"grad_norm": 6.513261795043945,
|
|
"learning_rate": 1.647384513846205e-05,
|
|
"epoch": 2.211699164345404,
|
|
"step": 398
|
|
},
|
|
{
|
|
"loss": 1.3152,
|
|
"grad_norm": 6.3476433753967285,
|
|
"learning_rate": 1.646069761640422e-05,
|
|
"epoch": 2.2172701949860723,
|
|
"step": 399
|
|
},
|
|
{
|
|
"loss": 1.3129,
|
|
"grad_norm": 4.936390399932861,
|
|
"learning_rate": 1.6447550094346387e-05,
|
|
"epoch": 2.222841225626741,
|
|
"step": 400
|
|
},
|
|
{
|
|
"eval_loss": 2.531832218170166,
|
|
"eval_runtime": 35.95,
|
|
"eval_samples_per_second": 39.944,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 2.222841225626741,
|
|
"step": 400
|
|
},
|
|
{
|
|
"loss": 1.2283,
|
|
"grad_norm": 8.302631378173828,
|
|
"learning_rate": 1.6434402572288556e-05,
|
|
"epoch": 2.2284122562674096,
|
|
"step": 401
|
|
},
|
|
{
|
|
"loss": 1.1884,
|
|
"grad_norm": 5.8890886306762695,
|
|
"learning_rate": 1.642125505023073e-05,
|
|
"epoch": 2.233983286908078,
|
|
"step": 402
|
|
},
|
|
{
|
|
"loss": 1.3971,
|
|
"grad_norm": 6.417287349700928,
|
|
"learning_rate": 1.6408107528172895e-05,
|
|
"epoch": 2.2395543175487465,
|
|
"step": 403
|
|
},
|
|
{
|
|
"loss": 1.5501,
|
|
"grad_norm": 6.351545810699463,
|
|
"learning_rate": 1.6394960006115065e-05,
|
|
"epoch": 2.245125348189415,
|
|
"step": 404
|
|
},
|
|
{
|
|
"loss": 1.1685,
|
|
"grad_norm": 5.121798992156982,
|
|
"learning_rate": 1.6381812484057234e-05,
|
|
"epoch": 2.2506963788300833,
|
|
"step": 405
|
|
},
|
|
{
|
|
"loss": 1.3617,
|
|
"grad_norm": 5.293002128601074,
|
|
"learning_rate": 1.6368664961999404e-05,
|
|
"epoch": 2.256267409470752,
|
|
"step": 406
|
|
},
|
|
{
|
|
"loss": 1.3164,
|
|
"grad_norm": 6.6434431076049805,
|
|
"learning_rate": 1.6355517439941573e-05,
|
|
"epoch": 2.2618384401114207,
|
|
"step": 407
|
|
},
|
|
{
|
|
"loss": 1.4339,
|
|
"grad_norm": 6.383541584014893,
|
|
"learning_rate": 1.6342369917883743e-05,
|
|
"epoch": 2.267409470752089,
|
|
"step": 408
|
|
},
|
|
{
|
|
"loss": 1.3699,
|
|
"grad_norm": 5.989224433898926,
|
|
"learning_rate": 1.6329222395825913e-05,
|
|
"epoch": 2.2729805013927575,
|
|
"step": 409
|
|
},
|
|
{
|
|
"loss": 1.4938,
|
|
"grad_norm": 6.49315881729126,
|
|
"learning_rate": 1.631607487376808e-05,
|
|
"epoch": 2.2785515320334264,
|
|
"step": 410
|
|
},
|
|
{
|
|
"loss": 1.0902,
|
|
"grad_norm": 4.942923069000244,
|
|
"learning_rate": 1.6302927351710248e-05,
|
|
"epoch": 2.284122562674095,
|
|
"step": 411
|
|
},
|
|
{
|
|
"loss": 1.0282,
|
|
"grad_norm": 5.219899654388428,
|
|
"learning_rate": 1.6289779829652418e-05,
|
|
"epoch": 2.2896935933147633,
|
|
"step": 412
|
|
},
|
|
{
|
|
"loss": 1.3465,
|
|
"grad_norm": 5.91557502746582,
|
|
"learning_rate": 1.6276632307594587e-05,
|
|
"epoch": 2.2952646239554317,
|
|
"step": 413
|
|
},
|
|
{
|
|
"loss": 1.4312,
|
|
"grad_norm": 7.332894325256348,
|
|
"learning_rate": 1.6263484785536757e-05,
|
|
"epoch": 2.3008356545961,
|
|
"step": 414
|
|
},
|
|
{
|
|
"loss": 1.1921,
|
|
"grad_norm": 6.784351825714111,
|
|
"learning_rate": 1.6250337263478926e-05,
|
|
"epoch": 2.3064066852367686,
|
|
"step": 415
|
|
},
|
|
{
|
|
"loss": 1.3644,
|
|
"grad_norm": 6.222668647766113,
|
|
"learning_rate": 1.6237189741421096e-05,
|
|
"epoch": 2.3119777158774375,
|
|
"step": 416
|
|
},
|
|
{
|
|
"loss": 1.3318,
|
|
"grad_norm": 6.7379841804504395,
|
|
"learning_rate": 1.6224042219363265e-05,
|
|
"epoch": 2.317548746518106,
|
|
"step": 417
|
|
},
|
|
{
|
|
"loss": 1.3955,
|
|
"grad_norm": 7.218482494354248,
|
|
"learning_rate": 1.6210894697305435e-05,
|
|
"epoch": 2.3231197771587744,
|
|
"step": 418
|
|
},
|
|
{
|
|
"loss": 1.5949,
|
|
"grad_norm": 6.676080226898193,
|
|
"learning_rate": 1.6197747175247604e-05,
|
|
"epoch": 2.328690807799443,
|
|
"step": 419
|
|
},
|
|
{
|
|
"loss": 1.2428,
|
|
"grad_norm": 6.974861145019531,
|
|
"learning_rate": 1.618459965318977e-05,
|
|
"epoch": 2.3342618384401113,
|
|
"step": 420
|
|
},
|
|
{
|
|
"loss": 1.2438,
|
|
"grad_norm": 7.018064975738525,
|
|
"learning_rate": 1.617145213113194e-05,
|
|
"epoch": 2.33983286908078,
|
|
"step": 421
|
|
},
|
|
{
|
|
"loss": 1.4979,
|
|
"grad_norm": 6.781156063079834,
|
|
"learning_rate": 1.615830460907411e-05,
|
|
"epoch": 2.3454038997214486,
|
|
"step": 422
|
|
},
|
|
{
|
|
"loss": 1.4914,
|
|
"grad_norm": 6.291943550109863,
|
|
"learning_rate": 1.614515708701628e-05,
|
|
"epoch": 2.350974930362117,
|
|
"step": 423
|
|
},
|
|
{
|
|
"loss": 1.1937,
|
|
"grad_norm": 6.769220352172852,
|
|
"learning_rate": 1.613200956495845e-05,
|
|
"epoch": 2.3565459610027855,
|
|
"step": 424
|
|
},
|
|
{
|
|
"loss": 1.4428,
|
|
"grad_norm": 7.461434841156006,
|
|
"learning_rate": 1.6118862042900618e-05,
|
|
"epoch": 2.362116991643454,
|
|
"step": 425
|
|
},
|
|
{
|
|
"loss": 1.0756,
|
|
"grad_norm": 5.971315860748291,
|
|
"learning_rate": 1.6105714520842788e-05,
|
|
"epoch": 2.3676880222841223,
|
|
"step": 426
|
|
},
|
|
{
|
|
"loss": 1.1709,
|
|
"grad_norm": 6.632075786590576,
|
|
"learning_rate": 1.6092566998784954e-05,
|
|
"epoch": 2.3732590529247912,
|
|
"step": 427
|
|
},
|
|
{
|
|
"loss": 1.2953,
|
|
"grad_norm": 6.03197717666626,
|
|
"learning_rate": 1.6079419476727123e-05,
|
|
"epoch": 2.3788300835654597,
|
|
"step": 428
|
|
},
|
|
{
|
|
"loss": 1.1653,
|
|
"grad_norm": 7.393289089202881,
|
|
"learning_rate": 1.6066271954669296e-05,
|
|
"epoch": 2.384401114206128,
|
|
"step": 429
|
|
},
|
|
{
|
|
"loss": 1.542,
|
|
"grad_norm": 9.518671989440918,
|
|
"learning_rate": 1.6053124432611462e-05,
|
|
"epoch": 2.3899721448467965,
|
|
"step": 430
|
|
},
|
|
{
|
|
"loss": 1.0957,
|
|
"grad_norm": 7.086347579956055,
|
|
"learning_rate": 1.6039976910553632e-05,
|
|
"epoch": 2.3955431754874654,
|
|
"step": 431
|
|
},
|
|
{
|
|
"loss": 1.1408,
|
|
"grad_norm": 5.21544885635376,
|
|
"learning_rate": 1.60268293884958e-05,
|
|
"epoch": 2.401114206128134,
|
|
"step": 432
|
|
},
|
|
{
|
|
"loss": 1.1708,
|
|
"grad_norm": 7.537359237670898,
|
|
"learning_rate": 1.601368186643797e-05,
|
|
"epoch": 2.4066852367688023,
|
|
"step": 433
|
|
},
|
|
{
|
|
"loss": 1.101,
|
|
"grad_norm": 4.926475524902344,
|
|
"learning_rate": 1.600053434438014e-05,
|
|
"epoch": 2.4122562674094707,
|
|
"step": 434
|
|
},
|
|
{
|
|
"loss": 1.3898,
|
|
"grad_norm": 5.6016740798950195,
|
|
"learning_rate": 1.598738682232231e-05,
|
|
"epoch": 2.417827298050139,
|
|
"step": 435
|
|
},
|
|
{
|
|
"loss": 1.4717,
|
|
"grad_norm": 7.16878604888916,
|
|
"learning_rate": 1.597423930026448e-05,
|
|
"epoch": 2.4233983286908076,
|
|
"step": 436
|
|
},
|
|
{
|
|
"loss": 1.6173,
|
|
"grad_norm": 6.310802459716797,
|
|
"learning_rate": 1.5961091778206646e-05,
|
|
"epoch": 2.4289693593314765,
|
|
"step": 437
|
|
},
|
|
{
|
|
"loss": 1.6172,
|
|
"grad_norm": 8.035069465637207,
|
|
"learning_rate": 1.5947944256148815e-05,
|
|
"epoch": 2.434540389972145,
|
|
"step": 438
|
|
},
|
|
{
|
|
"loss": 1.4479,
|
|
"grad_norm": 7.806406497955322,
|
|
"learning_rate": 1.5934796734090985e-05,
|
|
"epoch": 2.4401114206128134,
|
|
"step": 439
|
|
},
|
|
{
|
|
"loss": 1.3459,
|
|
"grad_norm": 5.882315635681152,
|
|
"learning_rate": 1.5921649212033154e-05,
|
|
"epoch": 2.445682451253482,
|
|
"step": 440
|
|
},
|
|
{
|
|
"loss": 1.2195,
|
|
"grad_norm": 5.817505359649658,
|
|
"learning_rate": 1.5908501689975324e-05,
|
|
"epoch": 2.4512534818941503,
|
|
"step": 441
|
|
},
|
|
{
|
|
"loss": 1.3043,
|
|
"grad_norm": 7.497400283813477,
|
|
"learning_rate": 1.5895354167917493e-05,
|
|
"epoch": 2.456824512534819,
|
|
"step": 442
|
|
},
|
|
{
|
|
"loss": 1.42,
|
|
"grad_norm": 5.955392837524414,
|
|
"learning_rate": 1.5882206645859663e-05,
|
|
"epoch": 2.4623955431754876,
|
|
"step": 443
|
|
},
|
|
{
|
|
"loss": 1.4764,
|
|
"grad_norm": 8.848158836364746,
|
|
"learning_rate": 1.5869059123801832e-05,
|
|
"epoch": 2.467966573816156,
|
|
"step": 444
|
|
},
|
|
{
|
|
"loss": 1.4508,
|
|
"grad_norm": 6.384143829345703,
|
|
"learning_rate": 1.5855911601744002e-05,
|
|
"epoch": 2.4735376044568245,
|
|
"step": 445
|
|
},
|
|
{
|
|
"loss": 1.3499,
|
|
"grad_norm": 7.251498699188232,
|
|
"learning_rate": 1.584276407968617e-05,
|
|
"epoch": 2.479108635097493,
|
|
"step": 446
|
|
},
|
|
{
|
|
"loss": 1.297,
|
|
"grad_norm": 8.700945854187012,
|
|
"learning_rate": 1.5829616557628337e-05,
|
|
"epoch": 2.4846796657381613,
|
|
"step": 447
|
|
},
|
|
{
|
|
"loss": 1.1607,
|
|
"grad_norm": 8.17098617553711,
|
|
"learning_rate": 1.5816469035570507e-05,
|
|
"epoch": 2.4902506963788302,
|
|
"step": 448
|
|
},
|
|
{
|
|
"loss": 1.5328,
|
|
"grad_norm": 6.918285846710205,
|
|
"learning_rate": 1.5803321513512676e-05,
|
|
"epoch": 2.4958217270194987,
|
|
"step": 449
|
|
},
|
|
{
|
|
"loss": 1.6258,
|
|
"grad_norm": 6.7390851974487305,
|
|
"learning_rate": 1.5790173991454846e-05,
|
|
"epoch": 2.501392757660167,
|
|
"step": 450
|
|
},
|
|
{
|
|
"eval_loss": 2.571645498275757,
|
|
"eval_runtime": 35.9556,
|
|
"eval_samples_per_second": 39.938,
|
|
"eval_steps_per_second": 2.002,
|
|
"epoch": 2.501392757660167,
|
|
"step": 450
|
|
},
|
|
{
|
|
"loss": 1.5923,
|
|
"grad_norm": 6.522182941436768,
|
|
"learning_rate": 1.5777026469397015e-05,
|
|
"epoch": 2.5069637883008355,
|
|
"step": 451
|
|
},
|
|
{
|
|
"loss": 1.2816,
|
|
"grad_norm": 5.984560489654541,
|
|
"learning_rate": 1.5763878947339185e-05,
|
|
"epoch": 2.5125348189415044,
|
|
"step": 452
|
|
},
|
|
{
|
|
"loss": 1.2029,
|
|
"grad_norm": 8.060498237609863,
|
|
"learning_rate": 1.5750731425281354e-05,
|
|
"epoch": 2.518105849582173,
|
|
"step": 453
|
|
},
|
|
{
|
|
"loss": 1.2117,
|
|
"grad_norm": 6.93899393081665,
|
|
"learning_rate": 1.573758390322352e-05,
|
|
"epoch": 2.5236768802228413,
|
|
"step": 454
|
|
},
|
|
{
|
|
"loss": 1.4347,
|
|
"grad_norm": 6.21560525894165,
|
|
"learning_rate": 1.572443638116569e-05,
|
|
"epoch": 2.5292479108635098,
|
|
"step": 455
|
|
},
|
|
{
|
|
"loss": 1.3394,
|
|
"grad_norm": 7.837366580963135,
|
|
"learning_rate": 1.5711288859107863e-05,
|
|
"epoch": 2.534818941504178,
|
|
"step": 456
|
|
},
|
|
{
|
|
"loss": 1.4262,
|
|
"grad_norm": 7.609643936157227,
|
|
"learning_rate": 1.5698141337050033e-05,
|
|
"epoch": 2.5403899721448466,
|
|
"step": 457
|
|
},
|
|
{
|
|
"loss": 1.3738,
|
|
"grad_norm": 6.487556457519531,
|
|
"learning_rate": 1.56849938149922e-05,
|
|
"epoch": 2.545961002785515,
|
|
"step": 458
|
|
},
|
|
{
|
|
"loss": 1.4021,
|
|
"grad_norm": 6.344869136810303,
|
|
"learning_rate": 1.5671846292934368e-05,
|
|
"epoch": 2.551532033426184,
|
|
"step": 459
|
|
},
|
|
{
|
|
"loss": 1.3887,
|
|
"grad_norm": 6.960203170776367,
|
|
"learning_rate": 1.5658698770876538e-05,
|
|
"epoch": 2.5571030640668524,
|
|
"step": 460
|
|
},
|
|
{
|
|
"loss": 1.2997,
|
|
"grad_norm": 11.57795524597168,
|
|
"learning_rate": 1.5645551248818707e-05,
|
|
"epoch": 2.562674094707521,
|
|
"step": 461
|
|
},
|
|
{
|
|
"loss": 1.5967,
|
|
"grad_norm": 6.889705181121826,
|
|
"learning_rate": 1.5632403726760877e-05,
|
|
"epoch": 2.5682451253481893,
|
|
"step": 462
|
|
},
|
|
{
|
|
"loss": 1.2643,
|
|
"grad_norm": 8.502350807189941,
|
|
"learning_rate": 1.5619256204703046e-05,
|
|
"epoch": 2.573816155988858,
|
|
"step": 463
|
|
},
|
|
{
|
|
"loss": 1.3686,
|
|
"grad_norm": 8.704366683959961,
|
|
"learning_rate": 1.5606108682645216e-05,
|
|
"epoch": 2.5793871866295266,
|
|
"step": 464
|
|
},
|
|
{
|
|
"loss": 0.9961,
|
|
"grad_norm": 8.154948234558105,
|
|
"learning_rate": 1.5592961160587382e-05,
|
|
"epoch": 2.584958217270195,
|
|
"step": 465
|
|
},
|
|
{
|
|
"loss": 1.0603,
|
|
"grad_norm": 5.729700088500977,
|
|
"learning_rate": 1.557981363852955e-05,
|
|
"epoch": 2.5905292479108635,
|
|
"step": 466
|
|
},
|
|
{
|
|
"loss": 1.6641,
|
|
"grad_norm": 7.716269493103027,
|
|
"learning_rate": 1.556666611647172e-05,
|
|
"epoch": 2.596100278551532,
|
|
"step": 467
|
|
},
|
|
{
|
|
"loss": 1.2886,
|
|
"grad_norm": 11.220166206359863,
|
|
"learning_rate": 1.555351859441389e-05,
|
|
"epoch": 2.6016713091922004,
|
|
"step": 468
|
|
},
|
|
{
|
|
"loss": 1.2922,
|
|
"grad_norm": 7.163726329803467,
|
|
"learning_rate": 1.554037107235606e-05,
|
|
"epoch": 2.6072423398328692,
|
|
"step": 469
|
|
},
|
|
{
|
|
"loss": 1.1046,
|
|
"grad_norm": 7.28581428527832,
|
|
"learning_rate": 1.552722355029823e-05,
|
|
"epoch": 2.6128133704735377,
|
|
"step": 470
|
|
},
|
|
{
|
|
"loss": 1.6142,
|
|
"grad_norm": 9.65365219116211,
|
|
"learning_rate": 1.5514076028240396e-05,
|
|
"epoch": 2.618384401114206,
|
|
"step": 471
|
|
},
|
|
{
|
|
"loss": 1.5575,
|
|
"grad_norm": 6.458492279052734,
|
|
"learning_rate": 1.550092850618257e-05,
|
|
"epoch": 2.6239554317548746,
|
|
"step": 472
|
|
},
|
|
{
|
|
"loss": 1.3655,
|
|
"grad_norm": 7.325246810913086,
|
|
"learning_rate": 1.5487780984124738e-05,
|
|
"epoch": 2.6295264623955434,
|
|
"step": 473
|
|
},
|
|
{
|
|
"loss": 1.3344,
|
|
"grad_norm": 7.81355619430542,
|
|
"learning_rate": 1.5474633462066908e-05,
|
|
"epoch": 2.635097493036212,
|
|
"step": 474
|
|
},
|
|
{
|
|
"loss": 1.2505,
|
|
"grad_norm": 7.347303867340088,
|
|
"learning_rate": 1.5461485940009074e-05,
|
|
"epoch": 2.6406685236768803,
|
|
"step": 475
|
|
},
|
|
{
|
|
"loss": 1.1988,
|
|
"grad_norm": 7.306774616241455,
|
|
"learning_rate": 1.5448338417951243e-05,
|
|
"epoch": 2.6462395543175488,
|
|
"step": 476
|
|
},
|
|
{
|
|
"loss": 1.4075,
|
|
"grad_norm": 7.261951446533203,
|
|
"learning_rate": 1.5435190895893413e-05,
|
|
"epoch": 2.651810584958217,
|
|
"step": 477
|
|
},
|
|
{
|
|
"loss": 1.3235,
|
|
"grad_norm": 8.138806343078613,
|
|
"learning_rate": 1.5422043373835582e-05,
|
|
"epoch": 2.6573816155988856,
|
|
"step": 478
|
|
},
|
|
{
|
|
"loss": 1.4297,
|
|
"grad_norm": 7.515624046325684,
|
|
"learning_rate": 1.5408895851777752e-05,
|
|
"epoch": 2.662952646239554,
|
|
"step": 479
|
|
},
|
|
{
|
|
"loss": 1.0187,
|
|
"grad_norm": 7.298752307891846,
|
|
"learning_rate": 1.539574832971992e-05,
|
|
"epoch": 2.668523676880223,
|
|
"step": 480
|
|
},
|
|
{
|
|
"loss": 1.1512,
|
|
"grad_norm": 7.08530855178833,
|
|
"learning_rate": 1.538260080766209e-05,
|
|
"epoch": 2.6740947075208914,
|
|
"step": 481
|
|
},
|
|
{
|
|
"loss": 0.9209,
|
|
"grad_norm": 8.528051376342773,
|
|
"learning_rate": 1.5369453285604257e-05,
|
|
"epoch": 2.67966573816156,
|
|
"step": 482
|
|
},
|
|
{
|
|
"loss": 1.6726,
|
|
"grad_norm": 6.991207122802734,
|
|
"learning_rate": 1.535630576354643e-05,
|
|
"epoch": 2.6852367688022283,
|
|
"step": 483
|
|
},
|
|
{
|
|
"loss": 1.6101,
|
|
"grad_norm": 6.910933971405029,
|
|
"learning_rate": 1.53431582414886e-05,
|
|
"epoch": 2.690807799442897,
|
|
"step": 484
|
|
},
|
|
{
|
|
"loss": 1.0596,
|
|
"grad_norm": 6.858171463012695,
|
|
"learning_rate": 1.5330010719430766e-05,
|
|
"epoch": 2.6963788300835656,
|
|
"step": 485
|
|
},
|
|
{
|
|
"loss": 1.3009,
|
|
"grad_norm": 7.1738409996032715,
|
|
"learning_rate": 1.5316863197372935e-05,
|
|
"epoch": 2.701949860724234,
|
|
"step": 486
|
|
},
|
|
{
|
|
"loss": 1.1306,
|
|
"grad_norm": 6.751303672790527,
|
|
"learning_rate": 1.5303715675315105e-05,
|
|
"epoch": 2.7075208913649025,
|
|
"step": 487
|
|
},
|
|
{
|
|
"loss": 1.6064,
|
|
"grad_norm": 7.458596706390381,
|
|
"learning_rate": 1.5290568153257274e-05,
|
|
"epoch": 2.713091922005571,
|
|
"step": 488
|
|
},
|
|
{
|
|
"loss": 1.3423,
|
|
"grad_norm": 4.847519397735596,
|
|
"learning_rate": 1.5277420631199444e-05,
|
|
"epoch": 2.7186629526462394,
|
|
"step": 489
|
|
},
|
|
{
|
|
"loss": 1.0908,
|
|
"grad_norm": 6.585028648376465,
|
|
"learning_rate": 1.5264273109141613e-05,
|
|
"epoch": 2.724233983286908,
|
|
"step": 490
|
|
},
|
|
{
|
|
"loss": 1.6632,
|
|
"grad_norm": 5.222984790802002,
|
|
"learning_rate": 1.5251125587083783e-05,
|
|
"epoch": 2.7298050139275767,
|
|
"step": 491
|
|
},
|
|
{
|
|
"loss": 1.3113,
|
|
"grad_norm": 6.947058200836182,
|
|
"learning_rate": 1.523797806502595e-05,
|
|
"epoch": 2.735376044568245,
|
|
"step": 492
|
|
},
|
|
{
|
|
"loss": 1.0863,
|
|
"grad_norm": 5.885672569274902,
|
|
"learning_rate": 1.522483054296812e-05,
|
|
"epoch": 2.7409470752089136,
|
|
"step": 493
|
|
},
|
|
{
|
|
"loss": 1.1982,
|
|
"grad_norm": 7.9502034187316895,
|
|
"learning_rate": 1.521168302091029e-05,
|
|
"epoch": 2.7465181058495824,
|
|
"step": 494
|
|
},
|
|
{
|
|
"loss": 1.3941,
|
|
"grad_norm": 5.9523773193359375,
|
|
"learning_rate": 1.5198535498852457e-05,
|
|
"epoch": 2.752089136490251,
|
|
"step": 495
|
|
},
|
|
{
|
|
"loss": 1.3251,
|
|
"grad_norm": 7.984345436096191,
|
|
"learning_rate": 1.5185387976794627e-05,
|
|
"epoch": 2.7576601671309193,
|
|
"step": 496
|
|
},
|
|
{
|
|
"loss": 0.8109,
|
|
"grad_norm": 8.467183113098145,
|
|
"learning_rate": 1.5172240454736796e-05,
|
|
"epoch": 2.7632311977715878,
|
|
"step": 497
|
|
},
|
|
{
|
|
"loss": 1.1339,
|
|
"grad_norm": 7.878790378570557,
|
|
"learning_rate": 1.5159092932678966e-05,
|
|
"epoch": 2.768802228412256,
|
|
"step": 498
|
|
},
|
|
{
|
|
"loss": 1.1736,
|
|
"grad_norm": 5.638209819793701,
|
|
"learning_rate": 1.5145945410621134e-05,
|
|
"epoch": 2.7743732590529246,
|
|
"step": 499
|
|
},
|
|
{
|
|
"loss": 1.3546,
|
|
"grad_norm": 7.818211078643799,
|
|
"learning_rate": 1.5132797888563303e-05,
|
|
"epoch": 2.779944289693593,
|
|
"step": 500
|
|
},
|
|
{
|
|
"eval_loss": 2.6166257858276367,
|
|
"eval_runtime": 35.971,
|
|
"eval_samples_per_second": 39.921,
|
|
"eval_steps_per_second": 2.002,
|
|
"epoch": 2.779944289693593,
|
|
"step": 500
|
|
},
|
|
{
|
|
"loss": 1.4636,
|
|
"grad_norm": 6.118830680847168,
|
|
"learning_rate": 1.5119650366505473e-05,
|
|
"epoch": 2.785515320334262,
|
|
"step": 501
|
|
},
|
|
{
|
|
"loss": 1.5519,
|
|
"grad_norm": 7.9165778160095215,
|
|
"learning_rate": 1.510650284444764e-05,
|
|
"epoch": 2.7910863509749304,
|
|
"step": 502
|
|
},
|
|
{
|
|
"loss": 1.5206,
|
|
"grad_norm": 6.975761413574219,
|
|
"learning_rate": 1.5093355322389812e-05,
|
|
"epoch": 2.796657381615599,
|
|
"step": 503
|
|
},
|
|
{
|
|
"loss": 1.0665,
|
|
"grad_norm": 9.277933120727539,
|
|
"learning_rate": 1.5080207800331981e-05,
|
|
"epoch": 2.8022284122562673,
|
|
"step": 504
|
|
},
|
|
{
|
|
"loss": 1.2801,
|
|
"grad_norm": 8.121682167053223,
|
|
"learning_rate": 1.5067060278274151e-05,
|
|
"epoch": 2.807799442896936,
|
|
"step": 505
|
|
},
|
|
{
|
|
"loss": 1.565,
|
|
"grad_norm": 8.76021957397461,
|
|
"learning_rate": 1.5053912756216319e-05,
|
|
"epoch": 2.8133704735376046,
|
|
"step": 506
|
|
},
|
|
{
|
|
"loss": 1.3502,
|
|
"grad_norm": 8.618566513061523,
|
|
"learning_rate": 1.5040765234158488e-05,
|
|
"epoch": 2.818941504178273,
|
|
"step": 507
|
|
},
|
|
{
|
|
"loss": 1.5859,
|
|
"grad_norm": 8.027894020080566,
|
|
"learning_rate": 1.5027617712100658e-05,
|
|
"epoch": 2.8245125348189415,
|
|
"step": 508
|
|
},
|
|
{
|
|
"loss": 1.4159,
|
|
"grad_norm": 7.063473701477051,
|
|
"learning_rate": 1.5014470190042826e-05,
|
|
"epoch": 2.83008356545961,
|
|
"step": 509
|
|
},
|
|
{
|
|
"loss": 1.5672,
|
|
"grad_norm": 6.095931053161621,
|
|
"learning_rate": 1.5001322667984995e-05,
|
|
"epoch": 2.8356545961002784,
|
|
"step": 510
|
|
},
|
|
{
|
|
"loss": 1.2551,
|
|
"grad_norm": 6.445271968841553,
|
|
"learning_rate": 1.4988175145927165e-05,
|
|
"epoch": 2.841225626740947,
|
|
"step": 511
|
|
},
|
|
{
|
|
"loss": 0.9671,
|
|
"grad_norm": 7.601891040802002,
|
|
"learning_rate": 1.4975027623869334e-05,
|
|
"epoch": 2.8467966573816157,
|
|
"step": 512
|
|
},
|
|
{
|
|
"loss": 1.4462,
|
|
"grad_norm": 8.017728805541992,
|
|
"learning_rate": 1.4961880101811502e-05,
|
|
"epoch": 2.852367688022284,
|
|
"step": 513
|
|
},
|
|
{
|
|
"loss": 1.2006,
|
|
"grad_norm": 6.753676891326904,
|
|
"learning_rate": 1.4948732579753672e-05,
|
|
"epoch": 2.8579387186629526,
|
|
"step": 514
|
|
},
|
|
{
|
|
"loss": 0.9354,
|
|
"grad_norm": 6.220627784729004,
|
|
"learning_rate": 1.4935585057695843e-05,
|
|
"epoch": 2.863509749303621,
|
|
"step": 515
|
|
},
|
|
{
|
|
"loss": 1.5554,
|
|
"grad_norm": 7.825878620147705,
|
|
"learning_rate": 1.4922437535638009e-05,
|
|
"epoch": 2.86908077994429,
|
|
"step": 516
|
|
},
|
|
{
|
|
"loss": 0.9281,
|
|
"grad_norm": 7.7669548988342285,
|
|
"learning_rate": 1.490929001358018e-05,
|
|
"epoch": 2.8746518105849583,
|
|
"step": 517
|
|
},
|
|
{
|
|
"loss": 1.1678,
|
|
"grad_norm": 6.18816614151001,
|
|
"learning_rate": 1.489614249152235e-05,
|
|
"epoch": 2.8802228412256268,
|
|
"step": 518
|
|
},
|
|
{
|
|
"loss": 0.8378,
|
|
"grad_norm": 11.241938591003418,
|
|
"learning_rate": 1.4882994969464517e-05,
|
|
"epoch": 2.885793871866295,
|
|
"step": 519
|
|
},
|
|
{
|
|
"loss": 1.6602,
|
|
"grad_norm": 6.708087921142578,
|
|
"learning_rate": 1.4869847447406687e-05,
|
|
"epoch": 2.8913649025069637,
|
|
"step": 520
|
|
},
|
|
{
|
|
"loss": 1.5575,
|
|
"grad_norm": 8.96353530883789,
|
|
"learning_rate": 1.4856699925348856e-05,
|
|
"epoch": 2.896935933147632,
|
|
"step": 521
|
|
},
|
|
{
|
|
"loss": 1.3553,
|
|
"grad_norm": 7.286456108093262,
|
|
"learning_rate": 1.4843552403291026e-05,
|
|
"epoch": 2.902506963788301,
|
|
"step": 522
|
|
},
|
|
{
|
|
"loss": 1.3618,
|
|
"grad_norm": 6.448929309844971,
|
|
"learning_rate": 1.4830404881233194e-05,
|
|
"epoch": 2.9080779944289694,
|
|
"step": 523
|
|
},
|
|
{
|
|
"loss": 1.0911,
|
|
"grad_norm": 6.1524739265441895,
|
|
"learning_rate": 1.4817257359175363e-05,
|
|
"epoch": 2.913649025069638,
|
|
"step": 524
|
|
},
|
|
{
|
|
"loss": 1.2465,
|
|
"grad_norm": 6.833171367645264,
|
|
"learning_rate": 1.4804109837117533e-05,
|
|
"epoch": 2.9192200557103063,
|
|
"step": 525
|
|
},
|
|
{
|
|
"loss": 0.9937,
|
|
"grad_norm": 8.745670318603516,
|
|
"learning_rate": 1.47909623150597e-05,
|
|
"epoch": 2.924791086350975,
|
|
"step": 526
|
|
},
|
|
{
|
|
"loss": 1.3931,
|
|
"grad_norm": 6.3659186363220215,
|
|
"learning_rate": 1.477781479300187e-05,
|
|
"epoch": 2.9303621169916436,
|
|
"step": 527
|
|
},
|
|
{
|
|
"loss": 1.029,
|
|
"grad_norm": 8.309256553649902,
|
|
"learning_rate": 1.476466727094404e-05,
|
|
"epoch": 2.935933147632312,
|
|
"step": 528
|
|
},
|
|
{
|
|
"loss": 1.383,
|
|
"grad_norm": 7.611057758331299,
|
|
"learning_rate": 1.4751519748886211e-05,
|
|
"epoch": 2.9415041782729805,
|
|
"step": 529
|
|
},
|
|
{
|
|
"loss": 1.4833,
|
|
"grad_norm": 9.441068649291992,
|
|
"learning_rate": 1.4738372226828379e-05,
|
|
"epoch": 2.947075208913649,
|
|
"step": 530
|
|
},
|
|
{
|
|
"loss": 1.2739,
|
|
"grad_norm": 7.198431968688965,
|
|
"learning_rate": 1.4725224704770548e-05,
|
|
"epoch": 2.9526462395543174,
|
|
"step": 531
|
|
},
|
|
{
|
|
"loss": 1.4294,
|
|
"grad_norm": 8.88117790222168,
|
|
"learning_rate": 1.4712077182712718e-05,
|
|
"epoch": 2.958217270194986,
|
|
"step": 532
|
|
},
|
|
{
|
|
"loss": 1.0204,
|
|
"grad_norm": 9.982294082641602,
|
|
"learning_rate": 1.4698929660654886e-05,
|
|
"epoch": 2.9637883008356547,
|
|
"step": 533
|
|
},
|
|
{
|
|
"loss": 1.1488,
|
|
"grad_norm": 8.535533905029297,
|
|
"learning_rate": 1.4685782138597055e-05,
|
|
"epoch": 2.969359331476323,
|
|
"step": 534
|
|
},
|
|
{
|
|
"loss": 1.49,
|
|
"grad_norm": 6.813885688781738,
|
|
"learning_rate": 1.4672634616539225e-05,
|
|
"epoch": 2.9749303621169916,
|
|
"step": 535
|
|
},
|
|
{
|
|
"loss": 1.113,
|
|
"grad_norm": 9.557439804077148,
|
|
"learning_rate": 1.4659487094481394e-05,
|
|
"epoch": 2.98050139275766,
|
|
"step": 536
|
|
},
|
|
{
|
|
"loss": 1.5004,
|
|
"grad_norm": 6.406128883361816,
|
|
"learning_rate": 1.4646339572423562e-05,
|
|
"epoch": 2.986072423398329,
|
|
"step": 537
|
|
},
|
|
{
|
|
"loss": 1.1722,
|
|
"grad_norm": 7.9670915603637695,
|
|
"learning_rate": 1.4633192050365732e-05,
|
|
"epoch": 2.9916434540389973,
|
|
"step": 538
|
|
},
|
|
{
|
|
"loss": 1.5033,
|
|
"grad_norm": 9.402728080749512,
|
|
"learning_rate": 1.4620044528307901e-05,
|
|
"epoch": 2.997214484679666,
|
|
"step": 539
|
|
},
|
|
{
|
|
"loss": 1.279,
|
|
"grad_norm": 7.38714075088501,
|
|
"learning_rate": 1.4606897006250069e-05,
|
|
"epoch": 3.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"loss": 0.6426,
|
|
"grad_norm": 7.639667510986328,
|
|
"learning_rate": 1.4593749484192238e-05,
|
|
"epoch": 3.0055710306406684,
|
|
"step": 541
|
|
},
|
|
{
|
|
"loss": 0.8426,
|
|
"grad_norm": 7.864633560180664,
|
|
"learning_rate": 1.458060196213441e-05,
|
|
"epoch": 3.011142061281337,
|
|
"step": 542
|
|
},
|
|
{
|
|
"loss": 1.0651,
|
|
"grad_norm": 6.637276649475098,
|
|
"learning_rate": 1.4567454440076576e-05,
|
|
"epoch": 3.0167130919220058,
|
|
"step": 543
|
|
},
|
|
{
|
|
"loss": 1.0804,
|
|
"grad_norm": 7.148686408996582,
|
|
"learning_rate": 1.4554306918018747e-05,
|
|
"epoch": 3.022284122562674,
|
|
"step": 544
|
|
},
|
|
{
|
|
"loss": 1.0182,
|
|
"grad_norm": 6.767364501953125,
|
|
"learning_rate": 1.4541159395960917e-05,
|
|
"epoch": 3.0278551532033426,
|
|
"step": 545
|
|
},
|
|
{
|
|
"loss": 0.8165,
|
|
"grad_norm": 6.77062463760376,
|
|
"learning_rate": 1.4528011873903086e-05,
|
|
"epoch": 3.033426183844011,
|
|
"step": 546
|
|
},
|
|
{
|
|
"loss": 0.9941,
|
|
"grad_norm": 8.067922592163086,
|
|
"learning_rate": 1.4514864351845254e-05,
|
|
"epoch": 3.0389972144846795,
|
|
"step": 547
|
|
},
|
|
{
|
|
"loss": 0.9579,
|
|
"grad_norm": 8.817468643188477,
|
|
"learning_rate": 1.4501716829787423e-05,
|
|
"epoch": 3.0445682451253484,
|
|
"step": 548
|
|
},
|
|
{
|
|
"loss": 0.6023,
|
|
"grad_norm": 8.70374870300293,
|
|
"learning_rate": 1.4488569307729593e-05,
|
|
"epoch": 3.050139275766017,
|
|
"step": 549
|
|
},
|
|
{
|
|
"loss": 1.1392,
|
|
"grad_norm": 9.344374656677246,
|
|
"learning_rate": 1.447542178567176e-05,
|
|
"epoch": 3.0557103064066853,
|
|
"step": 550
|
|
},
|
|
{
|
|
"eval_loss": 3.0347440242767334,
|
|
"eval_runtime": 35.9429,
|
|
"eval_samples_per_second": 39.952,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 3.0557103064066853,
|
|
"step": 550
|
|
},
|
|
{
|
|
"loss": 0.663,
|
|
"grad_norm": 10.07166862487793,
|
|
"learning_rate": 1.446227426361393e-05,
|
|
"epoch": 3.0612813370473537,
|
|
"step": 551
|
|
},
|
|
{
|
|
"loss": 1.1801,
|
|
"grad_norm": 14.619653701782227,
|
|
"learning_rate": 1.44491267415561e-05,
|
|
"epoch": 3.066852367688022,
|
|
"step": 552
|
|
},
|
|
{
|
|
"loss": 0.9107,
|
|
"grad_norm": 10.427509307861328,
|
|
"learning_rate": 1.443597921949827e-05,
|
|
"epoch": 3.0724233983286906,
|
|
"step": 553
|
|
},
|
|
{
|
|
"loss": 0.9474,
|
|
"grad_norm": 8.392213821411133,
|
|
"learning_rate": 1.4422831697440437e-05,
|
|
"epoch": 3.0779944289693595,
|
|
"step": 554
|
|
},
|
|
{
|
|
"loss": 0.7972,
|
|
"grad_norm": 13.848929405212402,
|
|
"learning_rate": 1.4409684175382607e-05,
|
|
"epoch": 3.083565459610028,
|
|
"step": 555
|
|
},
|
|
{
|
|
"loss": 0.7298,
|
|
"grad_norm": 9.263422966003418,
|
|
"learning_rate": 1.4396536653324778e-05,
|
|
"epoch": 3.0891364902506964,
|
|
"step": 556
|
|
},
|
|
{
|
|
"loss": 0.5749,
|
|
"grad_norm": 11.082460403442383,
|
|
"learning_rate": 1.4383389131266946e-05,
|
|
"epoch": 3.094707520891365,
|
|
"step": 557
|
|
},
|
|
{
|
|
"loss": 0.7301,
|
|
"grad_norm": 7.7812604904174805,
|
|
"learning_rate": 1.4370241609209115e-05,
|
|
"epoch": 3.1002785515320332,
|
|
"step": 558
|
|
},
|
|
{
|
|
"loss": 0.5884,
|
|
"grad_norm": 12.2935791015625,
|
|
"learning_rate": 1.4357094087151285e-05,
|
|
"epoch": 3.105849582172702,
|
|
"step": 559
|
|
},
|
|
{
|
|
"loss": 0.8034,
|
|
"grad_norm": 8.129678726196289,
|
|
"learning_rate": 1.4343946565093454e-05,
|
|
"epoch": 3.1114206128133706,
|
|
"step": 560
|
|
},
|
|
{
|
|
"loss": 0.6528,
|
|
"grad_norm": 8.628301620483398,
|
|
"learning_rate": 1.4330799043035622e-05,
|
|
"epoch": 3.116991643454039,
|
|
"step": 561
|
|
},
|
|
{
|
|
"loss": 0.8483,
|
|
"grad_norm": 10.514995574951172,
|
|
"learning_rate": 1.4317651520977792e-05,
|
|
"epoch": 3.1225626740947074,
|
|
"step": 562
|
|
},
|
|
{
|
|
"loss": 0.7009,
|
|
"grad_norm": 8.187010765075684,
|
|
"learning_rate": 1.4304503998919961e-05,
|
|
"epoch": 3.128133704735376,
|
|
"step": 563
|
|
},
|
|
{
|
|
"loss": 0.8732,
|
|
"grad_norm": 10.525712013244629,
|
|
"learning_rate": 1.4291356476862129e-05,
|
|
"epoch": 3.1337047353760448,
|
|
"step": 564
|
|
},
|
|
{
|
|
"loss": 0.8319,
|
|
"grad_norm": 9.198347091674805,
|
|
"learning_rate": 1.4278208954804298e-05,
|
|
"epoch": 3.139275766016713,
|
|
"step": 565
|
|
},
|
|
{
|
|
"loss": 0.726,
|
|
"grad_norm": 8.486757278442383,
|
|
"learning_rate": 1.4265061432746468e-05,
|
|
"epoch": 3.1448467966573816,
|
|
"step": 566
|
|
},
|
|
{
|
|
"loss": 0.4993,
|
|
"grad_norm": 8.220407485961914,
|
|
"learning_rate": 1.4251913910688636e-05,
|
|
"epoch": 3.15041782729805,
|
|
"step": 567
|
|
},
|
|
{
|
|
"loss": 1.0212,
|
|
"grad_norm": 7.644767761230469,
|
|
"learning_rate": 1.4238766388630805e-05,
|
|
"epoch": 3.1559888579387185,
|
|
"step": 568
|
|
},
|
|
{
|
|
"loss": 0.8306,
|
|
"grad_norm": 11.287712097167969,
|
|
"learning_rate": 1.4225618866572977e-05,
|
|
"epoch": 3.1615598885793874,
|
|
"step": 569
|
|
},
|
|
{
|
|
"loss": 0.626,
|
|
"grad_norm": 7.9160637855529785,
|
|
"learning_rate": 1.4212471344515146e-05,
|
|
"epoch": 3.167130919220056,
|
|
"step": 570
|
|
},
|
|
{
|
|
"loss": 0.75,
|
|
"grad_norm": 11.988582611083984,
|
|
"learning_rate": 1.4199323822457314e-05,
|
|
"epoch": 3.1727019498607243,
|
|
"step": 571
|
|
},
|
|
{
|
|
"loss": 1.2685,
|
|
"grad_norm": 9.961721420288086,
|
|
"learning_rate": 1.4186176300399483e-05,
|
|
"epoch": 3.1782729805013927,
|
|
"step": 572
|
|
},
|
|
{
|
|
"loss": 0.7429,
|
|
"grad_norm": 12.098424911499023,
|
|
"learning_rate": 1.4173028778341653e-05,
|
|
"epoch": 3.183844011142061,
|
|
"step": 573
|
|
},
|
|
{
|
|
"loss": 1.0086,
|
|
"grad_norm": 8.59049129486084,
|
|
"learning_rate": 1.415988125628382e-05,
|
|
"epoch": 3.1894150417827296,
|
|
"step": 574
|
|
},
|
|
{
|
|
"loss": 1.1215,
|
|
"grad_norm": 10.50232219696045,
|
|
"learning_rate": 1.414673373422599e-05,
|
|
"epoch": 3.1949860724233985,
|
|
"step": 575
|
|
},
|
|
{
|
|
"loss": 0.6729,
|
|
"grad_norm": 11.673900604248047,
|
|
"learning_rate": 1.413358621216816e-05,
|
|
"epoch": 3.200557103064067,
|
|
"step": 576
|
|
},
|
|
{
|
|
"loss": 1.2036,
|
|
"grad_norm": 6.419600009918213,
|
|
"learning_rate": 1.412043869011033e-05,
|
|
"epoch": 3.2061281337047354,
|
|
"step": 577
|
|
},
|
|
{
|
|
"loss": 0.6877,
|
|
"grad_norm": 10.218490600585938,
|
|
"learning_rate": 1.4107291168052497e-05,
|
|
"epoch": 3.211699164345404,
|
|
"step": 578
|
|
},
|
|
{
|
|
"loss": 0.5637,
|
|
"grad_norm": 5.7183918952941895,
|
|
"learning_rate": 1.4094143645994667e-05,
|
|
"epoch": 3.2172701949860723,
|
|
"step": 579
|
|
},
|
|
{
|
|
"loss": 0.7498,
|
|
"grad_norm": 11.460823059082031,
|
|
"learning_rate": 1.4080996123936836e-05,
|
|
"epoch": 3.222841225626741,
|
|
"step": 580
|
|
},
|
|
{
|
|
"loss": 0.6792,
|
|
"grad_norm": 8.623233795166016,
|
|
"learning_rate": 1.4067848601879004e-05,
|
|
"epoch": 3.2284122562674096,
|
|
"step": 581
|
|
},
|
|
{
|
|
"loss": 0.6752,
|
|
"grad_norm": 11.339884757995605,
|
|
"learning_rate": 1.4054701079821174e-05,
|
|
"epoch": 3.233983286908078,
|
|
"step": 582
|
|
},
|
|
{
|
|
"loss": 0.8586,
|
|
"grad_norm": 12.452316284179688,
|
|
"learning_rate": 1.4041553557763345e-05,
|
|
"epoch": 3.2395543175487465,
|
|
"step": 583
|
|
},
|
|
{
|
|
"loss": 0.8345,
|
|
"grad_norm": 6.755831241607666,
|
|
"learning_rate": 1.4028406035705514e-05,
|
|
"epoch": 3.245125348189415,
|
|
"step": 584
|
|
},
|
|
{
|
|
"loss": 0.6932,
|
|
"grad_norm": 9.68067741394043,
|
|
"learning_rate": 1.4015258513647682e-05,
|
|
"epoch": 3.2506963788300833,
|
|
"step": 585
|
|
},
|
|
{
|
|
"loss": 1.2071,
|
|
"grad_norm": 11.948298454284668,
|
|
"learning_rate": 1.4002110991589852e-05,
|
|
"epoch": 3.256267409470752,
|
|
"step": 586
|
|
},
|
|
{
|
|
"loss": 0.7349,
|
|
"grad_norm": 11.49226188659668,
|
|
"learning_rate": 1.3988963469532021e-05,
|
|
"epoch": 3.2618384401114207,
|
|
"step": 587
|
|
},
|
|
{
|
|
"loss": 0.7923,
|
|
"grad_norm": 10.757736206054688,
|
|
"learning_rate": 1.3975815947474189e-05,
|
|
"epoch": 3.267409470752089,
|
|
"step": 588
|
|
},
|
|
{
|
|
"loss": 0.6857,
|
|
"grad_norm": 8.46744441986084,
|
|
"learning_rate": 1.3962668425416358e-05,
|
|
"epoch": 3.2729805013927575,
|
|
"step": 589
|
|
},
|
|
{
|
|
"loss": 0.9153,
|
|
"grad_norm": 6.472330093383789,
|
|
"learning_rate": 1.3949520903358528e-05,
|
|
"epoch": 3.2785515320334264,
|
|
"step": 590
|
|
},
|
|
{
|
|
"loss": 0.7542,
|
|
"grad_norm": 12.151514053344727,
|
|
"learning_rate": 1.3936373381300696e-05,
|
|
"epoch": 3.284122562674095,
|
|
"step": 591
|
|
},
|
|
{
|
|
"loss": 0.9487,
|
|
"grad_norm": 11.680760383605957,
|
|
"learning_rate": 1.3923225859242865e-05,
|
|
"epoch": 3.2896935933147633,
|
|
"step": 592
|
|
},
|
|
{
|
|
"loss": 0.6893,
|
|
"grad_norm": 9.367558479309082,
|
|
"learning_rate": 1.3910078337185035e-05,
|
|
"epoch": 3.2952646239554317,
|
|
"step": 593
|
|
},
|
|
{
|
|
"loss": 0.7126,
|
|
"grad_norm": 10.658570289611816,
|
|
"learning_rate": 1.3896930815127206e-05,
|
|
"epoch": 3.3008356545961,
|
|
"step": 594
|
|
},
|
|
{
|
|
"loss": 0.8014,
|
|
"grad_norm": 8.675304412841797,
|
|
"learning_rate": 1.3883783293069372e-05,
|
|
"epoch": 3.3064066852367686,
|
|
"step": 595
|
|
},
|
|
{
|
|
"loss": 0.7078,
|
|
"grad_norm": 6.470170974731445,
|
|
"learning_rate": 1.3870635771011543e-05,
|
|
"epoch": 3.3119777158774375,
|
|
"step": 596
|
|
},
|
|
{
|
|
"loss": 0.6612,
|
|
"grad_norm": 7.141599178314209,
|
|
"learning_rate": 1.3857488248953713e-05,
|
|
"epoch": 3.317548746518106,
|
|
"step": 597
|
|
},
|
|
{
|
|
"loss": 0.8968,
|
|
"grad_norm": 9.977639198303223,
|
|
"learning_rate": 1.384434072689588e-05,
|
|
"epoch": 3.3231197771587744,
|
|
"step": 598
|
|
},
|
|
{
|
|
"loss": 0.8395,
|
|
"grad_norm": 10.208252906799316,
|
|
"learning_rate": 1.383119320483805e-05,
|
|
"epoch": 3.328690807799443,
|
|
"step": 599
|
|
},
|
|
{
|
|
"loss": 0.9248,
|
|
"grad_norm": 9.933085441589355,
|
|
"learning_rate": 1.381804568278022e-05,
|
|
"epoch": 3.3342618384401113,
|
|
"step": 600
|
|
},
|
|
{
|
|
"eval_loss": 2.9689695835113525,
|
|
"eval_runtime": 35.9526,
|
|
"eval_samples_per_second": 39.941,
|
|
"eval_steps_per_second": 2.003,
|
|
"epoch": 3.3342618384401113,
|
|
"step": 600
|
|
},
|
|
{
|
|
"train_runtime": 2220.807,
|
|
"train_samples_per_second": 5.944,
|
|
"train_steps_per_second": 0.743,
|
|
"total_flos": 4.35104765343744e+16,
|
|
"train_loss": 1.654274252106746,
|
|
"epoch": 3.3342618384401113,
|
|
"step": 600
|
|
}
|
|
]</pre></details>
|
|
|
|
<script type="application/json" id="run-payload">{"run_meta": {"model": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "examples_total": 2872, "examples_train": 1436, "examples_eval": 1436, "world_size": 1, "effective_batch_size": 8, "steps_per_epoch_approx": 179.5, "max_steps": 2000, "eval_steps": 50, "save_steps": 50, "learning_rate": 9.95267419777795e-06, "warmup_steps": 10, "lr_scheduler_type": "linear", "weight_decay": 0.009206070410847844, "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.0, "best_checkpoint": "outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06, "best_step": 100, "best_eval_loss": 2.2380564212799072, "best_blended": 1.3520409573791146, "best_blended_step": 600}, "config_snapshot": {"MODEL_NAME": "unsloth/Phi-4-unsloth-bnb-4bit", "CHAT_TEMPLATE": "phi-4", "MAX_SEQ_LENGTH": 2048, "LOAD_IN_4BIT": true, "DATASET_NAME": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "DATASET_SPLIT": "train", "PER_DEVICE_TRAIN_BATCH_SIZE": 2, "GRADIENT_ACCUMULATION_STEPS": 4, "WARMUP_STEPS": 10, "MAX_STEPS": 2000, "LEARNING_RATE": 9.95267419777795e-06, "WEIGHT_DECAY": 0.009206070410847844, "LR_SCHEDULER_TYPE": "linear", "SEED": 3407, "PLOTLY_DARK_MODE": true, "PLOTLY_BASE_COLOR": "#00CC96", "PLOTLY_EMA_SPAN": 25, "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06}, "run_manifest": {"model_name": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": {"name": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "split": "train"}, "training": {"max_steps": 2000, "learning_rate": 9.95267419777795e-06, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 4, "max_seq_length": 2048, "seed": 3407, "optimizer": 
"adamw_8bit", "lr_scheduler_type": "linear"}, "auto_lr": {"enabled": true, "use_n": "train", "n_ref": 1436, "base": 1e-05, "mult": 0.5, "final": 5e-06}, "best": {"checkpoint": "/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "metric": 2.2380564212799072, "metric_name": "eval_loss"}, "plotly": {"html": "training_loss_step.html"}}, "log_history": [{"loss": 2.2041, "grad_norm": 4.026190757751465, "learning_rate": 0.0, "epoch": 0.005571030640668524, "step": 1}, {"loss": 2.6901, "grad_norm": 1.616629719734192, "learning_rate": 1.4636285584967574e-07, "epoch": 0.011142061281337047, "step": 2}, {"loss": 2.6774, "grad_norm": 13.836981773376465, "learning_rate": 2.927257116993515e-07, "epoch": 0.016713091922005572, "step": 3}, {"loss": 2.4478, "grad_norm": 1.857710361480713, "learning_rate": 4.3908856754902726e-07, "epoch": 0.022284122562674095, "step": 4}, {"loss": 1.9988, "grad_norm": 1.4818029403686523, "learning_rate": 5.85451423398703e-07, "epoch": 0.027855153203342618, "step": 5}, {"loss": 2.0358, "grad_norm": 1.726440191268921, "learning_rate": 7.318142792483787e-07, "epoch": 0.033426183844011144, "step": 6}, {"loss": 2.5824, "grad_norm": 2.0604233741760254, "learning_rate": 8.781771350980545e-07, "epoch": 0.03899721448467967, "step": 7}, {"loss": 2.3479, "grad_norm": 1.7288694381713867, "learning_rate": 1.0245399909477302e-06, "epoch": 0.04456824512534819, "step": 8}, {"loss": 2.6387, "grad_norm": 1.9069620370864868, "learning_rate": 1.170902846797406e-06, "epoch": 0.05013927576601671, "step": 9}, {"loss": 2.6083, "grad_norm": 1.4719465970993042, "learning_rate": 1.3172657026470817e-06, "epoch": 0.055710306406685235, "step": 10}, {"loss": 2.3015, "grad_norm": 1.6306267976760864, "learning_rate": 1.4636285584967574e-06, "epoch": 0.06128133704735376, "step": 11}, {"loss": 2.7042, "grad_norm": 1.4724116325378418, "learning_rate": 1.6099914143464333e-06, "epoch": 0.06685236768802229, "step": 12}, {"loss": 2.5386, "grad_norm": 1.5470020771026611, 
"learning_rate": 1.756354270196109e-06, "epoch": 0.07242339832869081, "step": 13}, {"loss": 2.5886, "grad_norm": 2.022662401199341, "learning_rate": 1.9027171260457846e-06, "epoch": 0.07799442896935933, "step": 14}, {"loss": 2.7168, "grad_norm": 1.8387386798858643, "learning_rate": 2.0490799818954605e-06, "epoch": 0.08356545961002786, "step": 15}, {"loss": 2.1886, "grad_norm": 1.9359395503997803, "learning_rate": 2.195442837745136e-06, "epoch": 0.08913649025069638, "step": 16}, {"loss": 2.4191, "grad_norm": 1.5662318468093872, "learning_rate": 2.341805693594812e-06, "epoch": 0.0947075208913649, "step": 17}, {"loss": 2.2736, "grad_norm": 1.7207640409469604, "learning_rate": 2.4881685494444876e-06, "epoch": 0.10027855153203342, "step": 18}, {"loss": 2.7336, "grad_norm": 1.6225577592849731, "learning_rate": 2.6345314052941634e-06, "epoch": 0.10584958217270195, "step": 19}, {"loss": 2.5288, "grad_norm": 1.6348892450332642, "learning_rate": 2.780894261143839e-06, "epoch": 0.11142061281337047, "step": 20}, {"loss": 2.0214, "grad_norm": 1.5059679746627808, "learning_rate": 2.927257116993515e-06, "epoch": 0.116991643454039, "step": 21}, {"loss": 2.3708, "grad_norm": 1.3699105978012085, "learning_rate": 3.073619972843191e-06, "epoch": 0.12256267409470752, "step": 22}, {"loss": 2.5341, "grad_norm": 2.241403341293335, "learning_rate": 3.2199828286928667e-06, "epoch": 0.12813370473537605, "step": 23}, {"loss": 2.4852, "grad_norm": 1.7692517042160034, "learning_rate": 3.3663456845425424e-06, "epoch": 0.13370473537604458, "step": 24}, {"loss": 2.4959, "grad_norm": 1.9559876918792725, "learning_rate": 3.512708540392218e-06, "epoch": 0.1392757660167131, "step": 25}, {"loss": 2.243, "grad_norm": 1.7536145448684692, "learning_rate": 3.659071396241894e-06, "epoch": 0.14484679665738162, "step": 26}, {"loss": 2.1622, "grad_norm": 1.8103671073913574, "learning_rate": 3.805434252091569e-06, "epoch": 0.15041782729805014, "step": 27}, {"loss": 2.1827, "grad_norm": 1.5473895072937012, 
"learning_rate": 3.951797107941245e-06, "epoch": 0.15598885793871867, "step": 28}, {"loss": 2.3346, "grad_norm": 1.7137048244476318, "learning_rate": 4.098159963790921e-06, "epoch": 0.1615598885793872, "step": 29}, {"loss": 1.9959, "grad_norm": 1.7803088426589966, "learning_rate": 4.244522819640596e-06, "epoch": 0.1671309192200557, "step": 30}, {"loss": 2.5963, "grad_norm": 1.3565785884857178, "learning_rate": 4.390885675490272e-06, "epoch": 0.17270194986072424, "step": 31}, {"loss": 2.3312, "grad_norm": 1.4962913990020752, "learning_rate": 4.537248531339948e-06, "epoch": 0.17827298050139276, "step": 32}, {"loss": 2.2653, "grad_norm": 1.4864503145217896, "learning_rate": 4.683611387189624e-06, "epoch": 0.18384401114206128, "step": 33}, {"loss": 2.6019, "grad_norm": 2.2222468852996826, "learning_rate": 4.829974243039299e-06, "epoch": 0.1894150417827298, "step": 34}, {"loss": 2.2858, "grad_norm": 1.6111881732940674, "learning_rate": 4.976337098888975e-06, "epoch": 0.19498607242339833, "step": 35}, {"loss": 2.1479, "grad_norm": 2.307185173034668, "learning_rate": 5.1226999547386506e-06, "epoch": 0.20055710306406685, "step": 36}, {"loss": 2.3149, "grad_norm": 1.9714685678482056, "learning_rate": 5.269062810588327e-06, "epoch": 0.20612813370473537, "step": 37}, {"loss": 2.4299, "grad_norm": 1.6915550231933594, "learning_rate": 5.415425666438002e-06, "epoch": 0.2116991643454039, "step": 38}, {"loss": 2.1583, "grad_norm": 1.9084649085998535, "learning_rate": 5.561788522287678e-06, "epoch": 0.21727019498607242, "step": 39}, {"loss": 2.1631, "grad_norm": 1.882629632949829, "learning_rate": 5.7081513781373534e-06, "epoch": 0.22284122562674094, "step": 40}, {"loss": 2.0198, "grad_norm": 1.335666537284851, "learning_rate": 5.85451423398703e-06, "epoch": 0.22841225626740946, "step": 41}, {"loss": 2.3086, "grad_norm": 2.620265007019043, "learning_rate": 6.000877089836705e-06, "epoch": 0.233983286908078, "step": 42}, {"loss": 2.1388, "grad_norm": 2.0138227939605713, 
"learning_rate": 6.147239945686382e-06, "epoch": 0.2395543175487465, "step": 43}, {"loss": 1.8651, "grad_norm": 1.6108520030975342, "learning_rate": 6.293602801536056e-06, "epoch": 0.24512534818941503, "step": 44}, {"loss": 1.8407, "grad_norm": 1.5935970544815063, "learning_rate": 6.439965657385733e-06, "epoch": 0.25069637883008355, "step": 45}, {"loss": 1.9392, "grad_norm": 1.3794289827346802, "learning_rate": 6.586328513235409e-06, "epoch": 0.2562674094707521, "step": 46}, {"loss": 2.4167, "grad_norm": 1.23729407787323, "learning_rate": 6.732691369085085e-06, "epoch": 0.2618384401114206, "step": 47}, {"loss": 2.2915, "grad_norm": 1.4265947341918945, "learning_rate": 6.87905422493476e-06, "epoch": 0.26740947075208915, "step": 48}, {"loss": 1.7683, "grad_norm": 1.696736216545105, "learning_rate": 7.025417080784436e-06, "epoch": 0.27298050139275765, "step": 49}, {"loss": 2.0858, "grad_norm": 1.5071961879730225, "learning_rate": 7.1717799366341115e-06, "epoch": 0.2785515320334262, "step": 50}, {"eval_loss": 2.315699815750122, "eval_runtime": 35.9132, "eval_samples_per_second": 39.985, "eval_steps_per_second": 2.005, "epoch": 0.2785515320334262, "step": 50}, {"loss": 2.1761, "grad_norm": 1.5999361276626587, "learning_rate": 7.318142792483788e-06, "epoch": 0.2841225626740947, "step": 51}, {"loss": 2.145, "grad_norm": 2.0915520191192627, "learning_rate": 7.464505648333463e-06, "epoch": 0.28969359331476324, "step": 52}, {"loss": 1.829, "grad_norm": 4.090714931488037, "learning_rate": 7.610868504183138e-06, "epoch": 0.29526462395543174, "step": 53}, {"loss": 2.345, "grad_norm": 1.6347575187683105, "learning_rate": 7.757231360032815e-06, "epoch": 0.3008356545961003, "step": 54}, {"loss": 1.9989, "grad_norm": 1.5609041452407837, "learning_rate": 7.90359421588249e-06, "epoch": 0.3064066852367688, "step": 55}, {"loss": 2.3336, "grad_norm": 1.6561325788497925, "learning_rate": 8.049957071732166e-06, "epoch": 0.31197771587743733, "step": 56}, {"loss": 2.0979, "grad_norm": 
1.6579258441925049, "learning_rate": 8.196319927581842e-06, "epoch": 0.31754874651810583, "step": 57}, {"loss": 2.4041, "grad_norm": 1.8354761600494385, "learning_rate": 8.342682783431518e-06, "epoch": 0.3231197771587744, "step": 58}, {"loss": 2.1076, "grad_norm": 1.7043126821517944, "learning_rate": 8.489045639281193e-06, "epoch": 0.3286908077994429, "step": 59}, {"loss": 2.2672, "grad_norm": 1.5663846731185913, "learning_rate": 8.635408495130869e-06, "epoch": 0.3342618384401114, "step": 60}, {"loss": 2.1081, "grad_norm": 1.8134770393371582, "learning_rate": 8.781771350980545e-06, "epoch": 0.3398328690807799, "step": 61}, {"loss": 2.034, "grad_norm": 1.3617796897888184, "learning_rate": 8.928134206830221e-06, "epoch": 0.34540389972144847, "step": 62}, {"loss": 2.1147, "grad_norm": 1.7525910139083862, "learning_rate": 9.074497062679895e-06, "epoch": 0.35097493036211697, "step": 63}, {"loss": 1.9639, "grad_norm": 1.7186285257339478, "learning_rate": 9.220859918529572e-06, "epoch": 0.3565459610027855, "step": 64}, {"loss": 1.7557, "grad_norm": 1.9141530990600586, "learning_rate": 9.367222774379248e-06, "epoch": 0.362116991643454, "step": 65}, {"loss": 1.8861, "grad_norm": 1.696165680885315, "learning_rate": 9.513585630228924e-06, "epoch": 0.36768802228412256, "step": 66}, {"loss": 2.272, "grad_norm": 1.24228036403656, "learning_rate": 9.659948486078598e-06, "epoch": 0.3732590529247911, "step": 67}, {"loss": 1.7173, "grad_norm": 1.9760662317276, "learning_rate": 9.806311341928276e-06, "epoch": 0.3788300835654596, "step": 68}, {"loss": 1.8545, "grad_norm": 1.3207972049713135, "learning_rate": 9.95267419777795e-06, "epoch": 0.38440111420612816, "step": 69}, {"loss": 2.0492, "grad_norm": 1.5849637985229492, "learning_rate": 1.0099037053627625e-05, "epoch": 0.38997214484679665, "step": 70}, {"loss": 1.9203, "grad_norm": 2.7006468772888184, "learning_rate": 1.0245399909477301e-05, "epoch": 0.3955431754874652, "step": 71}, {"loss": 2.1317, "grad_norm": 1.9178322553634644, 
"learning_rate": 1.0391762765326979e-05, "epoch": 0.4011142061281337, "step": 72}, {"loss": 2.282, "grad_norm": 1.5044149160385132, "learning_rate": 1.0538125621176653e-05, "epoch": 0.40668523676880225, "step": 73}, {"loss": 2.2023, "grad_norm": 1.9386659860610962, "learning_rate": 1.068448847702633e-05, "epoch": 0.41225626740947074, "step": 74}, {"loss": 2.3545, "grad_norm": 1.3408238887786865, "learning_rate": 1.0830851332876004e-05, "epoch": 0.4178272980501393, "step": 75}, {"loss": 2.1984, "grad_norm": 2.221109390258789, "learning_rate": 1.0977214188725682e-05, "epoch": 0.4233983286908078, "step": 76}, {"loss": 1.9843, "grad_norm": 1.7843296527862549, "learning_rate": 1.1123577044575356e-05, "epoch": 0.42896935933147634, "step": 77}, {"loss": 2.2899, "grad_norm": 1.6259101629257202, "learning_rate": 1.1269939900425032e-05, "epoch": 0.43454038997214484, "step": 78}, {"loss": 1.8448, "grad_norm": 1.718583345413208, "learning_rate": 1.1416302756274707e-05, "epoch": 0.4401114206128134, "step": 79}, {"loss": 1.857, "grad_norm": 1.8396937847137451, "learning_rate": 1.1562665612124385e-05, "epoch": 0.4456824512534819, "step": 80}, {"loss": 2.1387, "grad_norm": 1.808605670928955, "learning_rate": 1.170902846797406e-05, "epoch": 0.45125348189415043, "step": 81}, {"loss": 1.9587, "grad_norm": 2.590714931488037, "learning_rate": 1.1855391323823735e-05, "epoch": 0.4568245125348189, "step": 82}, {"loss": 2.5776, "grad_norm": 1.550307273864746, "learning_rate": 1.200175417967341e-05, "epoch": 0.4623955431754875, "step": 83}, {"loss": 2.1138, "grad_norm": 1.7622662782669067, "learning_rate": 1.2148117035523088e-05, "epoch": 0.467966573816156, "step": 84}, {"loss": 2.0632, "grad_norm": 2.1933865547180176, "learning_rate": 1.2294479891372764e-05, "epoch": 0.4735376044568245, "step": 85}, {"loss": 2.1885, "grad_norm": 1.6188870668411255, "learning_rate": 1.2440842747222438e-05, "epoch": 0.479108635097493, "step": 86}, {"loss": 2.2515, "grad_norm": 1.6533507108688354, 
"learning_rate": 1.2587205603072113e-05, "epoch": 0.48467966573816157, "step": 87}, {"loss": 2.3173, "grad_norm": 1.295457363128662, "learning_rate": 1.2733568458921789e-05, "epoch": 0.49025069637883006, "step": 88}, {"loss": 1.9536, "grad_norm": 1.5764713287353516, "learning_rate": 1.2879931314771467e-05, "epoch": 0.4958217270194986, "step": 89}, {"loss": 1.9134, "grad_norm": 1.8399816751480103, "learning_rate": 1.3026294170621141e-05, "epoch": 0.5013927576601671, "step": 90}, {"loss": 2.2808, "grad_norm": 1.7519652843475342, "learning_rate": 1.3172657026470817e-05, "epoch": 0.5069637883008357, "step": 91}, {"loss": 2.2055, "grad_norm": 1.4549530744552612, "learning_rate": 1.3319019882320492e-05, "epoch": 0.5125348189415042, "step": 92}, {"loss": 1.9373, "grad_norm": 2.0461559295654297, "learning_rate": 1.346538273817017e-05, "epoch": 0.5181058495821727, "step": 93}, {"loss": 1.7517, "grad_norm": 1.5427114963531494, "learning_rate": 1.3611745594019844e-05, "epoch": 0.5236768802228412, "step": 94}, {"loss": 1.9707, "grad_norm": 1.5442962646484375, "learning_rate": 1.375810844986952e-05, "epoch": 0.5292479108635098, "step": 95}, {"loss": 1.9629, "grad_norm": 1.939523458480835, "learning_rate": 1.3904471305719195e-05, "epoch": 0.5348189415041783, "step": 96}, {"loss": 1.8158, "grad_norm": 1.9389022588729858, "learning_rate": 1.4050834161568872e-05, "epoch": 0.5403899721448467, "step": 97}, {"loss": 2.5133, "grad_norm": 1.9970468282699585, "learning_rate": 1.4197197017418547e-05, "epoch": 0.5459610027855153, "step": 98}, {"loss": 2.263, "grad_norm": 1.5786551237106323, "learning_rate": 1.4343559873268223e-05, "epoch": 0.5515320334261838, "step": 99}, {"loss": 2.1035, "grad_norm": 2.2139763832092285, "learning_rate": 1.4489922729117897e-05, "epoch": 0.5571030640668524, "step": 100}, {"eval_loss": 2.2380564212799072, "eval_runtime": 35.9418, "eval_samples_per_second": 39.954, "eval_steps_per_second": 2.003, "epoch": 0.5571030640668524, "step": 100}, {"loss": 1.8114, 
"grad_norm": 2.2652857303619385, "learning_rate": 1.4636285584967575e-05, "epoch": 0.5626740947075209, "step": 101}, {"loss": 2.3733, "grad_norm": 1.688621997833252, "learning_rate": 1.4782648440817251e-05, "epoch": 0.5682451253481894, "step": 102}, {"loss": 1.929, "grad_norm": 2.500704765319824, "learning_rate": 1.4929011296666926e-05, "epoch": 0.5738161559888579, "step": 103}, {"loss": 1.9718, "grad_norm": 1.492704153060913, "learning_rate": 1.50753741525166e-05, "epoch": 0.5793871866295265, "step": 104}, {"loss": 2.1026, "grad_norm": 1.6980139017105103, "learning_rate": 1.5221737008366276e-05, "epoch": 0.584958217270195, "step": 105}, {"loss": 1.7999, "grad_norm": 1.7127199172973633, "learning_rate": 1.5368099864215953e-05, "epoch": 0.5905292479108635, "step": 106}, {"loss": 2.0349, "grad_norm": 1.8260376453399658, "learning_rate": 1.551446272006563e-05, "epoch": 0.596100278551532, "step": 107}, {"loss": 2.2571, "grad_norm": 1.8122572898864746, "learning_rate": 1.5660825575915305e-05, "epoch": 0.6016713091922006, "step": 108}, {"loss": 2.0794, "grad_norm": 2.299410343170166, "learning_rate": 1.580718843176498e-05, "epoch": 0.6072423398328691, "step": 109}, {"loss": 2.1671, "grad_norm": 1.4942196607589722, "learning_rate": 1.5953551287614657e-05, "epoch": 0.6128133704735376, "step": 110}, {"loss": 2.1292, "grad_norm": 1.6794716119766235, "learning_rate": 1.609991414346433e-05, "epoch": 0.6183844011142061, "step": 111}, {"loss": 2.2534, "grad_norm": 1.8196300268173218, "learning_rate": 1.6246276999314006e-05, "epoch": 0.6239554317548747, "step": 112}, {"loss": 2.0333, "grad_norm": 1.5703504085540771, "learning_rate": 1.6392639855163684e-05, "epoch": 0.6295264623955432, "step": 113}, {"loss": 2.1495, "grad_norm": 1.766376256942749, "learning_rate": 1.6539002711013358e-05, "epoch": 0.6350974930362117, "step": 114}, {"loss": 1.5522, "grad_norm": 2.6598968505859375, "learning_rate": 1.6685365566863036e-05, "epoch": 0.6406685236768802, "step": 115}, {"loss": 2.2221, 
"grad_norm": 10.731008529663086, "learning_rate": 1.683172842271271e-05, "epoch": 0.6462395543175488, "step": 116}, {"loss": 2.4227, "grad_norm": 2.2150168418884277, "learning_rate": 1.6978091278562385e-05, "epoch": 0.6518105849582173, "step": 117}, {"loss": 2.3886, "grad_norm": 2.283031940460205, "learning_rate": 1.7124454134412063e-05, "epoch": 0.6573816155988857, "step": 118}, {"loss": 2.0651, "grad_norm": 2.6018834114074707, "learning_rate": 1.7270816990261737e-05, "epoch": 0.6629526462395543, "step": 119}, {"loss": 1.9465, "grad_norm": 1.8486937284469604, "learning_rate": 1.7417179846111412e-05, "epoch": 0.6685236768802229, "step": 120}, {"loss": 2.588, "grad_norm": 2.0970637798309326, "learning_rate": 1.756354270196109e-05, "epoch": 0.6740947075208914, "step": 121}, {"loss": 1.7602, "grad_norm": 1.5886075496673584, "learning_rate": 1.7709905557810764e-05, "epoch": 0.6796657381615598, "step": 122}, {"loss": 2.2738, "grad_norm": 2.4414422512054443, "learning_rate": 1.7856268413660442e-05, "epoch": 0.6852367688022284, "step": 123}, {"loss": 1.8145, "grad_norm": 1.7890093326568604, "learning_rate": 1.8002631269510116e-05, "epoch": 0.6908077994428969, "step": 124}, {"loss": 1.7572, "grad_norm": 1.7805349826812744, "learning_rate": 1.814899412535979e-05, "epoch": 0.6963788300835655, "step": 125}, {"loss": 2.0206, "grad_norm": 1.9520258903503418, "learning_rate": 1.829535698120947e-05, "epoch": 0.7019498607242339, "step": 126}, {"loss": 2.1292, "grad_norm": 1.6244016885757446, "learning_rate": 1.8441719837059143e-05, "epoch": 0.7075208913649025, "step": 127}, {"loss": 2.1207, "grad_norm": 1.6681342124938965, "learning_rate": 1.858808269290882e-05, "epoch": 0.713091922005571, "step": 128}, {"loss": 2.1515, "grad_norm": 2.1032838821411133, "learning_rate": 1.8734445548758495e-05, "epoch": 0.7186629526462396, "step": 129}, {"loss": 1.7264, "grad_norm": 2.093341588973999, "learning_rate": 1.888080840460817e-05, "epoch": 0.724233983286908, "step": 130}, {"loss": 1.6036, 
"grad_norm": 1.9431419372558594, "learning_rate": 1.9027171260457848e-05, "epoch": 0.7298050139275766, "step": 131}, {"loss": 2.132, "grad_norm": 3.0380795001983643, "learning_rate": 1.9173534116307522e-05, "epoch": 0.7353760445682451, "step": 132}, {"loss": 2.061, "grad_norm": 3.623516321182251, "learning_rate": 1.9319896972157197e-05, "epoch": 0.7409470752089137, "step": 133}, {"loss": 1.8765, "grad_norm": 2.320667266845703, "learning_rate": 1.9466259828006874e-05, "epoch": 0.7465181058495822, "step": 134}, {"loss": 1.8337, "grad_norm": 1.9040995836257935, "learning_rate": 1.9612622683856552e-05, "epoch": 0.7520891364902507, "step": 135}, {"loss": 2.0664, "grad_norm": 1.8677185773849487, "learning_rate": 1.9758985539706227e-05, "epoch": 0.7576601671309192, "step": 136}, {"loss": 1.9651, "grad_norm": 2.414144992828369, "learning_rate": 1.99053483955559e-05, "epoch": 0.7632311977715878, "step": 137}, {"loss": 1.9243, "grad_norm": 2.5697357654571533, "learning_rate": 1.989220087349807e-05, "epoch": 0.7688022284122563, "step": 138}, {"loss": 2.0289, "grad_norm": 2.384612560272217, "learning_rate": 1.987905335144024e-05, "epoch": 0.7743732590529248, "step": 139}, {"loss": 1.7206, "grad_norm": 2.284605026245117, "learning_rate": 1.986590582938241e-05, "epoch": 0.7799442896935933, "step": 140}, {"loss": 1.7212, "grad_norm": 2.3488142490386963, "learning_rate": 1.985275830732458e-05, "epoch": 0.7855153203342619, "step": 141}, {"loss": 1.559, "grad_norm": 1.849543809890747, "learning_rate": 1.983961078526675e-05, "epoch": 0.7910863509749304, "step": 142}, {"loss": 1.9442, "grad_norm": 2.343719720840454, "learning_rate": 1.9826463263208915e-05, "epoch": 0.7966573816155988, "step": 143}, {"loss": 1.8913, "grad_norm": 2.6115176677703857, "learning_rate": 1.9813315741151084e-05, "epoch": 0.8022284122562674, "step": 144}, {"loss": 2.1248, "grad_norm": 2.703418731689453, "learning_rate": 1.9800168219093254e-05, "epoch": 0.807799442896936, "step": 145}, {"loss": 2.1745, 
"grad_norm": 2.379194736480713, "learning_rate": 1.9787020697035423e-05, "epoch": 0.8133704735376045, "step": 146}, {"loss": 2.0572, "grad_norm": 2.4916770458221436, "learning_rate": 1.9773873174977593e-05, "epoch": 0.8189415041782729, "step": 147}, {"loss": 1.8028, "grad_norm": 3.7550413608551025, "learning_rate": 1.9760725652919762e-05, "epoch": 0.8245125348189415, "step": 148}, {"loss": 2.2202, "grad_norm": 1.704113483428955, "learning_rate": 1.974757813086193e-05, "epoch": 0.83008356545961, "step": 149}, {"loss": 1.8407, "grad_norm": 2.14805269241333, "learning_rate": 1.9734430608804098e-05, "epoch": 0.8356545961002786, "step": 150}, {"eval_loss": 2.264693021774292, "eval_runtime": 35.9452, "eval_samples_per_second": 39.95, "eval_steps_per_second": 2.003, "epoch": 0.8356545961002786, "step": 150}, {"loss": 2.3419, "grad_norm": 2.3600826263427734, "learning_rate": 1.972128308674627e-05, "epoch": 0.841225626740947, "step": 151}, {"loss": 2.0321, "grad_norm": 2.7362117767333984, "learning_rate": 1.970813556468844e-05, "epoch": 0.8467966573816156, "step": 152}, {"loss": 1.7054, "grad_norm": 2.982322931289673, "learning_rate": 1.9694988042630607e-05, "epoch": 0.8523676880222841, "step": 153}, {"loss": 1.9304, "grad_norm": 2.8210840225219727, "learning_rate": 1.9681840520572776e-05, "epoch": 0.8579387186629527, "step": 154}, {"loss": 2.1154, "grad_norm": 2.412022113800049, "learning_rate": 1.9668692998514946e-05, "epoch": 0.8635097493036211, "step": 155}, {"loss": 2.0367, "grad_norm": 2.439105987548828, "learning_rate": 1.9655545476457115e-05, "epoch": 0.8690807799442897, "step": 156}, {"loss": 1.5693, "grad_norm": 2.276296854019165, "learning_rate": 1.9642397954399285e-05, "epoch": 0.8746518105849582, "step": 157}, {"loss": 1.726, "grad_norm": 2.12568998336792, "learning_rate": 1.9629250432341454e-05, "epoch": 0.8802228412256268, "step": 158}, {"loss": 1.7872, "grad_norm": 2.1106767654418945, "learning_rate": 1.9616102910283624e-05, "epoch": 0.8857938718662952, 
"step": 159}, {"loss": 1.9831, "grad_norm": 1.9893423318862915, "learning_rate": 1.960295538822579e-05, "epoch": 0.8913649025069638, "step": 160}, {"loss": 2.0571, "grad_norm": 2.222038984298706, "learning_rate": 1.958980786616796e-05, "epoch": 0.8969359331476323, "step": 161}, {"loss": 1.8674, "grad_norm": 2.5205395221710205, "learning_rate": 1.957666034411013e-05, "epoch": 0.9025069637883009, "step": 162}, {"loss": 1.7982, "grad_norm": 2.212405204772949, "learning_rate": 1.95635128220523e-05, "epoch": 0.9080779944289693, "step": 163}, {"loss": 1.7992, "grad_norm": 2.304945468902588, "learning_rate": 1.9550365299994468e-05, "epoch": 0.9136490250696379, "step": 164}, {"loss": 2.2013, "grad_norm": 2.8349928855895996, "learning_rate": 1.9537217777936638e-05, "epoch": 0.9192200557103064, "step": 165}, {"loss": 1.7908, "grad_norm": 2.2040209770202637, "learning_rate": 1.9524070255878807e-05, "epoch": 0.924791086350975, "step": 166}, {"loss": 2.1308, "grad_norm": 2.550541400909424, "learning_rate": 1.9510922733820977e-05, "epoch": 0.9303621169916435, "step": 167}, {"loss": 1.735, "grad_norm": 2.9808292388916016, "learning_rate": 1.9497775211763146e-05, "epoch": 0.935933147632312, "step": 168}, {"loss": 2.1536, "grad_norm": 2.4572677612304688, "learning_rate": 1.9484627689705316e-05, "epoch": 0.9415041782729805, "step": 169}, {"loss": 1.8616, "grad_norm": 2.414435863494873, "learning_rate": 1.9471480167647482e-05, "epoch": 0.947075208913649, "step": 170}, {"loss": 1.9501, "grad_norm": 2.490251064300537, "learning_rate": 1.945833264558965e-05, "epoch": 0.9526462395543176, "step": 171}, {"loss": 1.7965, "grad_norm": 3.2512645721435547, "learning_rate": 1.944518512353182e-05, "epoch": 0.958217270194986, "step": 172}, {"loss": 1.8903, "grad_norm": 2.0697317123413086, "learning_rate": 1.943203760147399e-05, "epoch": 0.9637883008356546, "step": 173}, {"loss": 1.9153, "grad_norm": 2.869088888168335, "learning_rate": 1.941889007941616e-05, "epoch": 0.9693593314763231, "step": 
174}, {"loss": 2.0043, "grad_norm": 2.5188841819763184, "learning_rate": 1.940574255735833e-05, "epoch": 0.9749303621169917, "step": 175}, {"loss": 2.0106, "grad_norm": 2.2558531761169434, "learning_rate": 1.93925950353005e-05, "epoch": 0.9805013927576601, "step": 176}, {"loss": 2.0356, "grad_norm": 2.78887677192688, "learning_rate": 1.9379447513242665e-05, "epoch": 0.9860724233983287, "step": 177}, {"loss": 1.4849, "grad_norm": 2.9200024604797363, "learning_rate": 1.9366299991184838e-05, "epoch": 0.9916434540389972, "step": 178}, {"loss": 1.6636, "grad_norm": 2.443997621536255, "learning_rate": 1.9353152469127007e-05, "epoch": 0.9972144846796658, "step": 179}, {"loss": 1.4097, "grad_norm": 3.399275779724121, "learning_rate": 1.9340004947069174e-05, "epoch": 1.0, "step": 180}, {"loss": 1.7611, "grad_norm": 2.312861442565918, "learning_rate": 1.9326857425011343e-05, "epoch": 1.0055710306406684, "step": 181}, {"loss": 2.0967, "grad_norm": 2.799191951751709, "learning_rate": 1.9313709902953513e-05, "epoch": 1.011142061281337, "step": 182}, {"loss": 2.3108, "grad_norm": 2.4845213890075684, "learning_rate": 1.9300562380895682e-05, "epoch": 1.0167130919220055, "step": 183}, {"loss": 1.6263, "grad_norm": 2.72027325630188, "learning_rate": 1.928741485883785e-05, "epoch": 1.0222841225626742, "step": 184}, {"loss": 1.6228, "grad_norm": 3.2783589363098145, "learning_rate": 1.927426733678002e-05, "epoch": 1.0278551532033426, "step": 185}, {"loss": 1.9384, "grad_norm": 2.455291986465454, "learning_rate": 1.926111981472219e-05, "epoch": 1.033426183844011, "step": 186}, {"loss": 1.5646, "grad_norm": 2.2230939865112305, "learning_rate": 1.9247972292664357e-05, "epoch": 1.0389972144846797, "step": 187}, {"loss": 1.8545, "grad_norm": 2.596928119659424, "learning_rate": 1.9234824770606526e-05, "epoch": 1.0445682451253482, "step": 188}, {"loss": 1.8436, "grad_norm": 2.5703697204589844, "learning_rate": 1.9221677248548696e-05, "epoch": 1.0501392757660166, "step": 189}, {"loss": 2.2574, 
"grad_norm": 3.021871566772461, "learning_rate": 1.920852972649087e-05, "epoch": 1.0557103064066853, "step": 190}, {"loss": 1.8046, "grad_norm": 2.35603404045105, "learning_rate": 1.9195382204433035e-05, "epoch": 1.0612813370473537, "step": 191}, {"loss": 1.6635, "grad_norm": 2.453967809677124, "learning_rate": 1.9182234682375204e-05, "epoch": 1.0668523676880224, "step": 192}, {"loss": 1.9118, "grad_norm": 3.2305331230163574, "learning_rate": 1.9169087160317374e-05, "epoch": 1.0724233983286908, "step": 193}, {"loss": 1.5529, "grad_norm": 2.248871326446533, "learning_rate": 1.9155939638259543e-05, "epoch": 1.0779944289693593, "step": 194}, {"loss": 1.7932, "grad_norm": 3.0331363677978516, "learning_rate": 1.9142792116201713e-05, "epoch": 1.083565459610028, "step": 195}, {"loss": 1.7632, "grad_norm": 3.543948173522949, "learning_rate": 1.9129644594143882e-05, "epoch": 1.0891364902506964, "step": 196}, {"loss": 1.8788, "grad_norm": 3.4173591136932373, "learning_rate": 1.911649707208605e-05, "epoch": 1.0947075208913648, "step": 197}, {"loss": 2.0881, "grad_norm": 3.4639406204223633, "learning_rate": 1.9103349550028218e-05, "epoch": 1.1002785515320335, "step": 198}, {"loss": 1.9197, "grad_norm": 3.6082725524902344, "learning_rate": 1.9090202027970388e-05, "epoch": 1.105849582172702, "step": 199}, {"loss": 1.4541, "grad_norm": 2.834181070327759, "learning_rate": 1.9077054505912557e-05, "epoch": 1.1114206128133706, "step": 200}, {"eval_loss": 2.3124067783355713, "eval_runtime": 35.9422, "eval_samples_per_second": 39.953, "eval_steps_per_second": 2.003, "epoch": 1.1114206128133706, "step": 200}, {"loss": 1.8766, "grad_norm": 2.44728422164917, "learning_rate": 1.9063906983854727e-05, "epoch": 1.116991643454039, "step": 201}, {"loss": 1.8877, "grad_norm": 3.1577866077423096, "learning_rate": 1.9050759461796896e-05, "epoch": 1.1225626740947074, "step": 202}, {"loss": 1.6045, "grad_norm": 3.5458521842956543, "learning_rate": 1.9037611939739066e-05, "epoch": 1.128133704735376, 
"step": 203}, {"loss": 1.705, "grad_norm": 2.496349811553955, "learning_rate": 1.9024464417681232e-05, "epoch": 1.1337047353760445, "step": 204}, {"loss": 1.6478, "grad_norm": 3.2897088527679443, "learning_rate": 1.9011316895623405e-05, "epoch": 1.1392757660167132, "step": 205}, {"loss": 1.6703, "grad_norm": 3.1694509983062744, "learning_rate": 1.8998169373565574e-05, "epoch": 1.1448467966573816, "step": 206}, {"loss": 2.0232, "grad_norm": 2.8644907474517822, "learning_rate": 1.8985021851507744e-05, "epoch": 1.15041782729805, "step": 207}, {"loss": 1.581, "grad_norm": 2.930053472518921, "learning_rate": 1.897187432944991e-05, "epoch": 1.1559888579387188, "step": 208}, {"loss": 1.6617, "grad_norm": 2.9067940711975098, "learning_rate": 1.895872680739208e-05, "epoch": 1.1615598885793872, "step": 209}, {"loss": 2.173, "grad_norm": 3.746903419494629, "learning_rate": 1.894557928533425e-05, "epoch": 1.1671309192200556, "step": 210}, {"loss": 1.5917, "grad_norm": 4.83465576171875, "learning_rate": 1.893243176327642e-05, "epoch": 1.1727019498607243, "step": 211}, {"loss": 1.531, "grad_norm": 3.0352439880371094, "learning_rate": 1.8919284241218588e-05, "epoch": 1.1782729805013927, "step": 212}, {"loss": 2.193, "grad_norm": 2.738152027130127, "learning_rate": 1.8906136719160758e-05, "epoch": 1.1838440111420612, "step": 213}, {"loss": 1.9884, "grad_norm": 3.005979061126709, "learning_rate": 1.8892989197102927e-05, "epoch": 1.1894150417827298, "step": 214}, {"loss": 1.8659, "grad_norm": 3.930433750152588, "learning_rate": 1.8879841675045093e-05, "epoch": 1.1949860724233983, "step": 215}, {"loss": 2.191, "grad_norm": 3.3943190574645996, "learning_rate": 1.8866694152987263e-05, "epoch": 1.200557103064067, "step": 216}, {"loss": 2.1538, "grad_norm": 3.4692654609680176, "learning_rate": 1.8853546630929436e-05, "epoch": 1.2061281337047354, "step": 217}, {"loss": 1.9939, "grad_norm": 2.889341354370117, "learning_rate": 1.8840399108871602e-05, "epoch": 1.2116991643454038, "step": 
218}, {"loss": 2.1667, "grad_norm": 3.123650550842285, "learning_rate": 1.882725158681377e-05, "epoch": 1.2172701949860725, "step": 219}, {"loss": 1.9743, "grad_norm": 2.6485071182250977, "learning_rate": 1.881410406475594e-05, "epoch": 1.222841225626741, "step": 220}, {"loss": 1.5679, "grad_norm": 3.791811227798462, "learning_rate": 1.880095654269811e-05, "epoch": 1.2284122562674096, "step": 221}, {"loss": 2.1841, "grad_norm": 3.286864757537842, "learning_rate": 1.878780902064028e-05, "epoch": 1.233983286908078, "step": 222}, {"loss": 2.1165, "grad_norm": 2.930072784423828, "learning_rate": 1.877466149858245e-05, "epoch": 1.2395543175487465, "step": 223}, {"loss": 1.7816, "grad_norm": 2.936857223510742, "learning_rate": 1.876151397652462e-05, "epoch": 1.2451253481894151, "step": 224}, {"loss": 1.702, "grad_norm": 2.3516695499420166, "learning_rate": 1.8748366454466785e-05, "epoch": 1.2506963788300836, "step": 225}, {"loss": 1.7222, "grad_norm": 3.2817559242248535, "learning_rate": 1.8735218932408955e-05, "epoch": 1.2562674094707522, "step": 226}, {"loss": 1.7151, "grad_norm": 2.987518548965454, "learning_rate": 1.8722071410351124e-05, "epoch": 1.2618384401114207, "step": 227}, {"loss": 2.0545, "grad_norm": 3.132258415222168, "learning_rate": 1.8708923888293294e-05, "epoch": 1.267409470752089, "step": 228}, {"loss": 1.4732, "grad_norm": 3.2233877182006836, "learning_rate": 1.8695776366235463e-05, "epoch": 1.2729805013927575, "step": 229}, {"loss": 1.7168, "grad_norm": 3.2920405864715576, "learning_rate": 1.8682628844177633e-05, "epoch": 1.2785515320334262, "step": 230}, {"loss": 1.497, "grad_norm": 2.536219596862793, "learning_rate": 1.8669481322119802e-05, "epoch": 1.2841225626740946, "step": 231}, {"loss": 1.8177, "grad_norm": 4.246109485626221, "learning_rate": 1.865633380006197e-05, "epoch": 1.2896935933147633, "step": 232}, {"loss": 1.9449, "grad_norm": 2.6518428325653076, "learning_rate": 1.864318627800414e-05, "epoch": 1.2952646239554317, "step": 233}, 
{"loss": 1.4673, "grad_norm": 3.7276058197021484, "learning_rate": 1.863003875594631e-05, "epoch": 1.3008356545961002, "step": 234}, {"loss": 1.865, "grad_norm": 3.2901997566223145, "learning_rate": 1.8616891233888477e-05, "epoch": 1.3064066852367688, "step": 235}, {"loss": 1.7449, "grad_norm": 2.6417624950408936, "learning_rate": 1.8603743711830646e-05, "epoch": 1.3119777158774373, "step": 236}, {"loss": 2.0348, "grad_norm": 3.81978702545166, "learning_rate": 1.8590596189772816e-05, "epoch": 1.317548746518106, "step": 237}, {"loss": 1.8765, "grad_norm": 2.615661382675171, "learning_rate": 1.8577448667714985e-05, "epoch": 1.3231197771587744, "step": 238}, {"loss": 1.7559, "grad_norm": 3.2889416217803955, "learning_rate": 1.8564301145657155e-05, "epoch": 1.3286908077994428, "step": 239}, {"loss": 1.9031, "grad_norm": 4.006824970245361, "learning_rate": 1.8551153623599324e-05, "epoch": 1.3342618384401115, "step": 240}, {"loss": 1.6102, "grad_norm": 3.3491599559783936, "learning_rate": 1.8538006101541494e-05, "epoch": 1.33983286908078, "step": 241}, {"loss": 2.0716, "grad_norm": 3.2669260501861572, "learning_rate": 1.852485857948366e-05, "epoch": 1.3454038997214486, "step": 242}, {"loss": 2.0298, "grad_norm": 4.218564510345459, "learning_rate": 1.851171105742583e-05, "epoch": 1.350974930362117, "step": 243}, {"loss": 1.9911, "grad_norm": 3.5515315532684326, "learning_rate": 1.8498563535368003e-05, "epoch": 1.3565459610027855, "step": 244}, {"loss": 1.3477, "grad_norm": 4.0060343742370605, "learning_rate": 1.848541601331017e-05, "epoch": 1.362116991643454, "step": 245}, {"loss": 1.4686, "grad_norm": 3.574927568435669, "learning_rate": 1.8472268491252338e-05, "epoch": 1.3676880222841226, "step": 246}, {"loss": 1.617, "grad_norm": 3.4316840171813965, "learning_rate": 1.8459120969194508e-05, "epoch": 1.3732590529247912, "step": 247}, {"loss": 1.6593, "grad_norm": 3.2629754543304443, "learning_rate": 1.8445973447136677e-05, "epoch": 1.3788300835654597, "step": 248}, 
{"loss": 1.2608, "grad_norm": 3.133815050125122, "learning_rate": 1.8432825925078847e-05, "epoch": 1.384401114206128, "step": 249}, {"loss": 1.8523, "grad_norm": 3.742141008377075, "learning_rate": 1.8419678403021016e-05, "epoch": 1.3899721448467965, "step": 250}, {"eval_loss": 2.335228443145752, "eval_runtime": 35.9668, "eval_samples_per_second": 39.926, "eval_steps_per_second": 2.002, "epoch": 1.3899721448467965, "step": 250}, {"loss": 1.7768, "grad_norm": 3.9163429737091064, "learning_rate": 1.8406530880963186e-05, "epoch": 1.3955431754874652, "step": 251}, {"loss": 1.7455, "grad_norm": 3.3456947803497314, "learning_rate": 1.8393383358905352e-05, "epoch": 1.4011142061281336, "step": 252}, {"loss": 1.7103, "grad_norm": 4.220420837402344, "learning_rate": 1.838023583684752e-05, "epoch": 1.4066852367688023, "step": 253}, {"loss": 2.0054, "grad_norm": 4.233839511871338, "learning_rate": 1.836708831478969e-05, "epoch": 1.4122562674094707, "step": 254}, {"loss": 1.7175, "grad_norm": 3.703934669494629, "learning_rate": 1.8353940792731864e-05, "epoch": 1.4178272980501392, "step": 255}, {"loss": 1.7225, "grad_norm": 4.210822105407715, "learning_rate": 1.834079327067403e-05, "epoch": 1.4233983286908078, "step": 256}, {"loss": 1.6882, "grad_norm": 3.8861896991729736, "learning_rate": 1.83276457486162e-05, "epoch": 1.4289693593314763, "step": 257}, {"loss": 2.0721, "grad_norm": 4.4140424728393555, "learning_rate": 1.831449822655837e-05, "epoch": 1.434540389972145, "step": 258}, {"loss": 1.6198, "grad_norm": 3.1098673343658447, "learning_rate": 1.830135070450054e-05, "epoch": 1.4401114206128134, "step": 259}, {"loss": 1.9632, "grad_norm": 2.9485561847686768, "learning_rate": 1.8288203182442708e-05, "epoch": 1.4456824512534818, "step": 260}, {"loss": 1.9262, "grad_norm": 3.842655658721924, "learning_rate": 1.8275055660384878e-05, "epoch": 1.4512534818941505, "step": 261}, {"loss": 2.0807, "grad_norm": 4.122529983520508, "learning_rate": 1.8261908138327047e-05, "epoch": 
1.456824512534819, "step": 262}, {"loss": 2.0099, "grad_norm": 3.6181795597076416, "learning_rate": 1.8248760616269213e-05, "epoch": 1.4623955431754876, "step": 263}, {"loss": 1.7435, "grad_norm": 3.9433975219726562, "learning_rate": 1.8235613094211383e-05, "epoch": 1.467966573816156, "step": 264}, {"loss": 1.4648, "grad_norm": 5.496665000915527, "learning_rate": 1.8222465572153552e-05, "epoch": 1.4735376044568245, "step": 265}, {"loss": 2.106, "grad_norm": 3.3920114040374756, "learning_rate": 1.8209318050095722e-05, "epoch": 1.479108635097493, "step": 266}, {"loss": 1.4486, "grad_norm": 4.195888519287109, "learning_rate": 1.819617052803789e-05, "epoch": 1.4846796657381616, "step": 267}, {"loss": 1.4996, "grad_norm": 3.5301265716552734, "learning_rate": 1.818302300598006e-05, "epoch": 1.49025069637883, "step": 268}, {"loss": 1.8247, "grad_norm": 3.3157520294189453, "learning_rate": 1.8169875483922227e-05, "epoch": 1.4958217270194987, "step": 269}, {"loss": 1.6092, "grad_norm": 4.3797383308410645, "learning_rate": 1.8156727961864397e-05, "epoch": 1.501392757660167, "step": 270}, {"loss": 1.6071, "grad_norm": 3.3917229175567627, "learning_rate": 1.814358043980657e-05, "epoch": 1.5069637883008355, "step": 271}, {"loss": 1.9553, "grad_norm": 3.171808958053589, "learning_rate": 1.813043291774874e-05, "epoch": 1.5125348189415042, "step": 272}, {"loss": 1.8105, "grad_norm": 3.1904940605163574, "learning_rate": 1.8117285395690905e-05, "epoch": 1.5181058495821727, "step": 273}, {"loss": 1.5718, "grad_norm": 3.7544777393341064, "learning_rate": 1.8104137873633075e-05, "epoch": 1.5236768802228413, "step": 274}, {"loss": 1.9999, "grad_norm": 4.143693923950195, "learning_rate": 1.8090990351575244e-05, "epoch": 1.5292479108635098, "step": 275}, {"loss": 2.0393, "grad_norm": 3.505359411239624, "learning_rate": 1.8077842829517414e-05, "epoch": 1.5348189415041782, "step": 276}, {"loss": 1.6101, "grad_norm": 4.118677139282227, "learning_rate": 1.8064695307459583e-05, "epoch": 
1.5403899721448466, "step": 277}, {"loss": 1.6718, "grad_norm": 4.947996139526367, "learning_rate": 1.8051547785401753e-05, "epoch": 1.5459610027855153, "step": 278}, {"loss": 2.2007, "grad_norm": 4.226828575134277, "learning_rate": 1.8038400263343922e-05, "epoch": 1.551532033426184, "step": 279}, {"loss": 1.7025, "grad_norm": 4.085235118865967, "learning_rate": 1.802525274128609e-05, "epoch": 1.5571030640668524, "step": 280}, {"loss": 1.7632, "grad_norm": 3.5451292991638184, "learning_rate": 1.8012105219228258e-05, "epoch": 1.5626740947075208, "step": 281}, {"loss": 1.4975, "grad_norm": 5.2698540687561035, "learning_rate": 1.799895769717043e-05, "epoch": 1.5682451253481893, "step": 282}, {"loss": 1.2189, "grad_norm": 3.662693738937378, "learning_rate": 1.7985810175112597e-05, "epoch": 1.573816155988858, "step": 283}, {"loss": 2.1889, "grad_norm": 3.9369843006134033, "learning_rate": 1.7972662653054766e-05, "epoch": 1.5793871866295266, "step": 284}, {"loss": 1.782, "grad_norm": 5.153691291809082, "learning_rate": 1.7959515130996936e-05, "epoch": 1.584958217270195, "step": 285}, {"loss": 1.7055, "grad_norm": 3.5153331756591797, "learning_rate": 1.7946367608939105e-05, "epoch": 1.5905292479108635, "step": 286}, {"loss": 2.0713, "grad_norm": 3.8740577697753906, "learning_rate": 1.7933220086881275e-05, "epoch": 1.596100278551532, "step": 287}, {"loss": 1.6159, "grad_norm": 2.977501153945923, "learning_rate": 1.7920072564823445e-05, "epoch": 1.6016713091922006, "step": 288}, {"loss": 2.0388, "grad_norm": 4.873539447784424, "learning_rate": 1.7906925042765614e-05, "epoch": 1.6072423398328692, "step": 289}, {"loss": 1.7656, "grad_norm": 3.6297993659973145, "learning_rate": 1.789377752070778e-05, "epoch": 1.6128133704735377, "step": 290}, {"loss": 1.9818, "grad_norm": 2.868178367614746, "learning_rate": 1.788062999864995e-05, "epoch": 1.6183844011142061, "step": 291}, {"loss": 1.6421, "grad_norm": 4.532885551452637, "learning_rate": 1.786748247659212e-05, "epoch": 
1.6239554317548746, "step": 292}, {"loss": 1.653, "grad_norm": 5.63344669342041, "learning_rate": 1.785433495453429e-05, "epoch": 1.6295264623955432, "step": 293}, {"loss": 1.8727, "grad_norm": 4.235146999359131, "learning_rate": 1.7841187432476458e-05, "epoch": 1.6350974930362117, "step": 294}, {"loss": 1.3509, "grad_norm": 4.512764930725098, "learning_rate": 1.7828039910418628e-05, "epoch": 1.6406685236768803, "step": 295}, {"loss": 1.7836, "grad_norm": 3.72898268699646, "learning_rate": 1.7814892388360797e-05, "epoch": 1.6462395543175488, "step": 296}, {"loss": 1.6315, "grad_norm": 3.1936659812927246, "learning_rate": 1.7801744866302963e-05, "epoch": 1.6518105849582172, "step": 297}, {"loss": 1.9805, "grad_norm": 3.1188321113586426, "learning_rate": 1.7788597344245136e-05, "epoch": 1.6573816155988856, "step": 298}, {"loss": 1.8716, "grad_norm": 4.88875150680542, "learning_rate": 1.7775449822187306e-05, "epoch": 1.6629526462395543, "step": 299}, {"loss": 1.4669, "grad_norm": 4.494915962219238, "learning_rate": 1.7762302300129472e-05, "epoch": 1.668523676880223, "step": 300}, {"eval_loss": 2.3116097450256348, "eval_runtime": 35.9294, "eval_samples_per_second": 39.967, "eval_steps_per_second": 2.004, "epoch": 1.668523676880223, "step": 300}, {"loss": 1.3418, "grad_norm": 4.365106582641602, "learning_rate": 1.774915477807164e-05, "epoch": 1.6740947075208914, "step": 301}, {"loss": 1.4561, "grad_norm": 4.683363914489746, "learning_rate": 1.773600725601381e-05, "epoch": 1.6796657381615598, "step": 302}, {"loss": 1.8321, "grad_norm": 4.195693492889404, "learning_rate": 1.772285973395598e-05, "epoch": 1.6852367688022283, "step": 303}, {"loss": 1.8932, "grad_norm": 4.681265830993652, "learning_rate": 1.770971221189815e-05, "epoch": 1.690807799442897, "step": 304}, {"loss": 2.0071, "grad_norm": 5.034351348876953, "learning_rate": 1.769656468984032e-05, "epoch": 1.6963788300835656, "step": 305}, {"loss": 1.9824, "grad_norm": 3.9581334590911865, "learning_rate": 
1.768341716778249e-05, "epoch": 1.701949860724234, "step": 306}, {"loss": 2.2225, "grad_norm": 3.9467825889587402, "learning_rate": 1.7670269645724655e-05, "epoch": 1.7075208913649025, "step": 307}, {"loss": 1.671, "grad_norm": 3.7253997325897217, "learning_rate": 1.7657122123666825e-05, "epoch": 1.713091922005571, "step": 308}, {"loss": 1.7876, "grad_norm": 4.8212480545043945, "learning_rate": 1.7643974601608998e-05, "epoch": 1.7186629526462396, "step": 309}, {"loss": 1.1102, "grad_norm": 4.235992431640625, "learning_rate": 1.7630827079551164e-05, "epoch": 1.724233983286908, "step": 310}, {"loss": 1.7577, "grad_norm": 3.5870513916015625, "learning_rate": 1.7617679557493333e-05, "epoch": 1.7298050139275767, "step": 311}, {"loss": 1.3948, "grad_norm": 4.27365779876709, "learning_rate": 1.7604532035435503e-05, "epoch": 1.7353760445682451, "step": 312}, {"loss": 1.5507, "grad_norm": 4.927708625793457, "learning_rate": 1.7591384513377672e-05, "epoch": 1.7409470752089136, "step": 313}, {"loss": 1.5299, "grad_norm": 4.702437877655029, "learning_rate": 1.7578236991319842e-05, "epoch": 1.7465181058495822, "step": 314}, {"loss": 1.6187, "grad_norm": 4.205385684967041, "learning_rate": 1.756508946926201e-05, "epoch": 1.7520891364902507, "step": 315}, {"loss": 1.6467, "grad_norm": 3.724274158477783, "learning_rate": 1.755194194720418e-05, "epoch": 1.7576601671309193, "step": 316}, {"loss": 1.48, "grad_norm": 5.0788187980651855, "learning_rate": 1.7538794425146347e-05, "epoch": 1.7632311977715878, "step": 317}, {"loss": 1.2413, "grad_norm": 4.211026191711426, "learning_rate": 1.7525646903088517e-05, "epoch": 1.7688022284122562, "step": 318}, {"loss": 1.2792, "grad_norm": 4.383068561553955, "learning_rate": 1.7512499381030686e-05, "epoch": 1.7743732590529246, "step": 319}, {"loss": 2.0635, "grad_norm": 5.2455668449401855, "learning_rate": 1.7499351858972856e-05, "epoch": 1.7799442896935933, "step": 320}, {"loss": 1.9011, "grad_norm": 4.73854398727417, "learning_rate": 
1.7486204336915025e-05, "epoch": 1.785515320334262, "step": 321}, {"loss": 1.9017, "grad_norm": 5.136256217956543, "learning_rate": 1.7473056814857195e-05, "epoch": 1.7910863509749304, "step": 322}, {"loss": 1.7304, "grad_norm": 5.707761764526367, "learning_rate": 1.7459909292799364e-05, "epoch": 1.7966573816155988, "step": 323}, {"loss": 1.9703, "grad_norm": 4.81571102142334, "learning_rate": 1.744676177074153e-05, "epoch": 1.8022284122562673, "step": 324}, {"loss": 1.6825, "grad_norm": 6.157602310180664, "learning_rate": 1.7433614248683703e-05, "epoch": 1.807799442896936, "step": 325}, {"loss": 1.7945, "grad_norm": 5.200462818145752, "learning_rate": 1.7420466726625873e-05, "epoch": 1.8133704735376046, "step": 326}, {"loss": 1.7701, "grad_norm": 5.342528820037842, "learning_rate": 1.7407319204568042e-05, "epoch": 1.818941504178273, "step": 327}, {"loss": 1.8, "grad_norm": 4.419646739959717, "learning_rate": 1.739417168251021e-05, "epoch": 1.8245125348189415, "step": 328}, {"loss": 1.3064, "grad_norm": 5.106484889984131, "learning_rate": 1.7381024160452378e-05, "epoch": 1.83008356545961, "step": 329}, {"loss": 2.0357, "grad_norm": 4.221576690673828, "learning_rate": 1.7367876638394547e-05, "epoch": 1.8356545961002786, "step": 330}, {"loss": 1.6015, "grad_norm": 6.323553562164307, "learning_rate": 1.7354729116336717e-05, "epoch": 1.841225626740947, "step": 331}, {"loss": 1.5858, "grad_norm": 4.978970527648926, "learning_rate": 1.7341581594278887e-05, "epoch": 1.8467966573816157, "step": 332}, {"loss": 1.9489, "grad_norm": 3.1882030963897705, "learning_rate": 1.7328434072221056e-05, "epoch": 1.8523676880222841, "step": 333}, {"loss": 2.1722, "grad_norm": 4.047868251800537, "learning_rate": 1.7315286550163222e-05, "epoch": 1.8579387186629526, "step": 334}, {"loss": 1.5027, "grad_norm": 4.2307448387146, "learning_rate": 1.730213902810539e-05, "epoch": 1.863509749303621, "step": 335}, {"loss": 1.481, "grad_norm": 6.048774242401123, "learning_rate": 
1.7288991506047565e-05, "epoch": 1.8690807799442897, "step": 336}, {"loss": 1.8746, "grad_norm": 5.389241695404053, "learning_rate": 1.7275843983989734e-05, "epoch": 1.8746518105849583, "step": 337}, {"loss": 2.0807, "grad_norm": 4.036198139190674, "learning_rate": 1.72626964619319e-05, "epoch": 1.8802228412256268, "step": 338}, {"loss": 1.7448, "grad_norm": 5.005743503570557, "learning_rate": 1.724954893987407e-05, "epoch": 1.8857938718662952, "step": 339}, {"loss": 1.9092, "grad_norm": 4.462837219238281, "learning_rate": 1.723640141781624e-05, "epoch": 1.8913649025069637, "step": 340}, {"loss": 1.7032, "grad_norm": 4.945067405700684, "learning_rate": 1.722325389575841e-05, "epoch": 1.8969359331476323, "step": 341}, {"loss": 1.9141, "grad_norm": 3.7232062816619873, "learning_rate": 1.721010637370058e-05, "epoch": 1.902506963788301, "step": 342}, {"loss": 1.8258, "grad_norm": 3.8830628395080566, "learning_rate": 1.7196958851642748e-05, "epoch": 1.9080779944289694, "step": 343}, {"loss": 1.7998, "grad_norm": 4.693456649780273, "learning_rate": 1.7183811329584917e-05, "epoch": 1.9136490250696379, "step": 344}, {"loss": 2.0583, "grad_norm": 4.737421989440918, "learning_rate": 1.7170663807527083e-05, "epoch": 1.9192200557103063, "step": 345}, {"loss": 1.494, "grad_norm": 2.78582501411438, "learning_rate": 1.7157516285469253e-05, "epoch": 1.924791086350975, "step": 346}, {"loss": 1.7167, "grad_norm": 4.305075168609619, "learning_rate": 1.7144368763411423e-05, "epoch": 1.9303621169916436, "step": 347}, {"loss": 1.7753, "grad_norm": 3.9957072734832764, "learning_rate": 1.7131221241353592e-05, "epoch": 1.935933147632312, "step": 348}, {"loss": 1.8852, "grad_norm": 4.9537434577941895, "learning_rate": 1.711807371929576e-05, "epoch": 1.9415041782729805, "step": 349}, {"loss": 1.8729, "grad_norm": 3.9404208660125732, "learning_rate": 1.710492619723793e-05, "epoch": 1.947075208913649, "step": 350}, {"eval_loss": 2.3213632106781006, "eval_runtime": 35.9387, 
"eval_samples_per_second": 39.957, "eval_steps_per_second": 2.003, "epoch": 1.947075208913649, "step": 350}, {"loss": 2.1419, "grad_norm": 3.202141046524048, "learning_rate": 1.70917786751801e-05, "epoch": 1.9526462395543176, "step": 351}, {"loss": 1.83, "grad_norm": 4.432948112487793, "learning_rate": 1.707863115312227e-05, "epoch": 1.958217270194986, "step": 352}, {"loss": 2.3556, "grad_norm": 5.213648796081543, "learning_rate": 1.706548363106444e-05, "epoch": 1.9637883008356547, "step": 353}, {"loss": 2.1396, "grad_norm": 4.155479431152344, "learning_rate": 1.705233610900661e-05, "epoch": 1.9693593314763231, "step": 354}, {"loss": 1.4222, "grad_norm": 5.146358013153076, "learning_rate": 1.7039188586948775e-05, "epoch": 1.9749303621169916, "step": 355}, {"loss": 1.9362, "grad_norm": 3.264761447906494, "learning_rate": 1.7026041064890945e-05, "epoch": 1.98050139275766, "step": 356}, {"loss": 1.9471, "grad_norm": 3.308243989944458, "learning_rate": 1.7012893542833114e-05, "epoch": 1.9860724233983287, "step": 357}, {"loss": 2.0193, "grad_norm": 4.1630859375, "learning_rate": 1.6999746020775284e-05, "epoch": 1.9916434540389973, "step": 358}, {"loss": 2.1048, "grad_norm": 4.196152210235596, "learning_rate": 1.6986598498717453e-05, "epoch": 1.9972144846796658, "step": 359}, {"loss": 2.0755, "grad_norm": 4.194087028503418, "learning_rate": 1.6973450976659623e-05, "epoch": 2.0, "step": 360}, {"loss": 1.4388, "grad_norm": 4.208454132080078, "learning_rate": 1.6960303454601792e-05, "epoch": 2.0055710306406684, "step": 361}, {"loss": 1.7819, "grad_norm": 3.549447774887085, "learning_rate": 1.694715593254396e-05, "epoch": 2.011142061281337, "step": 362}, {"loss": 1.5135, "grad_norm": 3.6767420768737793, "learning_rate": 1.693400841048613e-05, "epoch": 2.0167130919220058, "step": 363}, {"loss": 1.7713, "grad_norm": 3.816209554672241, "learning_rate": 1.69208608884283e-05, "epoch": 2.022284122562674, "step": 364}, {"loss": 1.6624, "grad_norm": 3.2220561504364014, 
"learning_rate": 1.6907713366370467e-05, "epoch": 2.0278551532033426, "step": 365}, {"loss": 1.9059, "grad_norm": 3.4210987091064453, "learning_rate": 1.6894565844312637e-05, "epoch": 2.033426183844011, "step": 366}, {"loss": 1.155, "grad_norm": 4.348776817321777, "learning_rate": 1.6881418322254806e-05, "epoch": 2.0389972144846795, "step": 367}, {"loss": 1.4513, "grad_norm": 4.143118858337402, "learning_rate": 1.6868270800196976e-05, "epoch": 2.0445682451253484, "step": 368}, {"loss": 1.8148, "grad_norm": 4.118925094604492, "learning_rate": 1.6855123278139145e-05, "epoch": 2.050139275766017, "step": 369}, {"loss": 1.6325, "grad_norm": 4.060324668884277, "learning_rate": 1.6841975756081315e-05, "epoch": 2.0557103064066853, "step": 370}, {"loss": 1.694, "grad_norm": 4.604481220245361, "learning_rate": 1.6828828234023484e-05, "epoch": 2.0612813370473537, "step": 371}, {"loss": 1.4905, "grad_norm": 5.273688316345215, "learning_rate": 1.681568071196565e-05, "epoch": 2.066852367688022, "step": 372}, {"loss": 1.1557, "grad_norm": 6.0254387855529785, "learning_rate": 1.680253318990782e-05, "epoch": 2.0724233983286906, "step": 373}, {"loss": 0.999, "grad_norm": 5.017882823944092, "learning_rate": 1.678938566784999e-05, "epoch": 2.0779944289693595, "step": 374}, {"loss": 1.4159, "grad_norm": 6.874935626983643, "learning_rate": 1.6776238145792162e-05, "epoch": 2.083565459610028, "step": 375}, {"loss": 0.9789, "grad_norm": 6.245709419250488, "learning_rate": 1.676309062373433e-05, "epoch": 2.0891364902506964, "step": 376}, {"loss": 1.3929, "grad_norm": 6.976832866668701, "learning_rate": 1.6749943101676498e-05, "epoch": 2.094707520891365, "step": 377}, {"loss": 1.5721, "grad_norm": 7.426636695861816, "learning_rate": 1.6736795579618668e-05, "epoch": 2.1002785515320332, "step": 378}, {"loss": 1.4603, "grad_norm": 8.876333236694336, "learning_rate": 1.6723648057560837e-05, "epoch": 2.105849582172702, "step": 379}, {"loss": 1.2115, "grad_norm": 5.889682769775391, 
"learning_rate": 1.6710500535503007e-05, "epoch": 2.1114206128133706, "step": 380}, {"loss": 1.1689, "grad_norm": 6.435322284698486, "learning_rate": 1.6697353013445176e-05, "epoch": 2.116991643454039, "step": 381}, {"loss": 1.1904, "grad_norm": 6.061446666717529, "learning_rate": 1.6684205491387342e-05, "epoch": 2.1225626740947074, "step": 382}, {"loss": 1.3799, "grad_norm": 7.56770658493042, "learning_rate": 1.6671057969329512e-05, "epoch": 2.128133704735376, "step": 383}, {"loss": 1.5787, "grad_norm": 8.942233085632324, "learning_rate": 1.665791044727168e-05, "epoch": 2.1337047353760448, "step": 384}, {"loss": 1.4084, "grad_norm": 7.448763847351074, "learning_rate": 1.664476292521385e-05, "epoch": 2.139275766016713, "step": 385}, {"loss": 1.3685, "grad_norm": 5.792154312133789, "learning_rate": 1.663161540315602e-05, "epoch": 2.1448467966573816, "step": 386}, {"loss": 1.5465, "grad_norm": 7.226157188415527, "learning_rate": 1.661846788109819e-05, "epoch": 2.15041782729805, "step": 387}, {"loss": 1.1914, "grad_norm": 5.6042022705078125, "learning_rate": 1.660532035904036e-05, "epoch": 2.1559888579387185, "step": 388}, {"loss": 1.6443, "grad_norm": 5.619427680969238, "learning_rate": 1.6592172836982525e-05, "epoch": 2.1615598885793874, "step": 389}, {"loss": 1.5371, "grad_norm": 4.770148754119873, "learning_rate": 1.65790253149247e-05, "epoch": 2.167130919220056, "step": 390}, {"loss": 1.5124, "grad_norm": 7.61703634262085, "learning_rate": 1.6565877792866868e-05, "epoch": 2.1727019498607243, "step": 391}, {"loss": 1.6248, "grad_norm": 4.498234272003174, "learning_rate": 1.6552730270809037e-05, "epoch": 2.1782729805013927, "step": 392}, {"loss": 1.4621, "grad_norm": 4.0563063621521, "learning_rate": 1.6539582748751204e-05, "epoch": 2.183844011142061, "step": 393}, {"loss": 1.4315, "grad_norm": 6.069952964782715, "learning_rate": 1.6526435226693373e-05, "epoch": 2.1894150417827296, "step": 394}, {"loss": 1.4308, "grad_norm": 6.728673458099365, "learning_rate": 
1.6513287704635543e-05, "epoch": 2.1949860724233985, "step": 395}, {"loss": 1.2975, "grad_norm": 14.551620483398438, "learning_rate": 1.6500140182577712e-05, "epoch": 2.200557103064067, "step": 396}, {"loss": 1.4624, "grad_norm": 6.782831192016602, "learning_rate": 1.648699266051988e-05, "epoch": 2.2061281337047354, "step": 397}, {"loss": 1.5891, "grad_norm": 6.513261795043945, "learning_rate": 1.647384513846205e-05, "epoch": 2.211699164345404, "step": 398}, {"loss": 1.3152, "grad_norm": 6.3476433753967285, "learning_rate": 1.646069761640422e-05, "epoch": 2.2172701949860723, "step": 399}, {"loss": 1.3129, "grad_norm": 4.936390399932861, "learning_rate": 1.6447550094346387e-05, "epoch": 2.222841225626741, "step": 400}, {"eval_loss": 2.531832218170166, "eval_runtime": 35.95, "eval_samples_per_second": 39.944, "eval_steps_per_second": 2.003, "epoch": 2.222841225626741, "step": 400}, {"loss": 1.2283, "grad_norm": 8.302631378173828, "learning_rate": 1.6434402572288556e-05, "epoch": 2.2284122562674096, "step": 401}, {"loss": 1.1884, "grad_norm": 5.8890886306762695, "learning_rate": 1.642125505023073e-05, "epoch": 2.233983286908078, "step": 402}, {"loss": 1.3971, "grad_norm": 6.417287349700928, "learning_rate": 1.6408107528172895e-05, "epoch": 2.2395543175487465, "step": 403}, {"loss": 1.5501, "grad_norm": 6.351545810699463, "learning_rate": 1.6394960006115065e-05, "epoch": 2.245125348189415, "step": 404}, {"loss": 1.1685, "grad_norm": 5.121798992156982, "learning_rate": 1.6381812484057234e-05, "epoch": 2.2506963788300833, "step": 405}, {"loss": 1.3617, "grad_norm": 5.293002128601074, "learning_rate": 1.6368664961999404e-05, "epoch": 2.256267409470752, "step": 406}, {"loss": 1.3164, "grad_norm": 6.6434431076049805, "learning_rate": 1.6355517439941573e-05, "epoch": 2.2618384401114207, "step": 407}, {"loss": 1.4339, "grad_norm": 6.383541584014893, "learning_rate": 1.6342369917883743e-05, "epoch": 2.267409470752089, "step": 408}, {"loss": 1.3699, "grad_norm": 
5.989224433898926, "learning_rate": 1.6329222395825913e-05, "epoch": 2.2729805013927575, "step": 409}, {"loss": 1.4938, "grad_norm": 6.49315881729126, "learning_rate": 1.631607487376808e-05, "epoch": 2.2785515320334264, "step": 410}, {"loss": 1.0902, "grad_norm": 4.942923069000244, "learning_rate": 1.6302927351710248e-05, "epoch": 2.284122562674095, "step": 411}, {"loss": 1.0282, "grad_norm": 5.219899654388428, "learning_rate": 1.6289779829652418e-05, "epoch": 2.2896935933147633, "step": 412}, {"loss": 1.3465, "grad_norm": 5.91557502746582, "learning_rate": 1.6276632307594587e-05, "epoch": 2.2952646239554317, "step": 413}, {"loss": 1.4312, "grad_norm": 7.332894325256348, "learning_rate": 1.6263484785536757e-05, "epoch": 2.3008356545961, "step": 414}, {"loss": 1.1921, "grad_norm": 6.784351825714111, "learning_rate": 1.6250337263478926e-05, "epoch": 2.3064066852367686, "step": 415}, {"loss": 1.3644, "grad_norm": 6.222668647766113, "learning_rate": 1.6237189741421096e-05, "epoch": 2.3119777158774375, "step": 416}, {"loss": 1.3318, "grad_norm": 6.7379841804504395, "learning_rate": 1.6224042219363265e-05, "epoch": 2.317548746518106, "step": 417}, {"loss": 1.3955, "grad_norm": 7.218482494354248, "learning_rate": 1.6210894697305435e-05, "epoch": 2.3231197771587744, "step": 418}, {"loss": 1.5949, "grad_norm": 6.676080226898193, "learning_rate": 1.6197747175247604e-05, "epoch": 2.328690807799443, "step": 419}, {"loss": 1.2428, "grad_norm": 6.974861145019531, "learning_rate": 1.618459965318977e-05, "epoch": 2.3342618384401113, "step": 420}, {"loss": 1.2438, "grad_norm": 7.018064975738525, "learning_rate": 1.617145213113194e-05, "epoch": 2.33983286908078, "step": 421}, {"loss": 1.4979, "grad_norm": 6.781156063079834, "learning_rate": 1.615830460907411e-05, "epoch": 2.3454038997214486, "step": 422}, {"loss": 1.4914, "grad_norm": 6.291943550109863, "learning_rate": 1.614515708701628e-05, "epoch": 2.350974930362117, "step": 423}, {"loss": 1.1937, "grad_norm": 6.769220352172852, 
"learning_rate": 1.613200956495845e-05, "epoch": 2.3565459610027855, "step": 424}, {"loss": 1.4428, "grad_norm": 7.461434841156006, "learning_rate": 1.6118862042900618e-05, "epoch": 2.362116991643454, "step": 425}, {"loss": 1.0756, "grad_norm": 5.971315860748291, "learning_rate": 1.6105714520842788e-05, "epoch": 2.3676880222841223, "step": 426}, {"loss": 1.1709, "grad_norm": 6.632075786590576, "learning_rate": 1.6092566998784954e-05, "epoch": 2.3732590529247912, "step": 427}, {"loss": 1.2953, "grad_norm": 6.03197717666626, "learning_rate": 1.6079419476727123e-05, "epoch": 2.3788300835654597, "step": 428}, {"loss": 1.1653, "grad_norm": 7.393289089202881, "learning_rate": 1.6066271954669296e-05, "epoch": 2.384401114206128, "step": 429}, {"loss": 1.542, "grad_norm": 9.518671989440918, "learning_rate": 1.6053124432611462e-05, "epoch": 2.3899721448467965, "step": 430}, {"loss": 1.0957, "grad_norm": 7.086347579956055, "learning_rate": 1.6039976910553632e-05, "epoch": 2.3955431754874654, "step": 431}, {"loss": 1.1408, "grad_norm": 5.21544885635376, "learning_rate": 1.60268293884958e-05, "epoch": 2.401114206128134, "step": 432}, {"loss": 1.1708, "grad_norm": 7.537359237670898, "learning_rate": 1.601368186643797e-05, "epoch": 2.4066852367688023, "step": 433}, {"loss": 1.101, "grad_norm": 4.926475524902344, "learning_rate": 1.600053434438014e-05, "epoch": 2.4122562674094707, "step": 434}, {"loss": 1.3898, "grad_norm": 5.6016740798950195, "learning_rate": 1.598738682232231e-05, "epoch": 2.417827298050139, "step": 435}, {"loss": 1.4717, "grad_norm": 7.16878604888916, "learning_rate": 1.597423930026448e-05, "epoch": 2.4233983286908076, "step": 436}, {"loss": 1.6173, "grad_norm": 6.310802459716797, "learning_rate": 1.5961091778206646e-05, "epoch": 2.4289693593314765, "step": 437}, {"loss": 1.6172, "grad_norm": 8.035069465637207, "learning_rate": 1.5947944256148815e-05, "epoch": 2.434540389972145, "step": 438}, {"loss": 1.4479, "grad_norm": 7.806406497955322, "learning_rate": 
1.5934796734090985e-05, "epoch": 2.4401114206128134, "step": 439}, {"loss": 1.3459, "grad_norm": 5.882315635681152, "learning_rate": 1.5921649212033154e-05, "epoch": 2.445682451253482, "step": 440}, {"loss": 1.2195, "grad_norm": 5.817505359649658, "learning_rate": 1.5908501689975324e-05, "epoch": 2.4512534818941503, "step": 441}, {"loss": 1.3043, "grad_norm": 7.497400283813477, "learning_rate": 1.5895354167917493e-05, "epoch": 2.456824512534819, "step": 442}, {"loss": 1.42, "grad_norm": 5.955392837524414, "learning_rate": 1.5882206645859663e-05, "epoch": 2.4623955431754876, "step": 443}, {"loss": 1.4764, "grad_norm": 8.848158836364746, "learning_rate": 1.5869059123801832e-05, "epoch": 2.467966573816156, "step": 444}, {"loss": 1.4508, "grad_norm": 6.384143829345703, "learning_rate": 1.5855911601744002e-05, "epoch": 2.4735376044568245, "step": 445}, {"loss": 1.3499, "grad_norm": 7.251498699188232, "learning_rate": 1.584276407968617e-05, "epoch": 2.479108635097493, "step": 446}, {"loss": 1.297, "grad_norm": 8.700945854187012, "learning_rate": 1.5829616557628337e-05, "epoch": 2.4846796657381613, "step": 447}, {"loss": 1.1607, "grad_norm": 8.17098617553711, "learning_rate": 1.5816469035570507e-05, "epoch": 2.4902506963788302, "step": 448}, {"loss": 1.5328, "grad_norm": 6.918285846710205, "learning_rate": 1.5803321513512676e-05, "epoch": 2.4958217270194987, "step": 449}, {"loss": 1.6258, "grad_norm": 6.7390851974487305, "learning_rate": 1.5790173991454846e-05, "epoch": 2.501392757660167, "step": 450}, {"eval_loss": 2.571645498275757, "eval_runtime": 35.9556, "eval_samples_per_second": 39.938, "eval_steps_per_second": 2.002, "epoch": 2.501392757660167, "step": 450}, {"loss": 1.5923, "grad_norm": 6.522182941436768, "learning_rate": 1.5777026469397015e-05, "epoch": 2.5069637883008355, "step": 451}, {"loss": 1.2816, "grad_norm": 5.984560489654541, "learning_rate": 1.5763878947339185e-05, "epoch": 2.5125348189415044, "step": 452}, {"loss": 1.2029, "grad_norm": 
8.060498237609863, "learning_rate": 1.5750731425281354e-05, "epoch": 2.518105849582173, "step": 453}, {"loss": 1.2117, "grad_norm": 6.93899393081665, "learning_rate": 1.573758390322352e-05, "epoch": 2.5236768802228413, "step": 454}, {"loss": 1.4347, "grad_norm": 6.21560525894165, "learning_rate": 1.572443638116569e-05, "epoch": 2.5292479108635098, "step": 455}, {"loss": 1.3394, "grad_norm": 7.837366580963135, "learning_rate": 1.5711288859107863e-05, "epoch": 2.534818941504178, "step": 456}, {"loss": 1.4262, "grad_norm": 7.609643936157227, "learning_rate": 1.5698141337050033e-05, "epoch": 2.5403899721448466, "step": 457}, {"loss": 1.3738, "grad_norm": 6.487556457519531, "learning_rate": 1.56849938149922e-05, "epoch": 2.545961002785515, "step": 458}, {"loss": 1.4021, "grad_norm": 6.344869136810303, "learning_rate": 1.5671846292934368e-05, "epoch": 2.551532033426184, "step": 459}, {"loss": 1.3887, "grad_norm": 6.960203170776367, "learning_rate": 1.5658698770876538e-05, "epoch": 2.5571030640668524, "step": 460}, {"loss": 1.2997, "grad_norm": 11.57795524597168, "learning_rate": 1.5645551248818707e-05, "epoch": 2.562674094707521, "step": 461}, {"loss": 1.5967, "grad_norm": 6.889705181121826, "learning_rate": 1.5632403726760877e-05, "epoch": 2.5682451253481893, "step": 462}, {"loss": 1.2643, "grad_norm": 8.502350807189941, "learning_rate": 1.5619256204703046e-05, "epoch": 2.573816155988858, "step": 463}, {"loss": 1.3686, "grad_norm": 8.704366683959961, "learning_rate": 1.5606108682645216e-05, "epoch": 2.5793871866295266, "step": 464}, {"loss": 0.9961, "grad_norm": 8.154948234558105, "learning_rate": 1.5592961160587382e-05, "epoch": 2.584958217270195, "step": 465}, {"loss": 1.0603, "grad_norm": 5.729700088500977, "learning_rate": 1.557981363852955e-05, "epoch": 2.5905292479108635, "step": 466}, {"loss": 1.6641, "grad_norm": 7.716269493103027, "learning_rate": 1.556666611647172e-05, "epoch": 2.596100278551532, "step": 467}, {"loss": 1.2886, "grad_norm": 11.220166206359863, 
"learning_rate": 1.555351859441389e-05, "epoch": 2.6016713091922004, "step": 468}, {"loss": 1.2922, "grad_norm": 7.163726329803467, "learning_rate": 1.554037107235606e-05, "epoch": 2.6072423398328692, "step": 469}, {"loss": 1.1046, "grad_norm": 7.28581428527832, "learning_rate": 1.552722355029823e-05, "epoch": 2.6128133704735377, "step": 470}, {"loss": 1.6142, "grad_norm": 9.65365219116211, "learning_rate": 1.5514076028240396e-05, "epoch": 2.618384401114206, "step": 471}, {"loss": 1.5575, "grad_norm": 6.458492279052734, "learning_rate": 1.550092850618257e-05, "epoch": 2.6239554317548746, "step": 472}, {"loss": 1.3655, "grad_norm": 7.325246810913086, "learning_rate": 1.5487780984124738e-05, "epoch": 2.6295264623955434, "step": 473}, {"loss": 1.3344, "grad_norm": 7.81355619430542, "learning_rate": 1.5474633462066908e-05, "epoch": 2.635097493036212, "step": 474}, {"loss": 1.2505, "grad_norm": 7.347303867340088, "learning_rate": 1.5461485940009074e-05, "epoch": 2.6406685236768803, "step": 475}, {"loss": 1.1988, "grad_norm": 7.306774616241455, "learning_rate": 1.5448338417951243e-05, "epoch": 2.6462395543175488, "step": 476}, {"loss": 1.4075, "grad_norm": 7.261951446533203, "learning_rate": 1.5435190895893413e-05, "epoch": 2.651810584958217, "step": 477}, {"loss": 1.3235, "grad_norm": 8.138806343078613, "learning_rate": 1.5422043373835582e-05, "epoch": 2.6573816155988856, "step": 478}, {"loss": 1.4297, "grad_norm": 7.515624046325684, "learning_rate": 1.5408895851777752e-05, "epoch": 2.662952646239554, "step": 479}, {"loss": 1.0187, "grad_norm": 7.298752307891846, "learning_rate": 1.539574832971992e-05, "epoch": 2.668523676880223, "step": 480}, {"loss": 1.1512, "grad_norm": 7.08530855178833, "learning_rate": 1.538260080766209e-05, "epoch": 2.6740947075208914, "step": 481}, {"loss": 0.9209, "grad_norm": 8.528051376342773, "learning_rate": 1.5369453285604257e-05, "epoch": 2.67966573816156, "step": 482}, {"loss": 1.6726, "grad_norm": 6.991207122802734, "learning_rate": 
1.535630576354643e-05, "epoch": 2.6852367688022283, "step": 483}, {"loss": 1.6101, "grad_norm": 6.910933971405029, "learning_rate": 1.53431582414886e-05, "epoch": 2.690807799442897, "step": 484}, {"loss": 1.0596, "grad_norm": 6.858171463012695, "learning_rate": 1.5330010719430766e-05, "epoch": 2.6963788300835656, "step": 485}, {"loss": 1.3009, "grad_norm": 7.1738409996032715, "learning_rate": 1.5316863197372935e-05, "epoch": 2.701949860724234, "step": 486}, {"loss": 1.1306, "grad_norm": 6.751303672790527, "learning_rate": 1.5303715675315105e-05, "epoch": 2.7075208913649025, "step": 487}, {"loss": 1.6064, "grad_norm": 7.458596706390381, "learning_rate": 1.5290568153257274e-05, "epoch": 2.713091922005571, "step": 488}, {"loss": 1.3423, "grad_norm": 4.847519397735596, "learning_rate": 1.5277420631199444e-05, "epoch": 2.7186629526462394, "step": 489}, {"loss": 1.0908, "grad_norm": 6.585028648376465, "learning_rate": 1.5264273109141613e-05, "epoch": 2.724233983286908, "step": 490}, {"loss": 1.6632, "grad_norm": 5.222984790802002, "learning_rate": 1.5251125587083783e-05, "epoch": 2.7298050139275767, "step": 491}, {"loss": 1.3113, "grad_norm": 6.947058200836182, "learning_rate": 1.523797806502595e-05, "epoch": 2.735376044568245, "step": 492}, {"loss": 1.0863, "grad_norm": 5.885672569274902, "learning_rate": 1.522483054296812e-05, "epoch": 2.7409470752089136, "step": 493}, {"loss": 1.1982, "grad_norm": 7.9502034187316895, "learning_rate": 1.521168302091029e-05, "epoch": 2.7465181058495824, "step": 494}, {"loss": 1.3941, "grad_norm": 5.9523773193359375, "learning_rate": 1.5198535498852457e-05, "epoch": 2.752089136490251, "step": 495}, {"loss": 1.3251, "grad_norm": 7.984345436096191, "learning_rate": 1.5185387976794627e-05, "epoch": 2.7576601671309193, "step": 496}, {"loss": 0.8109, "grad_norm": 8.467183113098145, "learning_rate": 1.5172240454736796e-05, "epoch": 2.7632311977715878, "step": 497}, {"loss": 1.1339, "grad_norm": 7.878790378570557, "learning_rate": 
1.5159092932678966e-05, "epoch": 2.768802228412256, "step": 498}, {"loss": 1.1736, "grad_norm": 5.638209819793701, "learning_rate": 1.5145945410621134e-05, "epoch": 2.7743732590529246, "step": 499}, {"loss": 1.3546, "grad_norm": 7.818211078643799, "learning_rate": 1.5132797888563303e-05, "epoch": 2.779944289693593, "step": 500}, {"eval_loss": 2.6166257858276367, "eval_runtime": 35.971, "eval_samples_per_second": 39.921, "eval_steps_per_second": 2.002, "epoch": 2.779944289693593, "step": 500}, {"loss": 1.4636, "grad_norm": 6.118830680847168, "learning_rate": 1.5119650366505473e-05, "epoch": 2.785515320334262, "step": 501}, {"loss": 1.5519, "grad_norm": 7.9165778160095215, "learning_rate": 1.510650284444764e-05, "epoch": 2.7910863509749304, "step": 502}, {"loss": 1.5206, "grad_norm": 6.975761413574219, "learning_rate": 1.5093355322389812e-05, "epoch": 2.796657381615599, "step": 503}, {"loss": 1.0665, "grad_norm": 9.277933120727539, "learning_rate": 1.5080207800331981e-05, "epoch": 2.8022284122562673, "step": 504}, {"loss": 1.2801, "grad_norm": 8.121682167053223, "learning_rate": 1.5067060278274151e-05, "epoch": 2.807799442896936, "step": 505}, {"loss": 1.565, "grad_norm": 8.76021957397461, "learning_rate": 1.5053912756216319e-05, "epoch": 2.8133704735376046, "step": 506}, {"loss": 1.3502, "grad_norm": 8.618566513061523, "learning_rate": 1.5040765234158488e-05, "epoch": 2.818941504178273, "step": 507}, {"loss": 1.5859, "grad_norm": 8.027894020080566, "learning_rate": 1.5027617712100658e-05, "epoch": 2.8245125348189415, "step": 508}, {"loss": 1.4159, "grad_norm": 7.063473701477051, "learning_rate": 1.5014470190042826e-05, "epoch": 2.83008356545961, "step": 509}, {"loss": 1.5672, "grad_norm": 6.095931053161621, "learning_rate": 1.5001322667984995e-05, "epoch": 2.8356545961002784, "step": 510}, {"loss": 1.2551, "grad_norm": 6.445271968841553, "learning_rate": 1.4988175145927165e-05, "epoch": 2.841225626740947, "step": 511}, {"loss": 0.9671, "grad_norm": 
7.601891040802002, "learning_rate": 1.4975027623869334e-05, "epoch": 2.8467966573816157, "step": 512}, {"loss": 1.4462, "grad_norm": 8.017728805541992, "learning_rate": 1.4961880101811502e-05, "epoch": 2.852367688022284, "step": 513}, {"loss": 1.2006, "grad_norm": 6.753676891326904, "learning_rate": 1.4948732579753672e-05, "epoch": 2.8579387186629526, "step": 514}, {"loss": 0.9354, "grad_norm": 6.220627784729004, "learning_rate": 1.4935585057695843e-05, "epoch": 2.863509749303621, "step": 515}, {"loss": 1.5554, "grad_norm": 7.825878620147705, "learning_rate": 1.4922437535638009e-05, "epoch": 2.86908077994429, "step": 516}, {"loss": 0.9281, "grad_norm": 7.7669548988342285, "learning_rate": 1.490929001358018e-05, "epoch": 2.8746518105849583, "step": 517}, {"loss": 1.1678, "grad_norm": 6.18816614151001, "learning_rate": 1.489614249152235e-05, "epoch": 2.8802228412256268, "step": 518}, {"loss": 0.8378, "grad_norm": 11.241938591003418, "learning_rate": 1.4882994969464517e-05, "epoch": 2.885793871866295, "step": 519}, {"loss": 1.6602, "grad_norm": 6.708087921142578, "learning_rate": 1.4869847447406687e-05, "epoch": 2.8913649025069637, "step": 520}, {"loss": 1.5575, "grad_norm": 8.96353530883789, "learning_rate": 1.4856699925348856e-05, "epoch": 2.896935933147632, "step": 521}, {"loss": 1.3553, "grad_norm": 7.286456108093262, "learning_rate": 1.4843552403291026e-05, "epoch": 2.902506963788301, "step": 522}, {"loss": 1.3618, "grad_norm": 6.448929309844971, "learning_rate": 1.4830404881233194e-05, "epoch": 2.9080779944289694, "step": 523}, {"loss": 1.0911, "grad_norm": 6.1524739265441895, "learning_rate": 1.4817257359175363e-05, "epoch": 2.913649025069638, "step": 524}, {"loss": 1.2465, "grad_norm": 6.833171367645264, "learning_rate": 1.4804109837117533e-05, "epoch": 2.9192200557103063, "step": 525}, {"loss": 0.9937, "grad_norm": 8.745670318603516, "learning_rate": 1.47909623150597e-05, "epoch": 2.924791086350975, "step": 526}, {"loss": 1.3931, "grad_norm": 
6.3659186363220215, "learning_rate": 1.477781479300187e-05, "epoch": 2.9303621169916436, "step": 527}, {"loss": 1.029, "grad_norm": 8.309256553649902, "learning_rate": 1.476466727094404e-05, "epoch": 2.935933147632312, "step": 528}, {"loss": 1.383, "grad_norm": 7.611057758331299, "learning_rate": 1.4751519748886211e-05, "epoch": 2.9415041782729805, "step": 529}, {"loss": 1.4833, "grad_norm": 9.441068649291992, "learning_rate": 1.4738372226828379e-05, "epoch": 2.947075208913649, "step": 530}, {"loss": 1.2739, "grad_norm": 7.198431968688965, "learning_rate": 1.4725224704770548e-05, "epoch": 2.9526462395543174, "step": 531}, {"loss": 1.4294, "grad_norm": 8.88117790222168, "learning_rate": 1.4712077182712718e-05, "epoch": 2.958217270194986, "step": 532}, {"loss": 1.0204, "grad_norm": 9.982294082641602, "learning_rate": 1.4698929660654886e-05, "epoch": 2.9637883008356547, "step": 533}, {"loss": 1.1488, "grad_norm": 8.535533905029297, "learning_rate": 1.4685782138597055e-05, "epoch": 2.969359331476323, "step": 534}, {"loss": 1.49, "grad_norm": 6.813885688781738, "learning_rate": 1.4672634616539225e-05, "epoch": 2.9749303621169916, "step": 535}, {"loss": 1.113, "grad_norm": 9.557439804077148, "learning_rate": 1.4659487094481394e-05, "epoch": 2.98050139275766, "step": 536}, {"loss": 1.5004, "grad_norm": 6.406128883361816, "learning_rate": 1.4646339572423562e-05, "epoch": 2.986072423398329, "step": 537}, {"loss": 1.1722, "grad_norm": 7.9670915603637695, "learning_rate": 1.4633192050365732e-05, "epoch": 2.9916434540389973, "step": 538}, {"loss": 1.5033, "grad_norm": 9.402728080749512, "learning_rate": 1.4620044528307901e-05, "epoch": 2.997214484679666, "step": 539}, {"loss": 1.279, "grad_norm": 7.38714075088501, "learning_rate": 1.4606897006250069e-05, "epoch": 3.0, "step": 540}, {"loss": 0.6426, "grad_norm": 7.639667510986328, "learning_rate": 1.4593749484192238e-05, "epoch": 3.0055710306406684, "step": 541}, {"loss": 0.8426, "grad_norm": 7.864633560180664, "learning_rate": 
1.458060196213441e-05, "epoch": 3.011142061281337, "step": 542}, {"loss": 1.0651, "grad_norm": 6.637276649475098, "learning_rate": 1.4567454440076576e-05, "epoch": 3.0167130919220058, "step": 543}, {"loss": 1.0804, "grad_norm": 7.148686408996582, "learning_rate": 1.4554306918018747e-05, "epoch": 3.022284122562674, "step": 544}, {"loss": 1.0182, "grad_norm": 6.767364501953125, "learning_rate": 1.4541159395960917e-05, "epoch": 3.0278551532033426, "step": 545}, {"loss": 0.8165, "grad_norm": 6.77062463760376, "learning_rate": 1.4528011873903086e-05, "epoch": 3.033426183844011, "step": 546}, {"loss": 0.9941, "grad_norm": 8.067922592163086, "learning_rate": 1.4514864351845254e-05, "epoch": 3.0389972144846795, "step": 547}, {"loss": 0.9579, "grad_norm": 8.817468643188477, "learning_rate": 1.4501716829787423e-05, "epoch": 3.0445682451253484, "step": 548}, {"loss": 0.6023, "grad_norm": 8.70374870300293, "learning_rate": 1.4488569307729593e-05, "epoch": 3.050139275766017, "step": 549}, {"loss": 1.1392, "grad_norm": 9.344374656677246, "learning_rate": 1.447542178567176e-05, "epoch": 3.0557103064066853, "step": 550}, {"eval_loss": 3.0347440242767334, "eval_runtime": 35.9429, "eval_samples_per_second": 39.952, "eval_steps_per_second": 2.003, "epoch": 3.0557103064066853, "step": 550}, {"loss": 0.663, "grad_norm": 10.07166862487793, "learning_rate": 1.446227426361393e-05, "epoch": 3.0612813370473537, "step": 551}, {"loss": 1.1801, "grad_norm": 14.619653701782227, "learning_rate": 1.44491267415561e-05, "epoch": 3.066852367688022, "step": 552}, {"loss": 0.9107, "grad_norm": 10.427509307861328, "learning_rate": 1.443597921949827e-05, "epoch": 3.0724233983286906, "step": 553}, {"loss": 0.9474, "grad_norm": 8.392213821411133, "learning_rate": 1.4422831697440437e-05, "epoch": 3.0779944289693595, "step": 554}, {"loss": 0.7972, "grad_norm": 13.848929405212402, "learning_rate": 1.4409684175382607e-05, "epoch": 3.083565459610028, "step": 555}, {"loss": 0.7298, "grad_norm": 
9.263422966003418, "learning_rate": 1.4396536653324778e-05, "epoch": 3.0891364902506964, "step": 556}, {"loss": 0.5749, "grad_norm": 11.082460403442383, "learning_rate": 1.4383389131266946e-05, "epoch": 3.094707520891365, "step": 557}, {"loss": 0.7301, "grad_norm": 7.7812604904174805, "learning_rate": 1.4370241609209115e-05, "epoch": 3.1002785515320332, "step": 558}, {"loss": 0.5884, "grad_norm": 12.2935791015625, "learning_rate": 1.4357094087151285e-05, "epoch": 3.105849582172702, "step": 559}, {"loss": 0.8034, "grad_norm": 8.129678726196289, "learning_rate": 1.4343946565093454e-05, "epoch": 3.1114206128133706, "step": 560}, {"loss": 0.6528, "grad_norm": 8.628301620483398, "learning_rate": 1.4330799043035622e-05, "epoch": 3.116991643454039, "step": 561}, {"loss": 0.8483, "grad_norm": 10.514995574951172, "learning_rate": 1.4317651520977792e-05, "epoch": 3.1225626740947074, "step": 562}, {"loss": 0.7009, "grad_norm": 8.187010765075684, "learning_rate": 1.4304503998919961e-05, "epoch": 3.128133704735376, "step": 563}, {"loss": 0.8732, "grad_norm": 10.525712013244629, "learning_rate": 1.4291356476862129e-05, "epoch": 3.1337047353760448, "step": 564}, {"loss": 0.8319, "grad_norm": 9.198347091674805, "learning_rate": 1.4278208954804298e-05, "epoch": 3.139275766016713, "step": 565}, {"loss": 0.726, "grad_norm": 8.486757278442383, "learning_rate": 1.4265061432746468e-05, "epoch": 3.1448467966573816, "step": 566}, {"loss": 0.4993, "grad_norm": 8.220407485961914, "learning_rate": 1.4251913910688636e-05, "epoch": 3.15041782729805, "step": 567}, {"loss": 1.0212, "grad_norm": 7.644767761230469, "learning_rate": 1.4238766388630805e-05, "epoch": 3.1559888579387185, "step": 568}, {"loss": 0.8306, "grad_norm": 11.287712097167969, "learning_rate": 1.4225618866572977e-05, "epoch": 3.1615598885793874, "step": 569}, {"loss": 0.626, "grad_norm": 7.9160637855529785, "learning_rate": 1.4212471344515146e-05, "epoch": 3.167130919220056, "step": 570}, {"loss": 0.75, "grad_norm": 
11.988582611083984, "learning_rate": 1.4199323822457314e-05, "epoch": 3.1727019498607243, "step": 571}, {"loss": 1.2685, "grad_norm": 9.961721420288086, "learning_rate": 1.4186176300399483e-05, "epoch": 3.1782729805013927, "step": 572}, {"loss": 0.7429, "grad_norm": 12.098424911499023, "learning_rate": 1.4173028778341653e-05, "epoch": 3.183844011142061, "step": 573}, {"loss": 1.0086, "grad_norm": 8.59049129486084, "learning_rate": 1.415988125628382e-05, "epoch": 3.1894150417827296, "step": 574}, {"loss": 1.1215, "grad_norm": 10.50232219696045, "learning_rate": 1.414673373422599e-05, "epoch": 3.1949860724233985, "step": 575}, {"loss": 0.6729, "grad_norm": 11.673900604248047, "learning_rate": 1.413358621216816e-05, "epoch": 3.200557103064067, "step": 576}, {"loss": 1.2036, "grad_norm": 6.419600009918213, "learning_rate": 1.412043869011033e-05, "epoch": 3.2061281337047354, "step": 577}, {"loss": 0.6877, "grad_norm": 10.218490600585938, "learning_rate": 1.4107291168052497e-05, "epoch": 3.211699164345404, "step": 578}, {"loss": 0.5637, "grad_norm": 5.7183918952941895, "learning_rate": 1.4094143645994667e-05, "epoch": 3.2172701949860723, "step": 579}, {"loss": 0.7498, "grad_norm": 11.460823059082031, "learning_rate": 1.4080996123936836e-05, "epoch": 3.222841225626741, "step": 580}, {"loss": 0.6792, "grad_norm": 8.623233795166016, "learning_rate": 1.4067848601879004e-05, "epoch": 3.2284122562674096, "step": 581}, {"loss": 0.6752, "grad_norm": 11.339884757995605, "learning_rate": 1.4054701079821174e-05, "epoch": 3.233983286908078, "step": 582}, {"loss": 0.8586, "grad_norm": 12.452316284179688, "learning_rate": 1.4041553557763345e-05, "epoch": 3.2395543175487465, "step": 583}, {"loss": 0.8345, "grad_norm": 6.755831241607666, "learning_rate": 1.4028406035705514e-05, "epoch": 3.245125348189415, "step": 584}, {"loss": 0.6932, "grad_norm": 9.68067741394043, "learning_rate": 1.4015258513647682e-05, "epoch": 3.2506963788300833, "step": 585}, {"loss": 1.2071, "grad_norm": 
11.948298454284668, "learning_rate": 1.4002110991589852e-05, "epoch": 3.256267409470752, "step": 586}, {"loss": 0.7349, "grad_norm": 11.49226188659668, "learning_rate": 1.3988963469532021e-05, "epoch": 3.2618384401114207, "step": 587}, {"loss": 0.7923, "grad_norm": 10.757736206054688, "learning_rate": 1.3975815947474189e-05, "epoch": 3.267409470752089, "step": 588}, {"loss": 0.6857, "grad_norm": 8.46744441986084, "learning_rate": 1.3962668425416358e-05, "epoch": 3.2729805013927575, "step": 589}, {"loss": 0.9153, "grad_norm": 6.472330093383789, "learning_rate": 1.3949520903358528e-05, "epoch": 3.2785515320334264, "step": 590}, {"loss": 0.7542, "grad_norm": 12.151514053344727, "learning_rate": 1.3936373381300696e-05, "epoch": 3.284122562674095, "step": 591}, {"loss": 0.9487, "grad_norm": 11.680760383605957, "learning_rate": 1.3923225859242865e-05, "epoch": 3.2896935933147633, "step": 592}, {"loss": 0.6893, "grad_norm": 9.367558479309082, "learning_rate": 1.3910078337185035e-05, "epoch": 3.2952646239554317, "step": 593}, {"loss": 0.7126, "grad_norm": 10.658570289611816, "learning_rate": 1.3896930815127206e-05, "epoch": 3.3008356545961, "step": 594}, {"loss": 0.8014, "grad_norm": 8.675304412841797, "learning_rate": 1.3883783293069372e-05, "epoch": 3.3064066852367686, "step": 595}, {"loss": 0.7078, "grad_norm": 6.470170974731445, "learning_rate": 1.3870635771011543e-05, "epoch": 3.3119777158774375, "step": 596}, {"loss": 0.6612, "grad_norm": 7.141599178314209, "learning_rate": 1.3857488248953713e-05, "epoch": 3.317548746518106, "step": 597}, {"loss": 0.8968, "grad_norm": 9.977639198303223, "learning_rate": 1.384434072689588e-05, "epoch": 3.3231197771587744, "step": 598}, {"loss": 0.8395, "grad_norm": 10.208252906799316, "learning_rate": 1.383119320483805e-05, "epoch": 3.328690807799443, "step": 599}, {"loss": 0.9248, "grad_norm": 9.933085441589355, "learning_rate": 1.381804568278022e-05, "epoch": 3.3342618384401113, "step": 600}, {"eval_loss": 2.9689695835113525, 
"eval_runtime": 35.9526, "eval_samples_per_second": 39.941, "eval_steps_per_second": 2.003, "epoch": 3.3342618384401113, "step": 600}, {"train_runtime": 2220.807, "train_samples_per_second": 5.944, "train_steps_per_second": 0.743, "total_flos": 4.35104765343744e+16, "train_loss": 1.654274252106746, "epoch": 3.3342618384401113, "step": 600}], "notes_md": ""}</script>
|
|
</div>
|
|
</body>
|
|
</html>
|