Files
Qwen3-1.7B-SFT-medical-2e-5/trainer_state.json

3195 lines
82 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": 250,
"best_metric": 1.4088929891586304,
"best_model_checkpoint": "saves/qwen3-1.7B/medical-o1-sft-full/checkpoint-250",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 441,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006837606837606838,
"grad_norm": 83.15293884277344,
"learning_rate": 0.0,
"loss": 2.8199405670166016,
"step": 1
},
{
"epoch": 0.013675213675213675,
"grad_norm": 81.79350280761719,
"learning_rate": 8.695652173913044e-07,
"loss": 2.7888758182525635,
"step": 2
},
{
"epoch": 0.020512820512820513,
"grad_norm": 83.25151824951172,
"learning_rate": 1.7391304347826088e-06,
"loss": 2.820769786834717,
"step": 3
},
{
"epoch": 0.02735042735042735,
"grad_norm": 75.52108001708984,
"learning_rate": 2.6086956521739132e-06,
"loss": 2.734041690826416,
"step": 4
},
{
"epoch": 0.03418803418803419,
"grad_norm": 72.11664581298828,
"learning_rate": 3.4782608695652175e-06,
"loss": 2.7135212421417236,
"step": 5
},
{
"epoch": 0.041025641025641026,
"grad_norm": 55.534324645996094,
"learning_rate": 4.347826086956522e-06,
"loss": 2.4443650245666504,
"step": 6
},
{
"epoch": 0.04786324786324787,
"grad_norm": 48.14010238647461,
"learning_rate": 5.2173913043478265e-06,
"loss": 2.3162710666656494,
"step": 7
},
{
"epoch": 0.0547008547008547,
"grad_norm": 20.861207962036133,
"learning_rate": 6.086956521739132e-06,
"loss": 2.0038950443267822,
"step": 8
},
{
"epoch": 0.06153846153846154,
"grad_norm": 15.49008846282959,
"learning_rate": 6.956521739130435e-06,
"loss": 1.8993940353393555,
"step": 9
},
{
"epoch": 0.06837606837606838,
"grad_norm": 5.190984725952148,
"learning_rate": 7.82608695652174e-06,
"loss": 1.7324286699295044,
"step": 10
},
{
"epoch": 0.07521367521367521,
"grad_norm": 4.630637168884277,
"learning_rate": 8.695652173913044e-06,
"loss": 1.654750943183899,
"step": 11
},
{
"epoch": 0.08205128205128205,
"grad_norm": 3.784055233001709,
"learning_rate": 9.565217391304349e-06,
"loss": 1.7394911050796509,
"step": 12
},
{
"epoch": 0.08888888888888889,
"grad_norm": 3.4299561977386475,
"learning_rate": 1.0434782608695653e-05,
"loss": 1.6633565425872803,
"step": 13
},
{
"epoch": 0.09572649572649573,
"grad_norm": 4.693484306335449,
"learning_rate": 1.1304347826086957e-05,
"loss": 1.670560359954834,
"step": 14
},
{
"epoch": 0.10256410256410256,
"grad_norm": 5.14279317855835,
"learning_rate": 1.2173913043478263e-05,
"loss": 1.647332787513733,
"step": 15
},
{
"epoch": 0.1094017094017094,
"grad_norm": 3.8385608196258545,
"learning_rate": 1.3043478260869566e-05,
"loss": 1.6399732828140259,
"step": 16
},
{
"epoch": 0.11623931623931624,
"grad_norm": 2.6695456504821777,
"learning_rate": 1.391304347826087e-05,
"loss": 1.5681482553482056,
"step": 17
},
{
"epoch": 0.12307692307692308,
"grad_norm": 2.117490291595459,
"learning_rate": 1.4782608695652174e-05,
"loss": 1.6053783893585205,
"step": 18
},
{
"epoch": 0.12991452991452992,
"grad_norm": 1.9541882276535034,
"learning_rate": 1.565217391304348e-05,
"loss": 1.5954205989837646,
"step": 19
},
{
"epoch": 0.13675213675213677,
"grad_norm": 2.011003255844116,
"learning_rate": 1.6521739130434785e-05,
"loss": 1.5820363759994507,
"step": 20
},
{
"epoch": 0.14358974358974358,
"grad_norm": 1.9789162874221802,
"learning_rate": 1.739130434782609e-05,
"loss": 1.532997727394104,
"step": 21
},
{
"epoch": 0.15042735042735042,
"grad_norm": 1.8961035013198853,
"learning_rate": 1.8260869565217393e-05,
"loss": 1.5475587844848633,
"step": 22
},
{
"epoch": 0.15726495726495726,
"grad_norm": 1.5811997652053833,
"learning_rate": 1.9130434782608697e-05,
"loss": 1.580260992050171,
"step": 23
},
{
"epoch": 0.1641025641025641,
"grad_norm": 1.4591213464736938,
"learning_rate": 2e-05,
"loss": 1.5463660955429077,
"step": 24
},
{
"epoch": 0.17094017094017094,
"grad_norm": 1.4459729194641113,
"learning_rate": 1.999971756719333e-05,
"loss": 1.5187675952911377,
"step": 25
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.4411983489990234,
"learning_rate": 1.9998870284726968e-05,
"loss": 1.529025673866272,
"step": 26
},
{
"epoch": 0.18461538461538463,
"grad_norm": 1.3215960264205933,
"learning_rate": 1.9997458200460994e-05,
"loss": 1.513730525970459,
"step": 27
},
{
"epoch": 0.19145299145299147,
"grad_norm": 1.324648141860962,
"learning_rate": 1.999548139415919e-05,
"loss": 1.5576432943344116,
"step": 28
},
{
"epoch": 0.19829059829059828,
"grad_norm": 1.1139763593673706,
"learning_rate": 1.999293997748454e-05,
"loss": 1.5223976373672485,
"step": 29
},
{
"epoch": 0.20512820512820512,
"grad_norm": 1.175620675086975,
"learning_rate": 1.9989834093992945e-05,
"loss": 1.529496431350708,
"step": 30
},
{
"epoch": 0.21196581196581196,
"grad_norm": 1.2628631591796875,
"learning_rate": 1.9986163919125077e-05,
"loss": 1.5556331872940063,
"step": 31
},
{
"epoch": 0.2188034188034188,
"grad_norm": 1.121780276298523,
"learning_rate": 1.9981929660196492e-05,
"loss": 1.522382140159607,
"step": 32
},
{
"epoch": 0.22564102564102564,
"grad_norm": 1.057112693786621,
"learning_rate": 1.997713155638592e-05,
"loss": 1.5269778966903687,
"step": 33
},
{
"epoch": 0.23247863247863249,
"grad_norm": 1.1212079524993896,
"learning_rate": 1.9971769878721747e-05,
"loss": 1.5179802179336548,
"step": 34
},
{
"epoch": 0.23931623931623933,
"grad_norm": 1.1053107976913452,
"learning_rate": 1.99658449300667e-05,
"loss": 1.4600404500961304,
"step": 35
},
{
"epoch": 0.24615384615384617,
"grad_norm": 1.0344611406326294,
"learning_rate": 1.9959357045100764e-05,
"loss": 1.4895355701446533,
"step": 36
},
{
"epoch": 0.252991452991453,
"grad_norm": 1.0998711585998535,
"learning_rate": 1.9952306590302247e-05,
"loss": 1.498748779296875,
"step": 37
},
{
"epoch": 0.25982905982905985,
"grad_norm": 1.0810974836349487,
"learning_rate": 1.9944693963927092e-05,
"loss": 1.4847540855407715,
"step": 38
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.0349794626235962,
"learning_rate": 1.9936519595986395e-05,
"loss": 1.4850821495056152,
"step": 39
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.9509456157684326,
"learning_rate": 1.9927783948222084e-05,
"loss": 1.4879685640335083,
"step": 40
},
{
"epoch": 0.28034188034188035,
"grad_norm": 0.9873176217079163,
"learning_rate": 1.9918487514080867e-05,
"loss": 1.5055975914001465,
"step": 41
},
{
"epoch": 0.28717948717948716,
"grad_norm": 0.9554620385169983,
"learning_rate": 1.990863081868634e-05,
"loss": 1.4576541185379028,
"step": 42
},
{
"epoch": 0.294017094017094,
"grad_norm": 0.915795087814331,
"learning_rate": 1.989821441880933e-05,
"loss": 1.469474196434021,
"step": 43
},
{
"epoch": 0.30085470085470084,
"grad_norm": 1.006457805633545,
"learning_rate": 1.988723890283645e-05,
"loss": 1.5073033571243286,
"step": 44
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.9496122598648071,
"learning_rate": 1.9875704890736853e-05,
"loss": 1.496271014213562,
"step": 45
},
{
"epoch": 0.3145299145299145,
"grad_norm": 0.9319558143615723,
"learning_rate": 1.9863613034027224e-05,
"loss": 1.4825000762939453,
"step": 46
},
{
"epoch": 0.3213675213675214,
"grad_norm": 0.9389411807060242,
"learning_rate": 1.985096401573497e-05,
"loss": 1.4443243741989136,
"step": 47
},
{
"epoch": 0.3282051282051282,
"grad_norm": 0.9735950827598572,
"learning_rate": 1.9837758550359637e-05,
"loss": 1.4762128591537476,
"step": 48
},
{
"epoch": 0.335042735042735,
"grad_norm": 0.9494331479072571,
"learning_rate": 1.982399738383255e-05,
"loss": 1.5045385360717773,
"step": 49
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.9520753026008606,
"learning_rate": 1.9809681293474693e-05,
"loss": 1.496164321899414,
"step": 50
},
{
"epoch": 0.3418803418803419,
"eval_loss": 1.4685521125793457,
"eval_runtime": 14.1604,
"eval_samples_per_second": 69.631,
"eval_steps_per_second": 8.757,
"step": 50
},
{
"epoch": 0.3487179487179487,
"grad_norm": 0.9688102602958679,
"learning_rate": 1.979481108795278e-05,
"loss": 1.4734501838684082,
"step": 51
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.9477071166038513,
"learning_rate": 1.9779387607233587e-05,
"loss": 1.4600017070770264,
"step": 52
},
{
"epoch": 0.3623931623931624,
"grad_norm": 0.9507799744606018,
"learning_rate": 1.9763411722536503e-05,
"loss": 1.455001711845398,
"step": 53
},
{
"epoch": 0.36923076923076925,
"grad_norm": 0.9292111992835999,
"learning_rate": 1.9746884336284316e-05,
"loss": 1.4742114543914795,
"step": 54
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.9916467666625977,
"learning_rate": 1.972980638205225e-05,
"loss": 1.5147836208343506,
"step": 55
},
{
"epoch": 0.38290598290598293,
"grad_norm": 0.9744175672531128,
"learning_rate": 1.971217882451521e-05,
"loss": 1.4713977575302124,
"step": 56
},
{
"epoch": 0.38974358974358975,
"grad_norm": 1.0033540725708008,
"learning_rate": 1.9694002659393306e-05,
"loss": 1.4538943767547607,
"step": 57
},
{
"epoch": 0.39658119658119656,
"grad_norm": 0.946854293346405,
"learning_rate": 1.9675278913395605e-05,
"loss": 1.4287432432174683,
"step": 58
},
{
"epoch": 0.40341880341880343,
"grad_norm": 1.0013198852539062,
"learning_rate": 1.9656008644162134e-05,
"loss": 1.4492701292037964,
"step": 59
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.0438623428344727,
"learning_rate": 1.9636192940204134e-05,
"loss": 1.4924561977386475,
"step": 60
},
{
"epoch": 0.4170940170940171,
"grad_norm": 0.9705636501312256,
"learning_rate": 1.961583292084259e-05,
"loss": 1.4596234560012817,
"step": 61
},
{
"epoch": 0.4239316239316239,
"grad_norm": 0.9079157114028931,
"learning_rate": 1.9594929736144978e-05,
"loss": 1.44952392578125,
"step": 62
},
{
"epoch": 0.4307692307692308,
"grad_norm": 0.9640805125236511,
"learning_rate": 1.957348456686032e-05,
"loss": 1.4430960416793823,
"step": 63
},
{
"epoch": 0.4376068376068376,
"grad_norm": 0.9475866556167603,
"learning_rate": 1.9551498624352497e-05,
"loss": 1.446009635925293,
"step": 64
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.948258638381958,
"learning_rate": 1.9528973150531787e-05,
"loss": 1.4411481618881226,
"step": 65
},
{
"epoch": 0.4512820512820513,
"grad_norm": 0.9805014133453369,
"learning_rate": 1.9505909417784758e-05,
"loss": 1.4417314529418945,
"step": 66
},
{
"epoch": 0.4581196581196581,
"grad_norm": 0.9225365519523621,
"learning_rate": 1.9482308728902358e-05,
"loss": 1.480376958847046,
"step": 67
},
{
"epoch": 0.46495726495726497,
"grad_norm": 0.9221044182777405,
"learning_rate": 1.9458172417006347e-05,
"loss": 1.4625794887542725,
"step": 68
},
{
"epoch": 0.4717948717948718,
"grad_norm": 0.9901456832885742,
"learning_rate": 1.9433501845473996e-05,
"loss": 1.4856598377227783,
"step": 69
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.9551020860671997,
"learning_rate": 1.9408298407861045e-05,
"loss": 1.4896745681762695,
"step": 70
},
{
"epoch": 0.48547008547008547,
"grad_norm": 0.9381822943687439,
"learning_rate": 1.9382563527823026e-05,
"loss": 1.4343875646591187,
"step": 71
},
{
"epoch": 0.49230769230769234,
"grad_norm": 0.8770731091499329,
"learning_rate": 1.935629865903482e-05,
"loss": 1.4482182264328003,
"step": 72
},
{
"epoch": 0.49914529914529915,
"grad_norm": 0.934929609298706,
"learning_rate": 1.9329505285108544e-05,
"loss": 1.4524080753326416,
"step": 73
},
{
"epoch": 0.505982905982906,
"grad_norm": 0.9203254580497742,
"learning_rate": 1.9302184919509758e-05,
"loss": 1.4096636772155762,
"step": 74
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.9084986448287964,
"learning_rate": 1.927433910547197e-05,
"loss": 1.423622488975525,
"step": 75
},
{
"epoch": 0.5196581196581197,
"grad_norm": 0.8734993934631348,
"learning_rate": 1.9245969415909464e-05,
"loss": 1.4265828132629395,
"step": 76
},
{
"epoch": 0.5264957264957265,
"grad_norm": 0.8964496850967407,
"learning_rate": 1.921707745332845e-05,
"loss": 1.4725595712661743,
"step": 77
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.9096109867095947,
"learning_rate": 1.9187664849736542e-05,
"loss": 1.457470417022705,
"step": 78
},
{
"epoch": 0.5401709401709401,
"grad_norm": 0.8932516574859619,
"learning_rate": 1.9157733266550577e-05,
"loss": 1.454951286315918,
"step": 79
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.8940214514732361,
"learning_rate": 1.9127284394502765e-05,
"loss": 1.4776511192321777,
"step": 80
},
{
"epoch": 0.5538461538461539,
"grad_norm": 0.8789263963699341,
"learning_rate": 1.9096319953545186e-05,
"loss": 1.4376585483551025,
"step": 81
},
{
"epoch": 0.5606837606837607,
"grad_norm": 0.9395255446434021,
"learning_rate": 1.906484169275263e-05,
"loss": 1.4360781908035278,
"step": 82
},
{
"epoch": 0.5675213675213675,
"grad_norm": 0.8618428707122803,
"learning_rate": 1.903285139022381e-05,
"loss": 1.4329712390899658,
"step": 83
},
{
"epoch": 0.5743589743589743,
"grad_norm": 0.9313262104988098,
"learning_rate": 1.900035085298091e-05,
"loss": 1.446253776550293,
"step": 84
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.8763355016708374,
"learning_rate": 1.896734191686752e-05,
"loss": 1.4160209894180298,
"step": 85
},
{
"epoch": 0.588034188034188,
"grad_norm": 0.8777135610580444,
"learning_rate": 1.8933826446444933e-05,
"loss": 1.449493408203125,
"step": 86
},
{
"epoch": 0.5948717948717949,
"grad_norm": 0.8737928867340088,
"learning_rate": 1.889980633488683e-05,
"loss": 1.377128005027771,
"step": 87
},
{
"epoch": 0.6017094017094017,
"grad_norm": 0.923620343208313,
"learning_rate": 1.8865283503872325e-05,
"loss": 1.422142505645752,
"step": 88
},
{
"epoch": 0.6085470085470085,
"grad_norm": 0.9419258832931519,
"learning_rate": 1.8830259903477427e-05,
"loss": 1.4897931814193726,
"step": 89
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.9292656779289246,
"learning_rate": 1.879473751206489e-05,
"loss": 1.4244943857192993,
"step": 90
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.9174057841300964,
"learning_rate": 1.8758718336172462e-05,
"loss": 1.432208776473999,
"step": 91
},
{
"epoch": 0.629059829059829,
"grad_norm": 0.9447773694992065,
"learning_rate": 1.8722204410399524e-05,
"loss": 1.4501725435256958,
"step": 92
},
{
"epoch": 0.6358974358974359,
"grad_norm": 0.8907484412193298,
"learning_rate": 1.868519779729218e-05,
"loss": 1.4563168287277222,
"step": 93
},
{
"epoch": 0.6427350427350428,
"grad_norm": 0.8975157141685486,
"learning_rate": 1.864770058722676e-05,
"loss": 1.4320740699768066,
"step": 94
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.9034259915351868,
"learning_rate": 1.8609714898291716e-05,
"loss": 1.4002689123153687,
"step": 95
},
{
"epoch": 0.6564102564102564,
"grad_norm": 0.9356617331504822,
"learning_rate": 1.8571242876167995e-05,
"loss": 1.4669139385223389,
"step": 96
},
{
"epoch": 0.6632478632478632,
"grad_norm": 0.9355176091194153,
"learning_rate": 1.853228669400784e-05,
"loss": 1.4444191455841064,
"step": 97
},
{
"epoch": 0.67008547008547,
"grad_norm": 0.8931655883789062,
"learning_rate": 1.8492848552312016e-05,
"loss": 1.4415756464004517,
"step": 98
},
{
"epoch": 0.676923076923077,
"grad_norm": 0.8951373100280762,
"learning_rate": 1.8452930678805536e-05,
"loss": 1.4061449766159058,
"step": 99
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.9179074168205261,
"learning_rate": 1.8412535328311813e-05,
"loss": 1.4215387105941772,
"step": 100
},
{
"epoch": 0.6837606837606838,
"eval_loss": 1.4336893558502197,
"eval_runtime": 13.7947,
"eval_samples_per_second": 71.477,
"eval_steps_per_second": 8.989,
"step": 100
},
{
"epoch": 0.6905982905982906,
"grad_norm": 0.977781355381012,
"learning_rate": 1.8371664782625287e-05,
"loss": 1.4540152549743652,
"step": 101
},
{
"epoch": 0.6974358974358974,
"grad_norm": 0.9076094627380371,
"learning_rate": 1.8330321350382545e-05,
"loss": 1.415886640548706,
"step": 102
},
{
"epoch": 0.7042735042735043,
"grad_norm": 0.8912188410758972,
"learning_rate": 1.8288507366931907e-05,
"loss": 1.4277691841125488,
"step": 103
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.8660780787467957,
"learning_rate": 1.8246225194201517e-05,
"loss": 1.39166259765625,
"step": 104
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.9204691648483276,
"learning_rate": 1.8203477220565912e-05,
"loss": 1.4161370992660522,
"step": 105
},
{
"epoch": 0.7247863247863248,
"grad_norm": 0.9661011695861816,
"learning_rate": 1.8160265860711134e-05,
"loss": 1.4492610692977905,
"step": 106
},
{
"epoch": 0.7316239316239316,
"grad_norm": 0.9005808234214783,
"learning_rate": 1.8116593555498308e-05,
"loss": 1.4389468431472778,
"step": 107
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.9088156223297119,
"learning_rate": 1.807246277182578e-05,
"loss": 1.4940838813781738,
"step": 108
},
{
"epoch": 0.7452991452991453,
"grad_norm": 0.9402887225151062,
"learning_rate": 1.802787600248977e-05,
"loss": 1.4154539108276367,
"step": 109
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.9380722045898438,
"learning_rate": 1.798283576604356e-05,
"loss": 1.4318289756774902,
"step": 110
},
{
"epoch": 0.7589743589743589,
"grad_norm": 0.9319474101066589,
"learning_rate": 1.7937344606655228e-05,
"loss": 1.4192531108856201,
"step": 111
},
{
"epoch": 0.7658119658119659,
"grad_norm": 0.9068304896354675,
"learning_rate": 1.789140509396394e-05,
"loss": 1.4170390367507935,
"step": 112
},
{
"epoch": 0.7726495726495727,
"grad_norm": 0.8808281421661377,
"learning_rate": 1.784501982293479e-05,
"loss": 1.432860016822815,
"step": 113
},
{
"epoch": 0.7794871794871795,
"grad_norm": 0.8805544376373291,
"learning_rate": 1.7798191413712244e-05,
"loss": 1.4037058353424072,
"step": 114
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.8959332704544067,
"learning_rate": 1.775092251147211e-05,
"loss": 1.4175316095352173,
"step": 115
},
{
"epoch": 0.7931623931623931,
"grad_norm": 0.8379173278808594,
"learning_rate": 1.770321578627213e-05,
"loss": 1.404625654220581,
"step": 116
},
{
"epoch": 0.8,
"grad_norm": 0.8591132164001465,
"learning_rate": 1.765507393290117e-05,
"loss": 1.4534145593643188,
"step": 117
},
{
"epoch": 0.8068376068376069,
"grad_norm": 0.8517522215843201,
"learning_rate": 1.7606499670726972e-05,
"loss": 1.4170221090316772,
"step": 118
},
{
"epoch": 0.8136752136752137,
"grad_norm": 0.8700085282325745,
"learning_rate": 1.7557495743542586e-05,
"loss": 1.4001213312149048,
"step": 119
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.8774170875549316,
"learning_rate": 1.7508064919411344e-05,
"loss": 1.418135643005371,
"step": 120
},
{
"epoch": 0.8273504273504273,
"grad_norm": 0.8984478116035461,
"learning_rate": 1.745820999051053e-05,
"loss": 1.4195680618286133,
"step": 121
},
{
"epoch": 0.8341880341880342,
"grad_norm": 0.8648718595504761,
"learning_rate": 1.7407933772973638e-05,
"loss": 1.383607029914856,
"step": 122
},
{
"epoch": 0.841025641025641,
"grad_norm": 0.9336929321289062,
"learning_rate": 1.735723910673132e-05,
"loss": 1.4406161308288574,
"step": 123
},
{
"epoch": 0.8478632478632478,
"grad_norm": 0.8780763149261475,
"learning_rate": 1.730612885535094e-05,
"loss": 1.4191570281982422,
"step": 124
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.8674494624137878,
"learning_rate": 1.7254605905874862e-05,
"loss": 1.437395691871643,
"step": 125
},
{
"epoch": 0.8615384615384616,
"grad_norm": 0.9440014958381653,
"learning_rate": 1.7202673168657318e-05,
"loss": 1.4250893592834473,
"step": 126
},
{
"epoch": 0.8683760683760684,
"grad_norm": 0.9403019547462463,
"learning_rate": 1.7150333577200062e-05,
"loss": 1.435499906539917,
"step": 127
},
{
"epoch": 0.8752136752136752,
"grad_norm": 0.863822877407074,
"learning_rate": 1.709759008798663e-05,
"loss": 1.409804105758667,
"step": 128
},
{
"epoch": 0.882051282051282,
"grad_norm": 0.9274973273277283,
"learning_rate": 1.7044445680315374e-05,
"loss": 1.433601975440979,
"step": 129
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.9369088411331177,
"learning_rate": 1.6990903356131125e-05,
"loss": 1.4320355653762817,
"step": 130
},
{
"epoch": 0.8957264957264958,
"grad_norm": 0.8703179955482483,
"learning_rate": 1.6936966139855664e-05,
"loss": 1.4167561531066895,
"step": 131
},
{
"epoch": 0.9025641025641026,
"grad_norm": 0.9144904017448425,
"learning_rate": 1.6882637078216867e-05,
"loss": 1.4223415851593018,
"step": 132
},
{
"epoch": 0.9094017094017094,
"grad_norm": 0.9126601219177246,
"learning_rate": 1.6827919240076612e-05,
"loss": 1.4480727910995483,
"step": 133
},
{
"epoch": 0.9162393162393162,
"grad_norm": 0.8591611981391907,
"learning_rate": 1.6772815716257414e-05,
"loss": 1.40584135055542,
"step": 134
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.8316404223442078,
"learning_rate": 1.671732961936785e-05,
"loss": 1.449837565422058,
"step": 135
},
{
"epoch": 0.9299145299145299,
"grad_norm": 0.8785284757614136,
"learning_rate": 1.6661464083626734e-05,
"loss": 1.440337061882019,
"step": 136
},
{
"epoch": 0.9367521367521368,
"grad_norm": 0.8786150813102722,
"learning_rate": 1.6605222264686085e-05,
"loss": 1.440657138824463,
"step": 137
},
{
"epoch": 0.9435897435897436,
"grad_norm": 0.8501399159431458,
"learning_rate": 1.6548607339452853e-05,
"loss": 1.397615671157837,
"step": 138
},
{
"epoch": 0.9504273504273504,
"grad_norm": 0.8737369775772095,
"learning_rate": 1.6491622505909483e-05,
"loss": 1.4285824298858643,
"step": 139
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.8369284868240356,
"learning_rate": 1.6434270982933272e-05,
"loss": 1.3992527723312378,
"step": 140
},
{
"epoch": 0.9641025641025641,
"grad_norm": 0.8740672469139099,
"learning_rate": 1.637655601011454e-05,
"loss": 1.4451634883880615,
"step": 141
},
{
"epoch": 0.9709401709401709,
"grad_norm": 0.873289942741394,
"learning_rate": 1.631848084757364e-05,
"loss": 1.3965365886688232,
"step": 142
},
{
"epoch": 0.9777777777777777,
"grad_norm": 0.9107730984687805,
"learning_rate": 1.6260048775776804e-05,
"loss": 1.4110256433486938,
"step": 143
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.8785021305084229,
"learning_rate": 1.6201263095350833e-05,
"loss": 1.4294975996017456,
"step": 144
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.8321818113327026,
"learning_rate": 1.6142127126896682e-05,
"loss": 1.4016475677490234,
"step": 145
},
{
"epoch": 0.9982905982905983,
"grad_norm": 0.8866358399391174,
"learning_rate": 1.6082644210801846e-05,
"loss": 1.3802778720855713,
"step": 146
},
{
"epoch": 1.0,
"grad_norm": 1.623956561088562,
"learning_rate": 1.602281770705172e-05,
"loss": 1.4806468486785889,
"step": 147
},
{
"epoch": 1.0068376068376068,
"grad_norm": 1.1759995222091675,
"learning_rate": 1.5962650995039783e-05,
"loss": 1.3020893335342407,
"step": 148
},
{
"epoch": 1.0136752136752136,
"grad_norm": 1.0619325637817383,
"learning_rate": 1.5902147473376695e-05,
"loss": 1.2844979763031006,
"step": 149
},
{
"epoch": 1.0205128205128204,
"grad_norm": 0.9689248204231262,
"learning_rate": 1.5841310559698346e-05,
"loss": 1.3303570747375488,
"step": 150
},
{
"epoch": 1.0205128205128204,
"eval_loss": 1.4194111824035645,
"eval_runtime": 13.7873,
"eval_samples_per_second": 71.515,
"eval_steps_per_second": 8.994,
"step": 150
},
{
"epoch": 1.0273504273504273,
"grad_norm": 0.9153519868850708,
"learning_rate": 1.578014369047279e-05,
"loss": 1.3417026996612549,
"step": 151
},
{
"epoch": 1.0341880341880343,
"grad_norm": 0.9799442887306213,
"learning_rate": 1.5718650320806145e-05,
"loss": 1.293771743774414,
"step": 152
},
{
"epoch": 1.041025641025641,
"grad_norm": 1.0599641799926758,
"learning_rate": 1.56568339242474e-05,
"loss": 1.3117493391036987,
"step": 153
},
{
"epoch": 1.047863247863248,
"grad_norm": 0.9470742344856262,
"learning_rate": 1.5594697992592232e-05,
"loss": 1.2798222303390503,
"step": 154
},
{
"epoch": 1.0547008547008547,
"grad_norm": 0.9936373829841614,
"learning_rate": 1.5532246035685755e-05,
"loss": 1.3070576190948486,
"step": 155
},
{
"epoch": 1.0615384615384615,
"grad_norm": 0.9454049468040466,
"learning_rate": 1.5469481581224274e-05,
"loss": 1.3386294841766357,
"step": 156
},
{
"epoch": 1.0683760683760684,
"grad_norm": 0.9544969797134399,
"learning_rate": 1.5406408174555978e-05,
"loss": 1.302185297012329,
"step": 157
},
{
"epoch": 1.0752136752136752,
"grad_norm": 0.9065172076225281,
"learning_rate": 1.5343029378480733e-05,
"loss": 1.3039960861206055,
"step": 158
},
{
"epoch": 1.082051282051282,
"grad_norm": 0.867220938205719,
"learning_rate": 1.527934877304879e-05,
"loss": 1.3006991147994995,
"step": 159
},
{
"epoch": 1.0888888888888888,
"grad_norm": 0.9097728133201599,
"learning_rate": 1.5215369955358568e-05,
"loss": 1.2785807847976685,
"step": 160
},
{
"epoch": 1.0957264957264958,
"grad_norm": 0.9294711351394653,
"learning_rate": 1.5151096539353481e-05,
"loss": 1.3051520586013794,
"step": 161
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.9427935481071472,
"learning_rate": 1.5086532155617785e-05,
"loss": 1.3146125078201294,
"step": 162
},
{
"epoch": 1.1094017094017095,
"grad_norm": 0.9104812741279602,
"learning_rate": 1.5021680451171499e-05,
"loss": 1.2878390550613403,
"step": 163
},
{
"epoch": 1.1162393162393163,
"grad_norm": 0.8972042202949524,
"learning_rate": 1.4956545089264408e-05,
"loss": 1.3068175315856934,
"step": 164
},
{
"epoch": 1.123076923076923,
"grad_norm": 0.9040313959121704,
"learning_rate": 1.489112974916912e-05,
"loss": 1.2897545099258423,
"step": 165
},
{
"epoch": 1.12991452991453,
"grad_norm": 0.9337772727012634,
"learning_rate": 1.4825438125973263e-05,
"loss": 1.301710844039917,
"step": 166
},
{
"epoch": 1.1367521367521367,
"grad_norm": 0.8870652914047241,
"learning_rate": 1.4759473930370738e-05,
"loss": 1.3163543939590454,
"step": 167
},
{
"epoch": 1.1435897435897435,
"grad_norm": 0.8637550473213196,
"learning_rate": 1.4693240888452121e-05,
"loss": 1.3200492858886719,
"step": 168
},
{
"epoch": 1.1504273504273503,
"grad_norm": 0.8388293981552124,
"learning_rate": 1.4626742741494207e-05,
"loss": 1.307487964630127,
"step": 169
},
{
"epoch": 1.1572649572649572,
"grad_norm": 0.9050071835517883,
"learning_rate": 1.4559983245748639e-05,
"loss": 1.2808455228805542,
"step": 170
},
{
"epoch": 1.1641025641025642,
"grad_norm": 0.965691089630127,
"learning_rate": 1.449296617222978e-05,
"loss": 1.332348346710205,
"step": 171
},
{
"epoch": 1.170940170940171,
"grad_norm": 0.8704518675804138,
"learning_rate": 1.4425695306501656e-05,
"loss": 1.306895136833191,
"step": 172
},
{
"epoch": 1.1777777777777778,
"grad_norm": 0.8741139769554138,
"learning_rate": 1.4358174448464155e-05,
"loss": 1.2980892658233643,
"step": 173
},
{
"epoch": 1.1846153846153846,
"grad_norm": 0.9941467642784119,
"learning_rate": 1.4290407412138365e-05,
"loss": 1.2821602821350098,
"step": 174
},
{
"epoch": 1.1914529914529914,
"grad_norm": 0.9268296957015991,
"learning_rate": 1.4222398025451137e-05,
"loss": 1.302233338356018,
"step": 175
},
{
"epoch": 1.1982905982905983,
"grad_norm": 0.8978403806686401,
"learning_rate": 1.4154150130018867e-05,
"loss": 1.265356421470642,
"step": 176
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.9328585267066956,
"learning_rate": 1.4085667580930482e-05,
"loss": 1.320369005203247,
"step": 177
},
{
"epoch": 1.2119658119658119,
"grad_norm": 0.9113616943359375,
"learning_rate": 1.4016954246529697e-05,
"loss": 1.2897846698760986,
"step": 178
},
{
"epoch": 1.218803418803419,
"grad_norm": 0.9257543087005615,
"learning_rate": 1.3948014008196486e-05,
"loss": 1.3368397951126099,
"step": 179
},
{
"epoch": 1.2256410256410257,
"grad_norm": 0.8960409164428711,
"learning_rate": 1.3878850760127848e-05,
"loss": 1.3266628980636597,
"step": 180
},
{
"epoch": 1.2324786324786325,
"grad_norm": 0.9111725687980652,
"learning_rate": 1.3809468409117845e-05,
"loss": 1.2674126625061035,
"step": 181
},
{
"epoch": 1.2393162393162394,
"grad_norm": 0.9564438462257385,
"learning_rate": 1.3739870874336898e-05,
"loss": 1.2953293323516846,
"step": 182
},
{
"epoch": 1.2461538461538462,
"grad_norm": 1.0268452167510986,
"learning_rate": 1.3670062087110423e-05,
"loss": 1.3054559230804443,
"step": 183
},
{
"epoch": 1.252991452991453,
"grad_norm": 0.8995468020439148,
"learning_rate": 1.3600045990696762e-05,
"loss": 1.3053619861602783,
"step": 184
},
{
"epoch": 1.2598290598290598,
"grad_norm": 0.8805936574935913,
"learning_rate": 1.352982654006444e-05,
"loss": 1.3140225410461426,
"step": 185
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.9060247540473938,
"learning_rate": 1.3459407701668762e-05,
"loss": 1.3046287298202515,
"step": 186
},
{
"epoch": 1.2735042735042734,
"grad_norm": 0.8805747628211975,
"learning_rate": 1.3388793453227766e-05,
"loss": 1.3128578662872314,
"step": 187
},
{
"epoch": 1.2803418803418802,
"grad_norm": 0.8997815847396851,
"learning_rate": 1.331798778349752e-05,
"loss": 1.3107125759124756,
"step": 188
},
{
"epoch": 1.287179487179487,
"grad_norm": 0.9592490792274475,
"learning_rate": 1.3246994692046837e-05,
"loss": 1.3269885778427124,
"step": 189
},
{
"epoch": 1.294017094017094,
"grad_norm": 0.9726372957229614,
"learning_rate": 1.3175818189031326e-05,
"loss": 1.337971806526184,
"step": 190
},
{
"epoch": 1.300854700854701,
"grad_norm": 0.9480524659156799,
"learning_rate": 1.3104462294966895e-05,
"loss": 1.287239670753479,
"step": 191
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.9071521162986755,
"learning_rate": 1.3032931040502627e-05,
"loss": 1.2962584495544434,
"step": 192
},
{
"epoch": 1.3145299145299145,
"grad_norm": 0.9058794379234314,
"learning_rate": 1.2961228466193116e-05,
"loss": 1.280348300933838,
"step": 193
},
{
"epoch": 1.3213675213675213,
"grad_norm": 0.9048560261726379,
"learning_rate": 1.2889358622270225e-05,
"loss": 1.3330844640731812,
"step": 194
},
{
"epoch": 1.3282051282051281,
"grad_norm": 0.945749819278717,
"learning_rate": 1.2817325568414299e-05,
"loss": 1.3170994520187378,
"step": 195
},
{
"epoch": 1.335042735042735,
"grad_norm": 0.9457980394363403,
"learning_rate": 1.2745133373524855e-05,
"loss": 1.3166072368621826,
"step": 196
},
{
"epoch": 1.341880341880342,
"grad_norm": 0.9297810196876526,
"learning_rate": 1.267278611549073e-05,
"loss": 1.3273459672927856,
"step": 197
},
{
"epoch": 1.3487179487179488,
"grad_norm": 0.9370136260986328,
"learning_rate": 1.2600287880959762e-05,
"loss": 1.3432742357254028,
"step": 198
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.904547393321991,
"learning_rate": 1.2527642765107919e-05,
"loss": 1.3275690078735352,
"step": 199
},
{
"epoch": 1.3623931623931624,
"grad_norm": 0.9034311175346375,
"learning_rate": 1.2454854871407993e-05,
"loss": 1.3097259998321533,
"step": 200
},
{
"epoch": 1.3623931623931624,
"eval_loss": 1.4159187078475952,
"eval_runtime": 13.7977,
"eval_samples_per_second": 71.461,
"eval_steps_per_second": 8.987,
"step": 200
},
{
"epoch": 1.3692307692307693,
"grad_norm": 0.8713945150375366,
"learning_rate": 1.2381928311397806e-05,
"loss": 1.2865114212036133,
"step": 201
},
{
"epoch": 1.376068376068376,
"grad_norm": 0.8947977423667908,
"learning_rate": 1.2308867204447958e-05,
"loss": 1.277376651763916,
"step": 202
},
{
"epoch": 1.3829059829059829,
"grad_norm": 0.9047794342041016,
"learning_rate": 1.2235675677529158e-05,
"loss": 1.288478970527649,
"step": 203
},
{
"epoch": 1.3897435897435897,
"grad_norm": 0.8953425884246826,
"learning_rate": 1.2162357864979073e-05,
"loss": 1.2861666679382324,
"step": 204
},
{
"epoch": 1.3965811965811965,
"grad_norm": 0.9369704723358154,
"learning_rate": 1.2088917908268822e-05,
"loss": 1.2857511043548584,
"step": 205
},
{
"epoch": 1.4034188034188033,
"grad_norm": 0.887296736240387,
"learning_rate": 1.2015359955769021e-05,
"loss": 1.2925364971160889,
"step": 206
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.875452995300293,
"learning_rate": 1.1941688162515468e-05,
"loss": 1.3017300367355347,
"step": 207
},
{
"epoch": 1.4170940170940172,
"grad_norm": 0.8836603760719299,
"learning_rate": 1.186790668997443e-05,
"loss": 1.2731754779815674,
"step": 208
},
{
"epoch": 1.423931623931624,
"grad_norm": 0.8866926431655884,
"learning_rate": 1.1794019705807584e-05,
"loss": 1.3009804487228394,
"step": 209
},
{
"epoch": 1.4307692307692308,
"grad_norm": 0.8414238095283508,
"learning_rate": 1.1720031383636585e-05,
"loss": 1.3082433938980103,
"step": 210
},
{
"epoch": 1.4376068376068376,
"grad_norm": 0.8662127256393433,
"learning_rate": 1.164594590280734e-05,
"loss": 1.2641851902008057,
"step": 211
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.9151703119277954,
"learning_rate": 1.15717674481539e-05,
"loss": 1.3064939975738525,
"step": 212
},
{
"epoch": 1.4512820512820512,
"grad_norm": 0.9086518883705139,
"learning_rate": 1.1497500209762102e-05,
"loss": 1.3118016719818115,
"step": 213
},
{
"epoch": 1.458119658119658,
"grad_norm": 0.9340091347694397,
"learning_rate": 1.1423148382732854e-05,
"loss": 1.3228766918182373,
"step": 214
},
{
"epoch": 1.464957264957265,
"grad_norm": 0.865403950214386,
"learning_rate": 1.1348716166945195e-05,
"loss": 1.2863235473632812,
"step": 215
},
{
"epoch": 1.471794871794872,
"grad_norm": 0.8879923224449158,
"learning_rate": 1.127420776681905e-05,
"loss": 1.3169306516647339,
"step": 216
},
{
"epoch": 1.4786324786324787,
"grad_norm": 0.8761537075042725,
"learning_rate": 1.1199627391077732e-05,
"loss": 1.2758698463439941,
"step": 217
},
{
"epoch": 1.4854700854700855,
"grad_norm": 0.905274510383606,
"learning_rate": 1.1124979252510209e-05,
"loss": 1.3158073425292969,
"step": 218
},
{
"epoch": 1.4923076923076923,
"grad_norm": 0.9052457213401794,
"learning_rate": 1.105026756773314e-05,
"loss": 1.3242114782333374,
"step": 219
},
{
"epoch": 1.4991452991452991,
"grad_norm": 0.8539809584617615,
"learning_rate": 1.0975496556952683e-05,
"loss": 1.295405387878418,
"step": 220
},
{
"epoch": 1.505982905982906,
"grad_norm": 0.9171442985534668,
"learning_rate": 1.0900670443726136e-05,
"loss": 1.3160406351089478,
"step": 221
},
{
"epoch": 1.5128205128205128,
"grad_norm": 0.877983570098877,
"learning_rate": 1.0825793454723325e-05,
"loss": 1.315245509147644,
"step": 222
},
{
"epoch": 1.5196581196581196,
"grad_norm": 0.8745649456977844,
"learning_rate": 1.0750869819487884e-05,
"loss": 1.3248393535614014,
"step": 223
},
{
"epoch": 1.5264957264957264,
"grad_norm": 0.8661232590675354,
"learning_rate": 1.0675903770198333e-05,
"loss": 1.2788147926330566,
"step": 224
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.8793037533760071,
"learning_rate": 1.0600899541429004e-05,
"loss": 1.288352608680725,
"step": 225
},
{
"epoch": 1.54017094017094,
"grad_norm": 0.9148133397102356,
"learning_rate": 1.0525861369910877e-05,
"loss": 1.3211514949798584,
"step": 226
},
{
"epoch": 1.547008547008547,
"grad_norm": 0.9006965160369873,
"learning_rate": 1.0450793494292223e-05,
"loss": 1.3327584266662598,
"step": 227
},
{
"epoch": 1.5538461538461539,
"grad_norm": 0.8701738119125366,
"learning_rate": 1.0375700154899208e-05,
"loss": 1.3010832071304321,
"step": 228
},
{
"epoch": 1.5606837606837607,
"grad_norm": 0.880436360836029,
"learning_rate": 1.0300585593496348e-05,
"loss": 1.3152333498001099,
"step": 229
},
{
"epoch": 1.5675213675213675,
"grad_norm": 0.8781545758247375,
"learning_rate": 1.0225454053046922e-05,
"loss": 1.2808175086975098,
"step": 230
},
{
"epoch": 1.5743589743589743,
"grad_norm": 0.8630225658416748,
"learning_rate": 1.0150309777473305e-05,
"loss": 1.2873480319976807,
"step": 231
},
{
"epoch": 1.5811965811965814,
"grad_norm": 0.8928260803222656,
"learning_rate": 1.007515701141722e-05,
"loss": 1.28458571434021,
"step": 232
},
{
"epoch": 1.5880341880341882,
"grad_norm": 0.8699108958244324,
"learning_rate": 1e-05,
"loss": 1.2885918617248535,
"step": 233
},
{
"epoch": 1.594871794871795,
"grad_norm": 0.8759332895278931,
"learning_rate": 9.924842988582783e-06,
"loss": 1.2787448167800903,
"step": 234
},
{
"epoch": 1.6017094017094018,
"grad_norm": 0.8956566452980042,
"learning_rate": 9.849690222526698e-06,
"loss": 1.304962158203125,
"step": 235
},
{
"epoch": 1.6085470085470086,
"grad_norm": 0.8675941824913025,
"learning_rate": 9.77454594695308e-06,
"loss": 1.2871266603469849,
"step": 236
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.9092246294021606,
"learning_rate": 9.699414406503655e-06,
"loss": 1.327986240386963,
"step": 237
},
{
"epoch": 1.6222222222222222,
"grad_norm": 0.8909919857978821,
"learning_rate": 9.624299845100795e-06,
"loss": 1.2647631168365479,
"step": 238
},
{
"epoch": 1.629059829059829,
"grad_norm": 0.8657082915306091,
"learning_rate": 9.549206505707778e-06,
"loss": 1.294311761856079,
"step": 239
},
{
"epoch": 1.6358974358974359,
"grad_norm": 0.8618515133857727,
"learning_rate": 9.474138630089124e-06,
"loss": 1.3014901876449585,
"step": 240
},
{
"epoch": 1.6427350427350427,
"grad_norm": 0.8630589246749878,
"learning_rate": 9.399100458570998e-06,
"loss": 1.293131709098816,
"step": 241
},
{
"epoch": 1.6495726495726495,
"grad_norm": 0.8735710978507996,
"learning_rate": 9.324096229801673e-06,
"loss": 1.290333867073059,
"step": 242
},
{
"epoch": 1.6564102564102563,
"grad_norm": 0.8574416041374207,
"learning_rate": 9.249130180512118e-06,
"loss": 1.3111311197280884,
"step": 243
},
{
"epoch": 1.6632478632478631,
"grad_norm": 0.9102303981781006,
"learning_rate": 9.174206545276678e-06,
"loss": 1.271691083908081,
"step": 244
},
{
"epoch": 1.67008547008547,
"grad_norm": 0.867579996585846,
"learning_rate": 9.099329556273866e-06,
"loss": 1.3228224515914917,
"step": 245
},
{
"epoch": 1.676923076923077,
"grad_norm": 0.8179166316986084,
"learning_rate": 9.024503443047318e-06,
"loss": 1.3084717988967896,
"step": 246
},
{
"epoch": 1.6837606837606838,
"grad_norm": 0.8923108577728271,
"learning_rate": 8.949732432266867e-06,
"loss": 1.2903640270233154,
"step": 247
},
{
"epoch": 1.6905982905982906,
"grad_norm": 0.9241410493850708,
"learning_rate": 8.875020747489795e-06,
"loss": 1.302449345588684,
"step": 248
},
{
"epoch": 1.6974358974358974,
"grad_norm": 0.8430485129356384,
"learning_rate": 8.800372608922272e-06,
"loss": 1.2765015363693237,
"step": 249
},
{
"epoch": 1.7042735042735044,
"grad_norm": 0.8592954874038696,
"learning_rate": 8.72579223318095e-06,
"loss": 1.317484736442566,
"step": 250
},
{
"epoch": 1.7042735042735044,
"eval_loss": 1.4088929891586304,
"eval_runtime": 13.7993,
"eval_samples_per_second": 71.453,
"eval_steps_per_second": 8.986,
"step": 250
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.916032612323761,
"learning_rate": 8.65128383305481e-06,
"loss": 1.300941824913025,
"step": 251
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.8675019145011902,
"learning_rate": 8.576851617267151e-06,
"loss": 1.3122076988220215,
"step": 252
},
{
"epoch": 1.7247863247863249,
"grad_norm": 0.8310043811798096,
"learning_rate": 8.5024997902379e-06,
"loss": 1.3160263299942017,
"step": 253
},
{
"epoch": 1.7316239316239317,
"grad_norm": 0.8706823587417603,
"learning_rate": 8.428232551846101e-06,
"loss": 1.2773703336715698,
"step": 254
},
{
"epoch": 1.7384615384615385,
"grad_norm": 0.8875864744186401,
"learning_rate": 8.35405409719266e-06,
"loss": 1.288883090019226,
"step": 255
},
{
"epoch": 1.7452991452991453,
"grad_norm": 0.9055056571960449,
"learning_rate": 8.279968616363417e-06,
"loss": 1.3028110265731812,
"step": 256
},
{
"epoch": 1.7521367521367521,
"grad_norm": 0.905623197555542,
"learning_rate": 8.205980294192421e-06,
"loss": 1.3112901449203491,
"step": 257
},
{
"epoch": 1.758974358974359,
"grad_norm": 0.847100555896759,
"learning_rate": 8.132093310025572e-06,
"loss": 1.311500906944275,
"step": 258
},
{
"epoch": 1.7658119658119658,
"grad_norm": 0.8671444058418274,
"learning_rate": 8.058311837484537e-06,
"loss": 1.308862566947937,
"step": 259
},
{
"epoch": 1.7726495726495726,
"grad_norm": 0.844569742679596,
"learning_rate": 7.984640044230984e-06,
"loss": 1.3032524585723877,
"step": 260
},
{
"epoch": 1.7794871794871794,
"grad_norm": 0.9013960957527161,
"learning_rate": 7.911082091731182e-06,
"loss": 1.2791337966918945,
"step": 261
},
{
"epoch": 1.7863247863247862,
"grad_norm": 0.8714650869369507,
"learning_rate": 7.837642135020929e-06,
"loss": 1.2602317333221436,
"step": 262
},
{
"epoch": 1.793162393162393,
"grad_norm": 0.9024747014045715,
"learning_rate": 7.764324322470842e-06,
"loss": 1.279998540878296,
"step": 263
},
{
"epoch": 1.8,
"grad_norm": 0.8714993596076965,
"learning_rate": 7.691132795552044e-06,
"loss": 1.284783959388733,
"step": 264
},
{
"epoch": 1.8068376068376069,
"grad_norm": 0.8371661305427551,
"learning_rate": 7.618071688602199e-06,
"loss": 1.3234297037124634,
"step": 265
},
{
"epoch": 1.8136752136752137,
"grad_norm": 0.8943991661071777,
"learning_rate": 7.545145128592009e-06,
"loss": 1.2969616651535034,
"step": 266
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.8753275275230408,
"learning_rate": 7.472357234892083e-06,
"loss": 1.2795380353927612,
"step": 267
},
{
"epoch": 1.8273504273504273,
"grad_norm": 0.8614721894264221,
"learning_rate": 7.3997121190402375e-06,
"loss": 1.3064361810684204,
"step": 268
},
{
"epoch": 1.8341880341880343,
"grad_norm": 0.853656530380249,
"learning_rate": 7.3272138845092725e-06,
"loss": 1.3017405271530151,
"step": 269
},
{
"epoch": 1.8410256410256411,
"grad_norm": 0.8655431866645813,
"learning_rate": 7.254866626475152e-06,
"loss": 1.304486632347107,
"step": 270
},
{
"epoch": 1.847863247863248,
"grad_norm": 0.87064528465271,
"learning_rate": 7.182674431585703e-06,
"loss": 1.2795239686965942,
"step": 271
},
{
"epoch": 1.8547008547008548,
"grad_norm": 0.8889244198799133,
"learning_rate": 7.110641377729778e-06,
"loss": 1.294914960861206,
"step": 272
},
{
"epoch": 1.8615384615384616,
"grad_norm": 0.9096329212188721,
"learning_rate": 7.038771533806884e-06,
"loss": 1.2885854244232178,
"step": 273
},
{
"epoch": 1.8683760683760684,
"grad_norm": 0.8873443007469177,
"learning_rate": 6.967068959497376e-06,
"loss": 1.297377347946167,
"step": 274
},
{
"epoch": 1.8752136752136752,
"grad_norm": 0.8182293772697449,
"learning_rate": 6.895537705033108e-06,
"loss": 1.3091909885406494,
"step": 275
},
{
"epoch": 1.882051282051282,
"grad_norm": 0.849620521068573,
"learning_rate": 6.824181810968675e-06,
"loss": 1.2712843418121338,
"step": 276
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.8953171372413635,
"learning_rate": 6.7530053079531664e-06,
"loss": 1.305629849433899,
"step": 277
},
{
"epoch": 1.8957264957264957,
"grad_norm": 0.8743292689323425,
"learning_rate": 6.6820122165024845e-06,
"loss": 1.3009774684906006,
"step": 278
},
{
"epoch": 1.9025641025641025,
"grad_norm": 0.8852370977401733,
"learning_rate": 6.6112065467722375e-06,
"loss": 1.2898852825164795,
"step": 279
},
{
"epoch": 1.9094017094017093,
"grad_norm": 0.8812291026115417,
"learning_rate": 6.540592298331239e-06,
"loss": 1.3161499500274658,
"step": 280
},
{
"epoch": 1.916239316239316,
"grad_norm": 0.8949340581893921,
"learning_rate": 6.4701734599355605e-06,
"loss": 1.2947360277175903,
"step": 281
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.8372949957847595,
"learning_rate": 6.3999540093032396e-06,
"loss": 1.263576626777649,
"step": 282
},
{
"epoch": 1.92991452991453,
"grad_norm": 0.8882158398628235,
"learning_rate": 6.329937912889582e-06,
"loss": 1.2893450260162354,
"step": 283
},
{
"epoch": 1.9367521367521368,
"grad_norm": 0.838527500629425,
"learning_rate": 6.260129125663106e-06,
"loss": 1.2985213994979858,
"step": 284
},
{
"epoch": 1.9435897435897436,
"grad_norm": 0.8823593258857727,
"learning_rate": 6.1905315908821584e-06,
"loss": 1.306897521018982,
"step": 285
},
{
"epoch": 1.9504273504273504,
"grad_norm": 0.8618027567863464,
"learning_rate": 6.121149239872151e-06,
"loss": 1.2990589141845703,
"step": 286
},
{
"epoch": 1.9572649572649574,
"grad_norm": 0.8389527797698975,
"learning_rate": 6.051985991803517e-06,
"loss": 1.2886924743652344,
"step": 287
},
{
"epoch": 1.9641025641025642,
"grad_norm": 0.8738916516304016,
"learning_rate": 5.983045753470308e-06,
"loss": 1.3003113269805908,
"step": 288
},
{
"epoch": 1.970940170940171,
"grad_norm": 0.8567415475845337,
"learning_rate": 5.91433241906952e-06,
"loss": 1.285038948059082,
"step": 289
},
{
"epoch": 1.9777777777777779,
"grad_norm": 0.8555871248245239,
"learning_rate": 5.845849869981137e-06,
"loss": 1.2825312614440918,
"step": 290
},
{
"epoch": 1.9846153846153847,
"grad_norm": 0.8524548411369324,
"learning_rate": 5.7776019745488665e-06,
"loss": 1.3078036308288574,
"step": 291
},
{
"epoch": 1.9914529914529915,
"grad_norm": 0.8610931634902954,
"learning_rate": 5.709592587861637e-06,
"loss": 1.2933144569396973,
"step": 292
},
{
"epoch": 1.9982905982905983,
"grad_norm": 0.8547428250312805,
"learning_rate": 5.641825551535849e-06,
"loss": 1.2723497152328491,
"step": 293
},
{
"epoch": 2.0,
"grad_norm": 1.6815301179885864,
"learning_rate": 5.574304693498346e-06,
"loss": 1.260840892791748,
"step": 294
},
{
"epoch": 2.006837606837607,
"grad_norm": 1.1894463300704956,
"learning_rate": 5.507033827770225e-06,
"loss": 1.2158567905426025,
"step": 295
},
{
"epoch": 2.0136752136752136,
"grad_norm": 1.1574074029922485,
"learning_rate": 5.440016754251364e-06,
"loss": 1.188340663909912,
"step": 296
},
{
"epoch": 2.0205128205128204,
"grad_norm": 0.9981362819671631,
"learning_rate": 5.373257258505798e-06,
"loss": 1.1729332208633423,
"step": 297
},
{
"epoch": 2.0273504273504273,
"grad_norm": 1.0496586561203003,
"learning_rate": 5.306759111547881e-06,
"loss": 1.1735312938690186,
"step": 298
},
{
"epoch": 2.034188034188034,
"grad_norm": 0.9409749507904053,
"learning_rate": 5.240526069629265e-06,
"loss": 1.198318600654602,
"step": 299
},
{
"epoch": 2.041025641025641,
"grad_norm": 0.9382721781730652,
"learning_rate": 5.174561874026741e-06,
"loss": 1.2194828987121582,
"step": 300
},
{
"epoch": 2.041025641025641,
"eval_loss": 1.4175776243209839,
"eval_runtime": 13.7699,
"eval_samples_per_second": 71.606,
"eval_steps_per_second": 9.005,
"step": 300
},
{
"epoch": 2.0478632478632477,
"grad_norm": 0.936610996723175,
"learning_rate": 5.1088702508308815e-06,
"loss": 1.2439236640930176,
"step": 301
},
{
"epoch": 2.0547008547008545,
"grad_norm": 0.9476950764656067,
"learning_rate": 5.043454910735595e-06,
"loss": 1.2119914293289185,
"step": 302
},
{
"epoch": 2.0615384615384613,
"grad_norm": 0.975143313407898,
"learning_rate": 4.978319548828504e-06,
"loss": 1.1766479015350342,
"step": 303
},
{
"epoch": 2.0683760683760686,
"grad_norm": 0.9535344243049622,
"learning_rate": 4.913467844382217e-06,
"loss": 1.2154781818389893,
"step": 304
},
{
"epoch": 2.0752136752136754,
"grad_norm": 0.9839100241661072,
"learning_rate": 4.848903460646522e-06,
"loss": 1.1973791122436523,
"step": 305
},
{
"epoch": 2.082051282051282,
"grad_norm": 0.9296822547912598,
"learning_rate": 4.784630044641435e-06,
"loss": 1.2077343463897705,
"step": 306
},
{
"epoch": 2.088888888888889,
"grad_norm": 0.9518297910690308,
"learning_rate": 4.720651226951213e-06,
"loss": 1.2044742107391357,
"step": 307
},
{
"epoch": 2.095726495726496,
"grad_norm": 0.9024590253829956,
"learning_rate": 4.65697062151927e-06,
"loss": 1.2214324474334717,
"step": 308
},
{
"epoch": 2.1025641025641026,
"grad_norm": 0.8939958214759827,
"learning_rate": 4.593591825444028e-06,
"loss": 1.230959177017212,
"step": 309
},
{
"epoch": 2.1094017094017095,
"grad_norm": 0.9565759301185608,
"learning_rate": 4.530518418775734e-06,
"loss": 1.2308049201965332,
"step": 310
},
{
"epoch": 2.1162393162393163,
"grad_norm": 0.8952397704124451,
"learning_rate": 4.467753964314245e-06,
"loss": 1.2218645811080933,
"step": 311
},
{
"epoch": 2.123076923076923,
"grad_norm": 0.9192137122154236,
"learning_rate": 4.40530200740777e-06,
"loss": 1.1945393085479736,
"step": 312
},
{
"epoch": 2.12991452991453,
"grad_norm": 0.9151750206947327,
"learning_rate": 4.343166075752605e-06,
"loss": 1.1909265518188477,
"step": 313
},
{
"epoch": 2.1367521367521367,
"grad_norm": 0.912064790725708,
"learning_rate": 4.281349679193862e-06,
"loss": 1.176002860069275,
"step": 314
},
{
"epoch": 2.1435897435897435,
"grad_norm": 0.9001777172088623,
"learning_rate": 4.219856309527212e-06,
"loss": 1.2102347612380981,
"step": 315
},
{
"epoch": 2.1504273504273503,
"grad_norm": 0.9100410342216492,
"learning_rate": 4.1586894403016576e-06,
"loss": 1.2215776443481445,
"step": 316
},
{
"epoch": 2.157264957264957,
"grad_norm": 0.8823668360710144,
"learning_rate": 4.097852526623307e-06,
"loss": 1.1972424983978271,
"step": 317
},
{
"epoch": 2.164102564102564,
"grad_norm": 0.8945139050483704,
"learning_rate": 4.03734900496022e-06,
"loss": 1.2440537214279175,
"step": 318
},
{
"epoch": 2.1709401709401708,
"grad_norm": 0.858863890171051,
"learning_rate": 3.9771822929482825e-06,
"loss": 1.2240134477615356,
"step": 319
},
{
"epoch": 2.1777777777777776,
"grad_norm": 0.9579023122787476,
"learning_rate": 3.917355789198157e-06,
"loss": 1.1975905895233154,
"step": 320
},
{
"epoch": 2.184615384615385,
"grad_norm": 0.8992065191268921,
"learning_rate": 3.857872873103322e-06,
"loss": 1.2251243591308594,
"step": 321
},
{
"epoch": 2.1914529914529917,
"grad_norm": 0.8930969834327698,
"learning_rate": 3.7987369046491684e-06,
"loss": 1.1994602680206299,
"step": 322
},
{
"epoch": 2.1982905982905985,
"grad_norm": 0.8879907727241516,
"learning_rate": 3.7399512242231994e-06,
"loss": 1.2023355960845947,
"step": 323
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.8827998638153076,
"learning_rate": 3.6815191524263628e-06,
"loss": 1.1980074644088745,
"step": 324
},
{
"epoch": 2.211965811965812,
"grad_norm": 0.9081103801727295,
"learning_rate": 3.623443989885462e-06,
"loss": 1.2123109102249146,
"step": 325
},
{
"epoch": 2.218803418803419,
"grad_norm": 0.8658437132835388,
"learning_rate": 3.565729017066729e-06,
"loss": 1.1860473155975342,
"step": 326
},
{
"epoch": 2.2256410256410257,
"grad_norm": 0.8716210722923279,
"learning_rate": 3.508377494090521e-06,
"loss": 1.246274471282959,
"step": 327
},
{
"epoch": 2.2324786324786325,
"grad_norm": 0.8930105566978455,
"learning_rate": 3.4513926605471504e-06,
"loss": 1.2249618768692017,
"step": 328
},
{
"epoch": 2.2393162393162394,
"grad_norm": 0.8859133720397949,
"learning_rate": 3.3947777353139188e-06,
"loss": 1.2300435304641724,
"step": 329
},
{
"epoch": 2.246153846153846,
"grad_norm": 0.876879096031189,
"learning_rate": 3.338535916373267e-06,
"loss": 1.226067066192627,
"step": 330
},
{
"epoch": 2.252991452991453,
"grad_norm": 0.8582764863967896,
"learning_rate": 3.2826703806321526e-06,
"loss": 1.2141978740692139,
"step": 331
},
{
"epoch": 2.25982905982906,
"grad_norm": 0.9050947427749634,
"learning_rate": 3.2271842837425917e-06,
"loss": 1.199479103088379,
"step": 332
},
{
"epoch": 2.2666666666666666,
"grad_norm": 0.8743166923522949,
"learning_rate": 3.1720807599233903e-06,
"loss": 1.2526406049728394,
"step": 333
},
{
"epoch": 2.2735042735042734,
"grad_norm": 0.9142019152641296,
"learning_rate": 3.1173629217831345e-06,
"loss": 1.1963285207748413,
"step": 334
},
{
"epoch": 2.2803418803418802,
"grad_norm": 0.8888209462165833,
"learning_rate": 3.063033860144339e-06,
"loss": 1.209120512008667,
"step": 335
},
{
"epoch": 2.287179487179487,
"grad_norm": 0.8925624489784241,
"learning_rate": 3.0090966438688774e-06,
"loss": 1.1804795265197754,
"step": 336
},
{
"epoch": 2.294017094017094,
"grad_norm": 0.9087634682655334,
"learning_rate": 2.9555543196846293e-06,
"loss": 1.2147403955459595,
"step": 337
},
{
"epoch": 2.3008547008547007,
"grad_norm": 0.9099950194358826,
"learning_rate": 2.9024099120133674e-06,
"loss": 1.2237548828125,
"step": 338
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.8658971786499023,
"learning_rate": 2.8496664227999417e-06,
"loss": 1.2072890996932983,
"step": 339
},
{
"epoch": 2.3145299145299143,
"grad_norm": 0.8897408843040466,
"learning_rate": 2.7973268313426836e-06,
"loss": 1.2147533893585205,
"step": 340
},
{
"epoch": 2.3213675213675216,
"grad_norm": 0.8564779758453369,
"learning_rate": 2.745394094125141e-06,
"loss": 1.2456395626068115,
"step": 341
},
{
"epoch": 2.3282051282051284,
"grad_norm": 0.8652287125587463,
"learning_rate": 2.6938711446490607e-06,
"loss": 1.2109252214431763,
"step": 342
},
{
"epoch": 2.335042735042735,
"grad_norm": 0.8643552660942078,
"learning_rate": 2.642760893268684e-06,
"loss": 1.1878920793533325,
"step": 343
},
{
"epoch": 2.341880341880342,
"grad_norm": 0.8824043869972229,
"learning_rate": 2.5920662270263653e-06,
"loss": 1.1911319494247437,
"step": 344
},
{
"epoch": 2.348717948717949,
"grad_norm": 0.8898422122001648,
"learning_rate": 2.541790009489474e-06,
"loss": 1.193242073059082,
"step": 345
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.8772786259651184,
"learning_rate": 2.491935080588658e-06,
"loss": 1.1836318969726562,
"step": 346
},
{
"epoch": 2.3623931623931624,
"grad_norm": 0.8587839603424072,
"learning_rate": 2.4425042564574186e-06,
"loss": 1.2118480205535889,
"step": 347
},
{
"epoch": 2.3692307692307693,
"grad_norm": 0.8739367127418518,
"learning_rate": 2.3935003292730295e-06,
"loss": 1.2201834917068481,
"step": 348
},
{
"epoch": 2.376068376068376,
"grad_norm": 0.8904187679290771,
"learning_rate": 2.344926067098836e-06,
"loss": 1.1912821531295776,
"step": 349
},
{
"epoch": 2.382905982905983,
"grad_norm": 0.8717731237411499,
"learning_rate": 2.2967842137278706e-06,
"loss": 1.2726080417633057,
"step": 350
},
{
"epoch": 2.382905982905983,
"eval_loss": 1.422935962677002,
"eval_runtime": 13.7932,
"eval_samples_per_second": 71.484,
"eval_steps_per_second": 8.99,
"step": 350
},
{
"epoch": 2.3897435897435897,
"grad_norm": 0.8623640537261963,
"learning_rate": 2.249077488527891e-06,
"loss": 1.1917917728424072,
"step": 351
},
{
"epoch": 2.3965811965811965,
"grad_norm": 0.9295298457145691,
"learning_rate": 2.201808586287757e-06,
"loss": 1.195438027381897,
"step": 352
},
{
"epoch": 2.4034188034188033,
"grad_norm": 0.8726212382316589,
"learning_rate": 2.15498017706521e-06,
"loss": 1.1993173360824585,
"step": 353
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.8750997185707092,
"learning_rate": 2.1085949060360654e-06,
"loss": 1.2198253870010376,
"step": 354
},
{
"epoch": 2.417094017094017,
"grad_norm": 0.8799977898597717,
"learning_rate": 2.0626553933447734e-06,
"loss": 1.1714023351669312,
"step": 355
},
{
"epoch": 2.4239316239316238,
"grad_norm": 0.9106065034866333,
"learning_rate": 2.01716423395644e-06,
"loss": 1.2285724878311157,
"step": 356
},
{
"epoch": 2.430769230769231,
"grad_norm": 0.8555257320404053,
"learning_rate": 1.9721239975102313e-06,
"loss": 1.1813218593597412,
"step": 357
},
{
"epoch": 2.437606837606838,
"grad_norm": 0.8696889877319336,
"learning_rate": 1.9275372281742242e-06,
"loss": 1.2316478490829468,
"step": 358
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.9041836857795715,
"learning_rate": 1.8834064445016952e-06,
"loss": 1.2227892875671387,
"step": 359
},
{
"epoch": 2.4512820512820515,
"grad_norm": 0.8697716593742371,
"learning_rate": 1.8397341392888679e-06,
"loss": 1.224617600440979,
"step": 360
},
{
"epoch": 2.4581196581196583,
"grad_norm": 0.8882873058319092,
"learning_rate": 1.7965227794340879e-06,
"loss": 1.1995422840118408,
"step": 361
},
{
"epoch": 2.464957264957265,
"grad_norm": 0.8834539651870728,
"learning_rate": 1.7537748057984861e-06,
"loss": 1.2222732305526733,
"step": 362
},
{
"epoch": 2.471794871794872,
"grad_norm": 0.899989128112793,
"learning_rate": 1.7114926330680958e-06,
"loss": 1.2143341302871704,
"step": 363
},
{
"epoch": 2.4786324786324787,
"grad_norm": 0.8635477423667908,
"learning_rate": 1.6696786496174578e-06,
"loss": 1.2323466539382935,
"step": 364
},
{
"epoch": 2.4854700854700855,
"grad_norm": 0.8827865719795227,
"learning_rate": 1.6283352173747148e-06,
"loss": 1.1907907724380493,
"step": 365
},
{
"epoch": 2.4923076923076923,
"grad_norm": 0.8702190518379211,
"learning_rate": 1.587464671688187e-06,
"loss": 1.201211929321289,
"step": 366
},
{
"epoch": 2.499145299145299,
"grad_norm": 0.8626653552055359,
"learning_rate": 1.5470693211944643e-06,
"loss": 1.1894201040267944,
"step": 367
},
{
"epoch": 2.505982905982906,
"grad_norm": 0.879705011844635,
"learning_rate": 1.5071514476879878e-06,
"loss": 1.2102407217025757,
"step": 368
},
{
"epoch": 2.5128205128205128,
"grad_norm": 0.8780226707458496,
"learning_rate": 1.4677133059921634e-06,
"loss": 1.235593557357788,
"step": 369
},
{
"epoch": 2.5196581196581196,
"grad_norm": 0.8804551362991333,
"learning_rate": 1.4287571238320053e-06,
"loss": 1.2265985012054443,
"step": 370
},
{
"epoch": 2.5264957264957264,
"grad_norm": 0.8670660257339478,
"learning_rate": 1.3902851017082863e-06,
"loss": 1.1925873756408691,
"step": 371
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.8729323744773865,
"learning_rate": 1.3522994127732415e-06,
"loss": 1.20308518409729,
"step": 372
},
{
"epoch": 2.54017094017094,
"grad_norm": 0.8794763088226318,
"learning_rate": 1.3148022027078223e-06,
"loss": 1.2204805612564087,
"step": 373
},
{
"epoch": 2.547008547008547,
"grad_norm": 0.870823323726654,
"learning_rate": 1.2777955896004812e-06,
"loss": 1.2257260084152222,
"step": 374
},
{
"epoch": 2.5538461538461537,
"grad_norm": 0.8570955991744995,
"learning_rate": 1.2412816638275406e-06,
"loss": 1.2166708707809448,
"step": 375
},
{
"epoch": 2.5606837606837605,
"grad_norm": 0.8496021628379822,
"learning_rate": 1.2052624879351105e-06,
"loss": 1.1956825256347656,
"step": 376
},
{
"epoch": 2.5675213675213673,
"grad_norm": 0.8563467860221863,
"learning_rate": 1.1697400965225746e-06,
"loss": 1.2383781671524048,
"step": 377
},
{
"epoch": 2.574358974358974,
"grad_norm": 0.8653855919837952,
"learning_rate": 1.134716496127679e-06,
"loss": 1.218265414237976,
"step": 378
},
{
"epoch": 2.5811965811965814,
"grad_norm": 0.8653165698051453,
"learning_rate": 1.1001936651131717e-06,
"loss": 1.226462483406067,
"step": 379
},
{
"epoch": 2.588034188034188,
"grad_norm": 0.8810314536094666,
"learning_rate": 1.0661735535550666e-06,
"loss": 1.176276445388794,
"step": 380
},
{
"epoch": 2.594871794871795,
"grad_norm": 0.8538199663162231,
"learning_rate": 1.0326580831324816e-06,
"loss": 1.2393090724945068,
"step": 381
},
{
"epoch": 2.601709401709402,
"grad_norm": 0.849739134311676,
"learning_rate": 9.996491470190917e-07,
"loss": 1.2231508493423462,
"step": 382
},
{
"epoch": 2.6085470085470086,
"grad_norm": 0.891149640083313,
"learning_rate": 9.671486097761918e-07,
"loss": 1.2225626707077026,
"step": 383
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.8668763637542725,
"learning_rate": 9.351583072473713e-07,
"loss": 1.2182505130767822,
"step": 384
},
{
"epoch": 2.6222222222222222,
"grad_norm": 0.8931220173835754,
"learning_rate": 9.036800464548157e-07,
"loss": 1.1996538639068604,
"step": 385
},
{
"epoch": 2.629059829059829,
"grad_norm": 0.923690140247345,
"learning_rate": 8.727156054972374e-07,
"loss": 1.238417148590088,
"step": 386
},
{
"epoch": 2.635897435897436,
"grad_norm": 0.9119179844856262,
"learning_rate": 8.42266733449425e-07,
"loss": 1.226833462715149,
"step": 387
},
{
"epoch": 2.6427350427350427,
"grad_norm": 0.8686037659645081,
"learning_rate": 8.123351502634625e-07,
"loss": 1.1834110021591187,
"step": 388
},
{
"epoch": 2.6495726495726495,
"grad_norm": 0.8596007823944092,
"learning_rate": 7.829225466715551e-07,
"loss": 1.1922662258148193,
"step": 389
},
{
"epoch": 2.6564102564102563,
"grad_norm": 0.8411397337913513,
"learning_rate": 7.540305840905371e-07,
"loss": 1.2220802307128906,
"step": 390
},
{
"epoch": 2.663247863247863,
"grad_norm": 0.8473320007324219,
"learning_rate": 7.256608945280319e-07,
"loss": 1.176034688949585,
"step": 391
},
{
"epoch": 2.67008547008547,
"grad_norm": 0.8465791940689087,
"learning_rate": 6.978150804902451e-07,
"loss": 1.2118513584136963,
"step": 392
},
{
"epoch": 2.676923076923077,
"grad_norm": 0.8556994199752808,
"learning_rate": 6.704947148914608e-07,
"loss": 1.2035595178604126,
"step": 393
},
{
"epoch": 2.683760683760684,
"grad_norm": 0.8603663444519043,
"learning_rate": 6.437013409651849e-07,
"loss": 1.2043513059616089,
"step": 394
},
{
"epoch": 2.690598290598291,
"grad_norm": 0.8347552418708801,
"learning_rate": 6.174364721769744e-07,
"loss": 1.260666847229004,
"step": 395
},
{
"epoch": 2.6974358974358976,
"grad_norm": 0.867624044418335,
"learning_rate": 5.917015921389569e-07,
"loss": 1.2071622610092163,
"step": 396
},
{
"epoch": 2.7042735042735044,
"grad_norm": 0.8668217062950134,
"learning_rate": 5.664981545260073e-07,
"loss": 1.197313904762268,
"step": 397
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.8758941292762756,
"learning_rate": 5.418275829936537e-07,
"loss": 1.1844048500061035,
"step": 398
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.866844892501831,
"learning_rate": 5.176912710976467e-07,
"loss": 1.1971948146820068,
"step": 399
},
{
"epoch": 2.724786324786325,
"grad_norm": 0.8587160110473633,
"learning_rate": 4.940905822152454e-07,
"loss": 1.1895333528518677,
"step": 400
},
{
"epoch": 2.724786324786325,
"eval_loss": 1.4216117858886719,
"eval_runtime": 13.782,
"eval_samples_per_second": 71.543,
"eval_steps_per_second": 8.997,
"step": 400
},
{
"epoch": 2.7316239316239317,
"grad_norm": 0.8763930201530457,
"learning_rate": 4.710268494682146e-07,
"loss": 1.1914920806884766,
"step": 401
},
{
"epoch": 2.7384615384615385,
"grad_norm": 0.8831557035446167,
"learning_rate": 4.485013756475076e-07,
"loss": 1.1900079250335693,
"step": 402
},
{
"epoch": 2.7452991452991453,
"grad_norm": 0.866532027721405,
"learning_rate": 4.265154331396815e-07,
"loss": 1.1844745874404907,
"step": 403
},
{
"epoch": 2.752136752136752,
"grad_norm": 0.8787288069725037,
"learning_rate": 4.0507026385502747e-07,
"loss": 1.2126126289367676,
"step": 404
},
{
"epoch": 2.758974358974359,
"grad_norm": 0.8669936060905457,
"learning_rate": 3.841670791574137e-07,
"loss": 1.229267954826355,
"step": 405
},
{
"epoch": 2.7658119658119658,
"grad_norm": 0.8436914086341858,
"learning_rate": 3.638070597958665e-07,
"loss": 1.1994611024856567,
"step": 406
},
{
"epoch": 2.7726495726495726,
"grad_norm": 0.8477561473846436,
"learning_rate": 3.439913558378705e-07,
"loss": 1.2160733938217163,
"step": 407
},
{
"epoch": 2.7794871794871794,
"grad_norm": 0.9217561483383179,
"learning_rate": 3.2472108660439706e-07,
"loss": 1.1882672309875488,
"step": 408
},
{
"epoch": 2.786324786324786,
"grad_norm": 0.8692064881324768,
"learning_rate": 3.059973406066963e-07,
"loss": 1.186108112335205,
"step": 409
},
{
"epoch": 2.793162393162393,
"grad_norm": 0.8593800067901611,
"learning_rate": 2.878211754847926e-07,
"loss": 1.2128371000289917,
"step": 410
},
{
"epoch": 2.8,
"grad_norm": 0.8875913023948669,
"learning_rate": 2.701936179477516e-07,
"loss": 1.1906311511993408,
"step": 411
},
{
"epoch": 2.8068376068376066,
"grad_norm": 0.8833599090576172,
"learning_rate": 2.5311566371568505e-07,
"loss": 1.1937415599822998,
"step": 412
},
{
"epoch": 2.8136752136752134,
"grad_norm": 0.8523573279380798,
"learning_rate": 2.3658827746349976e-07,
"loss": 1.1862268447875977,
"step": 413
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.8653656244277954,
"learning_rate": 2.206123927664161e-07,
"loss": 1.2255483865737915,
"step": 414
},
{
"epoch": 2.827350427350427,
"grad_norm": 0.874724805355072,
"learning_rate": 2.0518891204722169e-07,
"loss": 1.2177876234054565,
"step": 415
},
{
"epoch": 2.8341880341880343,
"grad_norm": 0.8411559462547302,
"learning_rate": 1.903187065253076e-07,
"loss": 1.2034833431243896,
"step": 416
},
{
"epoch": 2.841025641025641,
"grad_norm": 0.8371963500976562,
"learning_rate": 1.7600261616745106e-07,
"loss": 1.1710231304168701,
"step": 417
},
{
"epoch": 2.847863247863248,
"grad_norm": 0.8555141687393188,
"learning_rate": 1.622414496403668e-07,
"loss": 1.2024474143981934,
"step": 418
},
{
"epoch": 2.8547008547008548,
"grad_norm": 0.8661652207374573,
"learning_rate": 1.490359842650324e-07,
"loss": 1.2498114109039307,
"step": 419
},
{
"epoch": 2.8615384615384616,
"grad_norm": 0.8592333197593689,
"learning_rate": 1.3638696597277678e-07,
"loss": 1.2100580930709839,
"step": 420
},
{
"epoch": 2.8683760683760684,
"grad_norm": 0.8594926595687866,
"learning_rate": 1.2429510926314835e-07,
"loss": 1.1787865161895752,
"step": 421
},
{
"epoch": 2.875213675213675,
"grad_norm": 0.8879026174545288,
"learning_rate": 1.1276109716355288e-07,
"loss": 1.2315534353256226,
"step": 422
},
{
"epoch": 2.882051282051282,
"grad_norm": 0.8497971892356873,
"learning_rate": 1.0178558119067316e-07,
"loss": 1.2027359008789062,
"step": 423
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.8838421106338501,
"learning_rate": 9.136918131366412e-08,
"loss": 1.2284358739852905,
"step": 424
},
{
"epoch": 2.8957264957264957,
"grad_norm": 0.8940805196762085,
"learning_rate": 8.151248591913519e-08,
"loss": 1.2018911838531494,
"step": 425
},
{
"epoch": 2.9025641025641025,
"grad_norm": 0.8463784456253052,
"learning_rate": 7.22160517779169e-08,
"loss": 1.2137906551361084,
"step": 426
},
{
"epoch": 2.9094017094017093,
"grad_norm": 0.8508373498916626,
"learning_rate": 6.348040401360833e-08,
"loss": 1.2048455476760864,
"step": 427
},
{
"epoch": 2.916239316239316,
"grad_norm": 0.8702911138534546,
"learning_rate": 5.530603607290852e-08,
"loss": 1.216880202293396,
"step": 428
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.8441773653030396,
"learning_rate": 4.7693409697756596e-08,
"loss": 1.2449169158935547,
"step": 429
},
{
"epoch": 2.92991452991453,
"grad_norm": 0.8643396496772766,
"learning_rate": 4.0642954899238196e-08,
"loss": 1.2154898643493652,
"step": 430
},
{
"epoch": 2.936752136752137,
"grad_norm": 0.8390621542930603,
"learning_rate": 3.4155069933301535e-08,
"loss": 1.1894207000732422,
"step": 431
},
{
"epoch": 2.943589743589744,
"grad_norm": 0.8889386057853699,
"learning_rate": 2.823012127825764e-08,
"loss": 1.2449326515197754,
"step": 432
},
{
"epoch": 2.9504273504273506,
"grad_norm": 0.8431465029716492,
"learning_rate": 2.2868443614082468e-08,
"loss": 1.1918964385986328,
"step": 433
},
{
"epoch": 2.9572649572649574,
"grad_norm": 0.859993577003479,
"learning_rate": 1.8070339803509805e-08,
"loss": 1.1882524490356445,
"step": 434
},
{
"epoch": 2.9641025641025642,
"grad_norm": 0.8584935069084167,
"learning_rate": 1.383608087492605e-08,
"loss": 1.2315739393234253,
"step": 435
},
{
"epoch": 2.970940170940171,
"grad_norm": 0.8648282289505005,
"learning_rate": 1.0165906007056914e-08,
"loss": 1.235274314880371,
"step": 436
},
{
"epoch": 2.977777777777778,
"grad_norm": 0.8602524399757385,
"learning_rate": 7.060022515460452e-09,
"loss": 1.1928036212921143,
"step": 437
},
{
"epoch": 2.9846153846153847,
"grad_norm": 0.8722023367881775,
"learning_rate": 4.5186058408153156e-09,
"loss": 1.2146607637405396,
"step": 438
},
{
"epoch": 2.9914529914529915,
"grad_norm": 0.8878926038742065,
"learning_rate": 2.5417995390086824e-09,
"loss": 1.1910994052886963,
"step": 439
},
{
"epoch": 2.9982905982905983,
"grad_norm": 0.8773415088653564,
"learning_rate": 1.129715273033849e-09,
"loss": 1.1811952590942383,
"step": 440
},
{
"epoch": 3.0,
"grad_norm": 1.841178059577942,
"learning_rate": 2.8243280667306084e-10,
"loss": 1.14687180519104,
"step": 441
},
{
"epoch": 3.0,
"step": 441,
"total_flos": 5.3379040973665075e+17,
"train_loss": 1.3470534840408637,
"train_runtime": 3006.8003,
"train_samples_per_second": 18.676,
"train_steps_per_second": 0.147
}
],
"logging_steps": 1.0,
"max_steps": 441,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.3379040973665075e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}