{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 92, "global_step": 813, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003692165685935156, "grad_norm": 1.7612229585647583, "learning_rate": 0.0, "loss": 0.0643, "step": 1 }, { "epoch": 0.007384331371870312, "grad_norm": 1.5242195129394531, "learning_rate": 4.878048780487805e-07, "loss": 0.0413, "step": 2 }, { "epoch": 0.01107649705780547, "grad_norm": 1.9117767810821533, "learning_rate": 9.75609756097561e-07, "loss": 0.0658, "step": 3 }, { "epoch": 0.014768662743740625, "grad_norm": 1.904267430305481, "learning_rate": 1.4634146341463414e-06, "loss": 0.063, "step": 4 }, { "epoch": 0.018460828429675783, "grad_norm": 1.6038223505020142, "learning_rate": 1.951219512195122e-06, "loss": 0.0546, "step": 5 }, { "epoch": 0.02215299411561094, "grad_norm": 0.9653654098510742, "learning_rate": 2.4390243902439027e-06, "loss": 0.0417, "step": 6 }, { "epoch": 0.025845159801546093, "grad_norm": 0.8979856371879578, "learning_rate": 2.926829268292683e-06, "loss": 0.0398, "step": 7 }, { "epoch": 0.02953732548748125, "grad_norm": 0.7607001066207886, "learning_rate": 3.414634146341464e-06, "loss": 0.0419, "step": 8 }, { "epoch": 0.033229491173416406, "grad_norm": 0.7884329557418823, "learning_rate": 3.902439024390244e-06, "loss": 0.0267, "step": 9 }, { "epoch": 0.036921656859351566, "grad_norm": 0.3715834617614746, "learning_rate": 4.390243902439025e-06, "loss": 0.0277, "step": 10 }, { "epoch": 0.04061382254528672, "grad_norm": 0.31171751022338867, "learning_rate": 4.8780487804878055e-06, "loss": 0.0296, "step": 11 }, { "epoch": 0.04430598823122188, "grad_norm": 0.41572287678718567, "learning_rate": 5.365853658536586e-06, "loss": 0.0184, "step": 12 }, { "epoch": 0.04799815391715703, "grad_norm": 0.46749481558799744, "learning_rate": 5.853658536585366e-06, "loss": 0.0182, "step": 13 }, { "epoch": 0.051690319603092186, "grad_norm": 0.2749096751213074, "learning_rate": 6.341463414634147e-06, "loss": 0.0248, "step": 14 }, { "epoch": 0.055382485289027346, "grad_norm": 0.17462071776390076, "learning_rate": 6.829268292682928e-06, "loss": 0.0123, "step": 15 }, { "epoch": 0.0590746509749625, "grad_norm": 0.18316905200481415, "learning_rate": 7.317073170731707e-06, "loss": 0.0145, "step": 16 }, { "epoch": 0.06276681666089766, "grad_norm": 0.16834756731987, "learning_rate": 7.804878048780489e-06, "loss": 0.0194, "step": 17 }, { "epoch": 0.06645898234683281, "grad_norm": 0.150247722864151, "learning_rate": 8.292682926829268e-06, "loss": 0.0117, "step": 18 }, { "epoch": 0.07015114803276797, "grad_norm": 0.193730428814888, "learning_rate": 8.78048780487805e-06, "loss": 0.0208, "step": 19 }, { "epoch": 0.07384331371870313, "grad_norm": 0.16806142032146454, "learning_rate": 9.268292682926831e-06, "loss": 0.0099, "step": 20 }, { "epoch": 0.07753547940463829, "grad_norm": 0.2117680311203003, "learning_rate": 9.756097560975611e-06, "loss": 0.0368, "step": 21 }, { "epoch": 0.08122764509057344, "grad_norm": 0.12042353302240372, "learning_rate": 1.024390243902439e-05, "loss": 0.013, "step": 22 }, { "epoch": 0.08491981077650859, "grad_norm": 0.16888391971588135, "learning_rate": 1.0731707317073172e-05, "loss": 0.0137, "step": 23 }, { "epoch": 0.08861197646244376, "grad_norm": 0.32111823558807373, "learning_rate": 1.1219512195121953e-05, "loss": 0.0248, "step": 24 }, { "epoch": 0.09230414214837891, "grad_norm": 0.1346050500869751, "learning_rate": 1.1707317073170731e-05, "loss": 0.0256, "step": 25 }, { "epoch": 0.09599630783431407, "grad_norm": 0.1447174847126007, "learning_rate": 1.2195121951219513e-05, "loss": 0.0124, "step": 26 }, { "epoch": 0.09968847352024922, "grad_norm": 0.13269546627998352, "learning_rate": 1.2682926829268294e-05, "loss": 0.0096, "step": 27 }, { "epoch": 0.10338063920618437, "grad_norm": 0.08903706818819046, "learning_rate": 1.3170731707317076e-05, "loss": 0.0076, "step": 28 }, { "epoch": 0.10707280489211954, "grad_norm": 0.1776096373796463, "learning_rate": 1.3658536585365855e-05, "loss": 0.0181, "step": 29 }, { "epoch": 0.11076497057805469, "grad_norm": 0.10876930505037308, "learning_rate": 1.4146341463414635e-05, "loss": 0.0111, "step": 30 }, { "epoch": 0.11445713626398984, "grad_norm": 0.1793457418680191, "learning_rate": 1.4634146341463415e-05, "loss": 0.023, "step": 31 }, { "epoch": 0.118149301949925, "grad_norm": 0.17572703957557678, "learning_rate": 1.5121951219512196e-05, "loss": 0.0162, "step": 32 }, { "epoch": 0.12184146763586017, "grad_norm": 0.14067068696022034, "learning_rate": 1.5609756097560978e-05, "loss": 0.0071, "step": 33 }, { "epoch": 0.12553363332179532, "grad_norm": 0.1744857132434845, "learning_rate": 1.6097560975609757e-05, "loss": 0.0153, "step": 34 }, { "epoch": 0.12922579900773049, "grad_norm": 0.17557364702224731, "learning_rate": 1.6585365853658537e-05, "loss": 0.0227, "step": 35 }, { "epoch": 0.13291796469366562, "grad_norm": 0.10515311360359192, "learning_rate": 1.7073170731707317e-05, "loss": 0.0152, "step": 36 }, { "epoch": 0.1366101303796008, "grad_norm": 0.12522749602794647, "learning_rate": 1.75609756097561e-05, "loss": 0.0103, "step": 37 }, { "epoch": 0.14030229606553593, "grad_norm": 0.10669893026351929, "learning_rate": 1.804878048780488e-05, "loss": 0.0161, "step": 38 }, { "epoch": 0.1439944617514711, "grad_norm": 0.09148227423429489, "learning_rate": 1.8536585365853663e-05, "loss": 0.0072, "step": 39 }, { "epoch": 0.14768662743740626, "grad_norm": 0.2032286524772644, "learning_rate": 1.902439024390244e-05, "loss": 0.0092, "step": 40 }, { "epoch": 0.1513787931233414, "grad_norm": 0.10257123410701752, "learning_rate": 1.9512195121951222e-05, "loss": 0.007, "step": 41 }, { "epoch": 0.15507095880927657, "grad_norm": 0.07334409654140472, "learning_rate": 2e-05, "loss": 0.0064, "step": 42 }, { "epoch": 0.1587631244952117, "grad_norm": 0.09883085638284683, "learning_rate": 2.048780487804878e-05, "loss": 0.0089, "step": 43 }, { "epoch": 0.16245529018114688, "grad_norm": 0.10087648034095764, "learning_rate": 2.0975609756097564e-05, "loss": 0.0065, "step": 44 }, { "epoch": 0.16614745586708204, "grad_norm": 0.10451877117156982, "learning_rate": 2.1463414634146344e-05, "loss": 0.0102, "step": 45 }, { "epoch": 0.16983962155301718, "grad_norm": 0.13105975091457367, "learning_rate": 2.1951219512195124e-05, "loss": 0.009, "step": 46 }, { "epoch": 0.17353178723895235, "grad_norm": 0.2514360547065735, "learning_rate": 2.2439024390243907e-05, "loss": 0.0168, "step": 47 }, { "epoch": 0.17722395292488752, "grad_norm": 0.11838189512491226, "learning_rate": 2.2926829268292683e-05, "loss": 0.0119, "step": 48 }, { "epoch": 0.18091611861082266, "grad_norm": 0.16984423995018005, "learning_rate": 2.3414634146341463e-05, "loss": 0.0065, "step": 49 }, { "epoch": 0.18460828429675782, "grad_norm": 0.11164893954992294, "learning_rate": 2.3902439024390246e-05, "loss": 0.0135, "step": 50 }, { "epoch": 0.18830044998269296, "grad_norm": 0.09878280013799667, "learning_rate": 2.4390243902439026e-05, "loss": 0.0066, "step": 51 }, { "epoch": 0.19199261566862813, "grad_norm": 0.09212549030780792, "learning_rate": 2.4878048780487805e-05, "loss": 0.0082, "step": 52 }, { "epoch": 0.1956847813545633, "grad_norm": 0.09363257884979248, "learning_rate": 2.536585365853659e-05, "loss": 0.0099, "step": 53 }, { "epoch": 0.19937694704049844, "grad_norm": 0.07449876517057419, "learning_rate": 2.5853658536585368e-05, "loss": 0.006, "step": 54 }, { "epoch": 0.2030691127264336, "grad_norm": 0.07617678493261337, "learning_rate": 2.634146341463415e-05, "loss": 0.0076, "step": 55 }, { "epoch": 0.20676127841236874, "grad_norm": 0.09494733065366745, "learning_rate": 2.682926829268293e-05, "loss": 0.0082, "step": 56 }, { "epoch": 0.2104534440983039, "grad_norm": 0.10162504017353058, "learning_rate": 2.731707317073171e-05, "loss": 0.011, "step": 57 }, { "epoch": 0.21414560978423908, "grad_norm": 0.16772620379924774, "learning_rate": 2.7804878048780487e-05, "loss": 0.0086, "step": 58 }, { "epoch": 0.21783777547017422, "grad_norm": 0.10658068209886551, "learning_rate": 2.829268292682927e-05, "loss": 0.0067, "step": 59 }, { "epoch": 0.22152994115610938, "grad_norm": 0.11568617820739746, "learning_rate": 2.878048780487805e-05, "loss": 0.0081, "step": 60 }, { "epoch": 0.22522210684204455, "grad_norm": 0.11837513744831085, "learning_rate": 2.926829268292683e-05, "loss": 0.0086, "step": 61 }, { "epoch": 0.2289142725279797, "grad_norm": 0.08128459751605988, "learning_rate": 2.9756097560975613e-05, "loss": 0.0047, "step": 62 }, { "epoch": 0.23260643821391486, "grad_norm": 0.36463257670402527, "learning_rate": 3.0243902439024392e-05, "loss": 0.0099, "step": 63 }, { "epoch": 0.23629860389985, "grad_norm": 0.08298784494400024, "learning_rate": 3.073170731707317e-05, "loss": 0.006, "step": 64 }, { "epoch": 0.23999076958578516, "grad_norm": 0.2765791416168213, "learning_rate": 3.1219512195121955e-05, "loss": 0.015, "step": 65 }, { "epoch": 0.24368293527172033, "grad_norm": 0.09410832822322845, "learning_rate": 3.170731707317074e-05, "loss": 0.007, "step": 66 }, { "epoch": 0.24737510095765547, "grad_norm": 0.08687976747751236, "learning_rate": 3.2195121951219514e-05, "loss": 0.0054, "step": 67 }, { "epoch": 0.25106726664359064, "grad_norm": 0.1658174693584442, "learning_rate": 3.268292682926829e-05, "loss": 0.0059, "step": 68 }, { "epoch": 0.2547594323295258, "grad_norm": 0.16597941517829895, "learning_rate": 3.3170731707317074e-05, "loss": 0.0043, "step": 69 }, { "epoch": 0.25845159801546097, "grad_norm": 0.14238758385181427, "learning_rate": 3.365853658536586e-05, "loss": 0.0202, "step": 70 }, { "epoch": 0.2621437637013961, "grad_norm": 0.28217750787734985, "learning_rate": 3.414634146341463e-05, "loss": 0.0245, "step": 71 }, { "epoch": 0.26583592938733125, "grad_norm": 0.08783560991287231, "learning_rate": 3.4634146341463416e-05, "loss": 0.0062, "step": 72 }, { "epoch": 0.2695280950732664, "grad_norm": 0.13205395638942719, "learning_rate": 3.51219512195122e-05, "loss": 0.0091, "step": 73 }, { "epoch": 0.2732202607592016, "grad_norm": 0.11077655106782913, "learning_rate": 3.5609756097560976e-05, "loss": 0.0068, "step": 74 }, { "epoch": 0.27691242644513675, "grad_norm": 0.07510002702474594, "learning_rate": 3.609756097560976e-05, "loss": 0.0056, "step": 75 }, { "epoch": 0.28060459213107186, "grad_norm": 0.1183568611741066, "learning_rate": 3.658536585365854e-05, "loss": 0.0107, "step": 76 }, { "epoch": 0.28429675781700703, "grad_norm": 0.19435347616672516, "learning_rate": 3.7073170731707325e-05, "loss": 0.0194, "step": 77 }, { "epoch": 0.2879889235029422, "grad_norm": 0.1364523470401764, "learning_rate": 3.75609756097561e-05, "loss": 0.0149, "step": 78 }, { "epoch": 0.29168108918887736, "grad_norm": 0.20089037716388702, "learning_rate": 3.804878048780488e-05, "loss": 0.0211, "step": 79 }, { "epoch": 0.29537325487481253, "grad_norm": 0.13373729586601257, "learning_rate": 3.853658536585366e-05, "loss": 0.0111, "step": 80 }, { "epoch": 0.29906542056074764, "grad_norm": 0.08260685950517654, "learning_rate": 3.9024390243902444e-05, "loss": 0.0094, "step": 81 }, { "epoch": 0.3027575862466828, "grad_norm": 0.15578778088092804, "learning_rate": 3.951219512195122e-05, "loss": 0.01, "step": 82 }, { "epoch": 0.306449751932618, "grad_norm": 0.11668805778026581, "learning_rate": 4e-05, "loss": 0.0135, "step": 83 }, { "epoch": 0.31014191761855314, "grad_norm": 0.08324134349822998, "learning_rate": 3.999981530109401e-05, "loss": 0.0086, "step": 84 }, { "epoch": 0.3138340833044883, "grad_norm": 0.2406614124774933, "learning_rate": 3.999926120778742e-05, "loss": 0.0125, "step": 85 }, { "epoch": 0.3175262489904234, "grad_norm": 0.1322041004896164, "learning_rate": 3.9998337730314274e-05, "loss": 0.0094, "step": 86 }, { "epoch": 0.3212184146763586, "grad_norm": 0.18763262033462524, "learning_rate": 3.999704488573108e-05, "loss": 0.0123, "step": 87 }, { "epoch": 0.32491058036229375, "grad_norm": 0.16090139746665955, "learning_rate": 3.9995382697916555e-05, "loss": 0.0084, "step": 88 }, { "epoch": 0.3286027460482289, "grad_norm": 0.11704199016094208, "learning_rate": 3.999335119757112e-05, "loss": 0.0088, "step": 89 }, { "epoch": 0.3322949117341641, "grad_norm": 0.13167352974414825, "learning_rate": 3.9990950422216367e-05, "loss": 0.0148, "step": 90 }, { "epoch": 0.3359870774200992, "grad_norm": 0.09690048545598984, "learning_rate": 3.998818041619435e-05, "loss": 0.0077, "step": 91 }, { "epoch": 0.33967924310603437, "grad_norm": 0.11628638207912445, "learning_rate": 3.998504123066679e-05, "loss": 0.007, "step": 92 }, { "epoch": 0.33967924310603437, "eval_loss": 0.010080486536026001, "eval_runtime": 89.886, "eval_samples_per_second": 10.157, "eval_steps_per_second": 5.084, "step": 92 }, { "epoch": 0.34337140879196953, "grad_norm": 0.11528339982032776, "learning_rate": 3.9981532923614074e-05, "loss": 0.0082, "step": 93 }, { "epoch": 0.3470635744779047, "grad_norm": 0.2657609283924103, "learning_rate": 3.9977655559834275e-05, "loss": 0.013, "step": 94 }, { "epoch": 0.35075574016383987, "grad_norm": 0.11277999728918076, "learning_rate": 3.9973409210941864e-05, "loss": 0.0076, "step": 95 }, { "epoch": 0.35444790584977504, "grad_norm": 5.088494777679443, "learning_rate": 3.9968793955366445e-05, "loss": 0.0289, "step": 96 }, { "epoch": 0.35814007153571015, "grad_norm": 0.1601097732782364, "learning_rate": 3.996380987835128e-05, "loss": 0.0219, "step": 97 }, { "epoch": 0.3618322372216453, "grad_norm": 0.20405948162078857, "learning_rate": 3.995845707195173e-05, "loss": 0.0162, "step": 98 }, { "epoch": 0.3655244029075805, "grad_norm": 0.6011687517166138, "learning_rate": 3.995273563503355e-05, "loss": 0.0096, "step": 99 }, { "epoch": 0.36921656859351565, "grad_norm": 0.2641352117061615, "learning_rate": 3.9946645673271034e-05, "loss": 0.0231, "step": 100 }, { "epoch": 0.3729087342794508, "grad_norm": 0.18041729927062988, "learning_rate": 3.9940187299145134e-05, "loss": 0.0124, "step": 101 }, { "epoch": 0.3766008999653859, "grad_norm": 0.13946884870529175, "learning_rate": 3.9933360631941294e-05, "loss": 0.0136, "step": 102 }, { "epoch": 0.3802930656513211, "grad_norm": 0.14014215767383575, "learning_rate": 3.992616579774732e-05, "loss": 0.0123, "step": 103 }, { "epoch": 0.38398523133725626, "grad_norm": 0.215935617685318, "learning_rate": 3.9918602929451015e-05, "loss": 0.0166, "step": 104 }, { "epoch": 0.38767739702319143, "grad_norm": 0.18600742518901825, "learning_rate": 3.991067216673772e-05, "loss": 0.0249, "step": 105 }, { "epoch": 0.3913695627091266, "grad_norm": 0.19758182764053345, "learning_rate": 3.990237365608776e-05, "loss": 0.0215, "step": 106 }, { "epoch": 0.3950617283950617, "grad_norm": 0.09306969493627548, "learning_rate": 3.9893707550773714e-05, "loss": 0.0062, "step": 107 }, { "epoch": 0.3987538940809969, "grad_norm": 0.15354327857494354, "learning_rate": 3.988467401085761e-05, "loss": 0.0153, "step": 108 }, { "epoch": 0.40244605976693204, "grad_norm": 0.18650248646736145, "learning_rate": 3.987527320318793e-05, "loss": 0.0207, "step": 109 }, { "epoch": 0.4061382254528672, "grad_norm": 0.11927786469459534, "learning_rate": 3.986550530139657e-05, "loss": 0.0071, "step": 110 }, { "epoch": 0.4098303911388024, "grad_norm": 0.18260295689105988, "learning_rate": 3.985537048589561e-05, "loss": 0.0165, "step": 111 }, { "epoch": 0.4135225568247375, "grad_norm": 0.13270637392997742, "learning_rate": 3.9844868943873975e-05, "loss": 0.0095, "step": 112 }, { "epoch": 0.41721472251067265, "grad_norm": 0.10370643436908722, "learning_rate": 3.9834000869294e-05, "loss": 0.0075, "step": 113 }, { "epoch": 0.4209068881966078, "grad_norm": 0.11767494678497314, "learning_rate": 3.982276646288784e-05, "loss": 0.0129, "step": 114 }, { "epoch": 0.424599053882543, "grad_norm": 0.12992636859416962, "learning_rate": 3.981116593215374e-05, "loss": 0.0145, "step": 115 }, { "epoch": 0.42829121956847815, "grad_norm": 0.27428507804870605, "learning_rate": 3.9799199491352246e-05, "loss": 0.0166, "step": 116 }, { "epoch": 0.43198338525441327, "grad_norm": 0.1593196988105774, "learning_rate": 3.978686736150221e-05, "loss": 0.0102, "step": 117 }, { "epoch": 0.43567555094034843, "grad_norm": 0.20442363619804382, "learning_rate": 3.977416977037671e-05, "loss": 0.0143, "step": 118 }, { "epoch": 0.4393677166262836, "grad_norm": 0.11397796869277954, "learning_rate": 3.9761106952498874e-05, "loss": 0.0108, "step": 119 }, { "epoch": 0.44305988231221877, "grad_norm": 0.12436648458242416, "learning_rate": 3.974767914913751e-05, "loss": 0.008, "step": 120 }, { "epoch": 0.44675204799815393, "grad_norm": 0.10099371522665024, "learning_rate": 3.973388660830269e-05, "loss": 0.0071, "step": 121 }, { "epoch": 0.4504442136840891, "grad_norm": 0.18873853981494904, "learning_rate": 3.971972958474113e-05, "loss": 0.0126, "step": 122 }, { "epoch": 0.4541363793700242, "grad_norm": 0.14228899776935577, "learning_rate": 3.97052083399315e-05, "loss": 0.0108, "step": 123 }, { "epoch": 0.4578285450559594, "grad_norm": 0.16871212422847748, "learning_rate": 3.969032314207961e-05, "loss": 0.0127, "step": 124 }, { "epoch": 0.46152071074189455, "grad_norm": 0.10817577689886093, "learning_rate": 3.967507426611344e-05, "loss": 0.0166, "step": 125 }, { "epoch": 0.4652128764278297, "grad_norm": 0.09942852705717087, "learning_rate": 3.965946199367804e-05, "loss": 0.0087, "step": 126 }, { "epoch": 0.4689050421137649, "grad_norm": 0.08521483838558197, "learning_rate": 3.96434866131304e-05, "loss": 0.01, "step": 127 }, { "epoch": 0.4725972077997, "grad_norm": 0.09523651003837585, "learning_rate": 3.9627148419534026e-05, "loss": 0.0172, "step": 128 }, { "epoch": 0.47628937348563516, "grad_norm": 0.08050525188446045, "learning_rate": 3.961044771465359e-05, "loss": 0.0052, "step": 129 }, { "epoch": 0.4799815391715703, "grad_norm": 0.13714231550693512, "learning_rate": 3.9593384806949263e-05, "loss": 0.0156, "step": 130 }, { "epoch": 0.4836737048575055, "grad_norm": 0.08329559117555618, "learning_rate": 3.9575960011571106e-05, "loss": 0.014, "step": 131 }, { "epoch": 0.48736587054344066, "grad_norm": 0.109232597053051, "learning_rate": 3.955817365035316e-05, "loss": 0.0081, "step": 132 }, { "epoch": 0.49105803622937577, "grad_norm": 0.09475994855165482, "learning_rate": 3.954002605180759e-05, "loss": 0.0066, "step": 133 }, { "epoch": 0.49475020191531094, "grad_norm": 0.1157636046409607, "learning_rate": 3.952151755111855e-05, "loss": 0.0226, "step": 134 }, { "epoch": 0.4984423676012461, "grad_norm": 0.1317131668329239, "learning_rate": 3.9502648490136016e-05, "loss": 0.0097, "step": 135 }, { "epoch": 0.5021345332871813, "grad_norm": 0.12749852240085602, "learning_rate": 3.948341921736948e-05, "loss": 0.0104, "step": 136 }, { "epoch": 0.5058266989731164, "grad_norm": 0.08242950588464737, "learning_rate": 3.946383008798152e-05, "loss": 0.0054, "step": 137 }, { "epoch": 0.5095188646590516, "grad_norm": 0.125824436545372, "learning_rate": 3.94438814637812e-05, "loss": 0.0173, "step": 138 }, { "epoch": 0.5132110303449867, "grad_norm": 0.09344890713691711, "learning_rate": 3.942357371321743e-05, "loss": 0.0105, "step": 139 }, { "epoch": 0.5169031960309219, "grad_norm": 0.11648397147655487, "learning_rate": 3.940290721137214e-05, "loss": 0.0211, "step": 140 }, { "epoch": 0.520595361716857, "grad_norm": 0.10685181617736816, "learning_rate": 3.938188233995336e-05, "loss": 0.0105, "step": 141 }, { "epoch": 0.5242875274027922, "grad_norm": 0.1415766328573227, "learning_rate": 3.936049948728816e-05, "loss": 0.0132, "step": 142 }, { "epoch": 0.5279796930887274, "grad_norm": 0.20898067951202393, "learning_rate": 3.933875904831551e-05, "loss": 0.0163, "step": 143 }, { "epoch": 0.5316718587746625, "grad_norm": 0.09275670349597931, "learning_rate": 3.931666142457891e-05, "loss": 0.0099, "step": 144 }, { "epoch": 0.5353640244605977, "grad_norm": 0.08970851451158524, "learning_rate": 3.929420702421907e-05, "loss": 0.0064, "step": 145 }, { "epoch": 0.5390561901465328, "grad_norm": 0.10725483298301697, "learning_rate": 3.9271396261966305e-05, "loss": 0.0061, "step": 146 }, { "epoch": 0.5427483558324679, "grad_norm": 0.10847834497690201, "learning_rate": 3.92482295591329e-05, "loss": 0.0135, "step": 147 }, { "epoch": 0.5464405215184032, "grad_norm": 0.08085348457098007, "learning_rate": 3.9224707343605315e-05, "loss": 0.0092, "step": 148 }, { "epoch": 0.5501326872043383, "grad_norm": 0.10083277523517609, "learning_rate": 3.92008300498363e-05, "loss": 0.0107, "step": 149 }, { "epoch": 0.5538248528902735, "grad_norm": 0.25901204347610474, "learning_rate": 3.917659811883687e-05, "loss": 0.0061, "step": 150 }, { "epoch": 0.5575170185762086, "grad_norm": 0.06085089221596718, "learning_rate": 3.915201199816812e-05, "loss": 0.0065, "step": 151 }, { "epoch": 0.5612091842621437, "grad_norm": 0.19471333920955658, "learning_rate": 3.9127072141933025e-05, "loss": 0.0159, "step": 152 }, { "epoch": 0.564901349948079, "grad_norm": 1.377177119255066, "learning_rate": 3.910177901076799e-05, "loss": 0.0129, "step": 153 }, { "epoch": 0.5685935156340141, "grad_norm": 0.12816794216632843, "learning_rate": 3.907613307183439e-05, "loss": 0.0145, "step": 154 }, { "epoch": 0.5722856813199493, "grad_norm": 1.0884933471679688, "learning_rate": 3.905013479880992e-05, "loss": 0.0296, "step": 155 }, { "epoch": 0.5759778470058844, "grad_norm": 0.19217349588871002, "learning_rate": 3.902378467187981e-05, "loss": 0.0066, "step": 156 }, { "epoch": 0.5796700126918195, "grad_norm": 0.17874930799007416, "learning_rate": 3.8997083177728044e-05, "loss": 0.007, "step": 157 }, { "epoch": 0.5833621783777547, "grad_norm": 0.17409397661685944, "learning_rate": 3.897003080952828e-05, "loss": 0.0131, "step": 158 }, { "epoch": 0.5870543440636898, "grad_norm": 0.1562495231628418, "learning_rate": 3.8942628066934826e-05, "loss": 0.0086, "step": 159 }, { "epoch": 0.5907465097496251, "grad_norm": 0.15565301477909088, "learning_rate": 3.891487545607332e-05, "loss": 0.0104, "step": 160 }, { "epoch": 0.5944386754355602, "grad_norm": 0.14708881080150604, "learning_rate": 3.888677348953145e-05, "loss": 0.0097, "step": 161 }, { "epoch": 0.5981308411214953, "grad_norm": 0.14112205803394318, "learning_rate": 3.885832268634946e-05, "loss": 0.0111, "step": 162 }, { "epoch": 0.6018230068074305, "grad_norm": 0.3906686305999756, "learning_rate": 3.8829523572010586e-05, "loss": 0.0163, "step": 163 }, { "epoch": 0.6055151724933656, "grad_norm": 0.33456534147262573, "learning_rate": 3.880037667843131e-05, "loss": 0.0183, "step": 164 }, { "epoch": 0.6092073381793008, "grad_norm": 0.18434567749500275, "learning_rate": 3.877088254395157e-05, "loss": 0.0128, "step": 165 }, { "epoch": 0.612899503865236, "grad_norm": 0.1495783030986786, "learning_rate": 3.874104171332481e-05, "loss": 0.0255, "step": 166 }, { "epoch": 0.6165916695511711, "grad_norm": 0.17371267080307007, "learning_rate": 3.871085473770789e-05, "loss": 0.01, "step": 167 }, { "epoch": 0.6202838352371063, "grad_norm": 0.11804357171058655, "learning_rate": 3.868032217465097e-05, "loss": 0.0089, "step": 168 }, { "epoch": 0.6239760009230414, "grad_norm": 0.19819952547550201, "learning_rate": 3.864944458808712e-05, "loss": 0.0146, "step": 169 }, { "epoch": 0.6276681666089766, "grad_norm": 0.14190199971199036, "learning_rate": 3.861822254832201e-05, "loss": 0.0106, "step": 170 }, { "epoch": 0.6313603322949117, "grad_norm": 0.08863028138875961, "learning_rate": 3.858665663202329e-05, "loss": 0.0077, "step": 171 }, { "epoch": 0.6350524979808468, "grad_norm": 0.2075955718755722, "learning_rate": 3.855474742220998e-05, "loss": 0.0097, "step": 172 }, { "epoch": 0.6387446636667821, "grad_norm": 0.1362001597881317, "learning_rate": 3.852249550824169e-05, "loss": 0.0168, "step": 173 }, { "epoch": 0.6424368293527172, "grad_norm": 0.8073941469192505, "learning_rate": 3.848990148580776e-05, "loss": 0.0298, "step": 174 }, { "epoch": 0.6461289950386524, "grad_norm": 0.08775315433740616, "learning_rate": 3.84569659569162e-05, "loss": 0.0106, "step": 175 }, { "epoch": 0.6498211607245875, "grad_norm": 0.3348487615585327, "learning_rate": 3.8423689529882635e-05, "loss": 0.0205, "step": 176 }, { "epoch": 0.6535133264105226, "grad_norm": 0.1472369283437729, "learning_rate": 3.839007281931902e-05, "loss": 0.0249, "step": 177 }, { "epoch": 0.6572054920964578, "grad_norm": 0.12712818384170532, "learning_rate": 3.835611644612234e-05, "loss": 0.0273, "step": 178 }, { "epoch": 0.660897657782393, "grad_norm": 0.16857929527759552, "learning_rate": 3.832182103746308e-05, "loss": 0.0154, "step": 179 }, { "epoch": 0.6645898234683282, "grad_norm": 0.11381746828556061, "learning_rate": 3.828718722677369e-05, "loss": 0.0072, "step": 180 }, { "epoch": 0.6682819891542633, "grad_norm": 0.27607351541519165, "learning_rate": 3.825221565373687e-05, "loss": 0.0147, "step": 181 }, { "epoch": 0.6719741548401984, "grad_norm": 0.09366216510534286, "learning_rate": 3.821690696427373e-05, "loss": 0.0075, "step": 182 }, { "epoch": 0.6756663205261336, "grad_norm": 0.11064954102039337, "learning_rate": 3.8181261810531926e-05, "loss": 0.0068, "step": 183 }, { "epoch": 0.6793584862120687, "grad_norm": 0.10137422382831573, "learning_rate": 3.8145280850873524e-05, "loss": 0.0078, "step": 184 }, { "epoch": 0.6793584862120687, "eval_loss": 0.01165215577930212, "eval_runtime": 90.6071, "eval_samples_per_second": 10.076, "eval_steps_per_second": 5.044, "step": 184 }, { "epoch": 0.683050651898004, "grad_norm": 0.1427939236164093, "learning_rate": 3.810896474986294e-05, "loss": 0.0099, "step": 185 }, { "epoch": 0.6867428175839391, "grad_norm": 0.11883988231420517, "learning_rate": 3.8072314178254556e-05, "loss": 0.0111, "step": 186 }, { "epoch": 0.6904349832698742, "grad_norm": 0.10005701333284378, "learning_rate": 3.803532981298044e-05, "loss": 0.0066, "step": 187 }, { "epoch": 0.6941271489558094, "grad_norm": 0.11374926567077637, "learning_rate": 3.7998012337137765e-05, "loss": 0.0115, "step": 188 }, { "epoch": 0.6978193146417445, "grad_norm": 0.1569487750530243, "learning_rate": 3.7960362439976234e-05, "loss": 0.0086, "step": 189 }, { "epoch": 0.7015114803276797, "grad_norm": 0.11175687611103058, "learning_rate": 3.7922380816885323e-05, "loss": 0.0112, "step": 190 }, { "epoch": 0.7052036460136148, "grad_norm": 0.1092715710401535, "learning_rate": 3.7884068169381454e-05, "loss": 0.0077, "step": 191 }, { "epoch": 0.7088958116995501, "grad_norm": 0.19558553397655487, "learning_rate": 3.784542520509503e-05, "loss": 0.0098, "step": 192 }, { "epoch": 0.7125879773854852, "grad_norm": 0.13434702157974243, "learning_rate": 3.78064526377574e-05, "loss": 0.0151, "step": 193 }, { "epoch": 0.7162801430714203, "grad_norm": 0.15444593131542206, "learning_rate": 3.7767151187187586e-05, "loss": 0.0106, "step": 194 }, { "epoch": 0.7199723087573555, "grad_norm": 0.06894934922456741, "learning_rate": 3.7727521579279095e-05, "loss": 0.0049, "step": 195 }, { "epoch": 0.7236644744432906, "grad_norm": 0.1396859586238861, "learning_rate": 3.768756454598645e-05, "loss": 0.0118, "step": 196 }, { "epoch": 0.7273566401292259, "grad_norm": 0.1533813774585724, "learning_rate": 3.764728082531169e-05, "loss": 0.0104, "step": 197 }, { "epoch": 0.731048805815161, "grad_norm": 0.1005004420876503, "learning_rate": 3.760667116129072e-05, "loss": 0.0041, "step": 198 }, { "epoch": 0.7347409715010961, "grad_norm": 0.1538483053445816, "learning_rate": 3.756573630397958e-05, "loss": 0.0083, "step": 199 }, { "epoch": 0.7384331371870313, "grad_norm": 0.09973620623350143, "learning_rate": 3.752447700944064e-05, "loss": 0.0067, "step": 200 }, { "epoch": 0.7421253028729664, "grad_norm": 0.17357227206230164, "learning_rate": 3.7482894039728525e-05, "loss": 0.0165, "step": 201 }, { "epoch": 0.7458174685589016, "grad_norm": 0.1083260253071785, "learning_rate": 3.744098816287616e-05, "loss": 0.0063, "step": 202 }, { "epoch": 0.7495096342448367, "grad_norm": 0.0863720178604126, "learning_rate": 3.7398760152880484e-05, "loss": 0.0072, "step": 203 }, { "epoch": 0.7532017999307719, "grad_norm": 0.1260920763015747, "learning_rate": 3.735621078968823e-05, "loss": 0.0123, "step": 204 }, { "epoch": 0.7568939656167071, "grad_norm": 0.0873272716999054, "learning_rate": 3.731334085918149e-05, "loss": 0.0223, "step": 205 }, { "epoch": 0.7605861313026422, "grad_norm": 0.0700017586350441, "learning_rate": 3.7270151153163174e-05, "loss": 0.0073, "step": 206 }, { "epoch": 0.7642782969885774, "grad_norm": 0.07996451109647751, "learning_rate": 3.722664246934244e-05, "loss": 0.0061, "step": 207 }, { "epoch": 0.7679704626745125, "grad_norm": 0.10587499290704727, "learning_rate": 3.718281561131992e-05, "loss": 0.0365, "step": 208 }, { "epoch": 0.7716626283604476, "grad_norm": 0.14550632238388062, "learning_rate": 3.713867138857288e-05, "loss": 0.0151, "step": 209 }, { "epoch": 0.7753547940463829, "grad_norm": 0.17798705399036407, "learning_rate": 3.7094210616440284e-05, "loss": 0.0116, "step": 210 }, { "epoch": 0.779046959732318, "grad_norm": 0.08906543254852295, "learning_rate": 3.704943411610774e-05, "loss": 0.0076, "step": 211 }, { "epoch": 0.7827391254182532, "grad_norm": 0.07332167774438858, "learning_rate": 3.700434271459229e-05, "loss": 0.0086, "step": 212 }, { "epoch": 0.7864312911041883, "grad_norm": 0.11497493088245392, "learning_rate": 3.69589372447272e-05, "loss": 0.015, "step": 213 }, { "epoch": 0.7901234567901234, "grad_norm": 0.08599186688661575, "learning_rate": 3.6913218545146536e-05, "loss": 0.0111, "step": 214 }, { "epoch": 0.7938156224760586, "grad_norm": 0.0786437839269638, "learning_rate": 3.686718746026967e-05, "loss": 0.0084, "step": 215 }, { "epoch": 0.7975077881619937, "grad_norm": 0.07181154191493988, "learning_rate": 3.68208448402857e-05, "loss": 0.008, "step": 216 }, { "epoch": 0.801199953847929, "grad_norm": 0.11212016642093658, "learning_rate": 3.677419154113776e-05, "loss": 0.014, "step": 217 }, { "epoch": 0.8048921195338641, "grad_norm": 0.0851934626698494, "learning_rate": 3.672722842450717e-05, "loss": 0.012, "step": 218 }, { "epoch": 0.8085842852197992, "grad_norm": 0.053547583520412445, "learning_rate": 3.667995635779756e-05, "loss": 0.0048, "step": 219 }, { "epoch": 0.8122764509057344, "grad_norm": 0.09233611822128296, "learning_rate": 3.6632376214118836e-05, "loss": 0.0082, "step": 220 }, { "epoch": 0.8159686165916695, "grad_norm": 0.08250945061445236, "learning_rate": 3.6584488872271035e-05, "loss": 0.005, "step": 221 }, { "epoch": 0.8196607822776047, "grad_norm": 0.06966373324394226, "learning_rate": 3.6536295216728136e-05, "loss": 0.0058, "step": 222 }, { "epoch": 0.8233529479635399, "grad_norm": 0.08930462598800659, "learning_rate": 3.648779613762167e-05, "loss": 0.0098, "step": 223 }, { "epoch": 0.827045113649475, "grad_norm": 0.11651594936847687, "learning_rate": 3.643899253072433e-05, "loss": 0.0149, "step": 224 }, { "epoch": 0.8307372793354102, "grad_norm": 0.08121360838413239, "learning_rate": 3.63898852974334e-05, "loss": 0.0077, "step": 225 }, { "epoch": 0.8344294450213453, "grad_norm": 0.0807429626584053, "learning_rate": 3.634047534475409e-05, "loss": 0.0071, "step": 226 }, { "epoch": 0.8381216107072805, "grad_norm": 0.10255023092031479, "learning_rate": 3.629076358528284e-05, "loss": 0.0122, "step": 227 }, { "epoch": 0.8418137763932156, "grad_norm": 0.12542816996574402, "learning_rate": 3.62407509371904e-05, "loss": 0.0093, "step": 228 }, { "epoch": 0.8455059420791508, "grad_norm": 0.07941003143787384, "learning_rate": 3.6190438324204905e-05, "loss": 0.0059, "step": 229 }, { "epoch": 0.849198107765086, "grad_norm": 0.0921979695558548, "learning_rate": 3.613982667559483e-05, "loss": 0.0083, "step": 230 }, { "epoch": 0.8528902734510211, "grad_norm": 0.10604582726955414, "learning_rate": 3.608891692615176e-05, "loss": 0.0128, "step": 231 }, { "epoch": 0.8565824391369563, "grad_norm": 0.09270962327718735, "learning_rate": 3.603771001617322e-05, "loss": 0.0141, "step": 232 }, { "epoch": 0.8602746048228914, "grad_norm": 0.062396857887506485, "learning_rate": 3.598620689144523e-05, "loss": 0.0094, "step": 233 }, { "epoch": 0.8639667705088265, "grad_norm": 0.07858982682228088, "learning_rate": 3.5934408503224864e-05, "loss": 0.0071, "step": 234 }, { "epoch": 0.8676589361947618, "grad_norm": 0.07246547937393188, "learning_rate": 3.588231580822269e-05, "loss": 0.0057, "step": 235 }, { "epoch": 0.8713511018806969, "grad_norm": 0.3656146824359894, "learning_rate": 3.5829929768585086e-05, "loss": 0.0103, "step": 236 }, { "epoch": 0.8750432675666321, "grad_norm": 0.05800218507647514, "learning_rate": 3.577725135187647e-05, "loss": 0.0061, "step": 237 }, { "epoch": 0.8787354332525672, "grad_norm": 0.08162401616573334, "learning_rate": 3.5724281531061436e-05, "loss": 0.0143, "step": 238 }, { "epoch": 0.8824275989385023, "grad_norm": 0.07134553790092468, "learning_rate": 3.567102128448678e-05, "loss": 0.0054, "step": 239 }, { "epoch": 0.8861197646244375, "grad_norm": 0.13681039214134216, "learning_rate": 3.561747159586343e-05, "loss": 0.0198, "step": 240 }, { "epoch": 0.8898119303103726, "grad_norm": 0.05192103236913681, "learning_rate": 3.5563633454248275e-05, "loss": 0.0043, "step": 241 }, { "epoch": 0.8935040959963079, "grad_norm": 0.08083774149417877, "learning_rate": 3.550950785402591e-05, "loss": 0.0062, "step": 242 }, { "epoch": 0.897196261682243, "grad_norm": 0.17133909463882446, "learning_rate": 3.5455095794890234e-05, "loss": 0.0086, "step": 243 }, { "epoch": 0.9008884273681782, "grad_norm": 0.05909837409853935, "learning_rate": 3.540039828182604e-05, "loss": 0.0052, "step": 244 }, { "epoch": 0.9045805930541133, "grad_norm": 0.051370203495025635, "learning_rate": 3.53454163250904e-05, "loss": 0.0058, "step": 245 }, { "epoch": 0.9082727587400484, "grad_norm": 0.14762379229068756, "learning_rate": 3.529015094019405e-05, "loss": 0.0182, "step": 246 }, { "epoch": 0.9119649244259836, "grad_norm": 0.052717871963977814, "learning_rate": 3.523460314788259e-05, "loss": 0.004, "step": 247 }, { "epoch": 0.9156570901119188, "grad_norm": 0.0905941054224968, "learning_rate": 3.517877397411768e-05, "loss": 0.0152, "step": 248 }, { "epoch": 0.919349255797854, "grad_norm": 0.08593659847974777, "learning_rate": 3.5122664450058044e-05, "loss": 0.0089, "step": 249 }, { "epoch": 0.9230414214837891, "grad_norm": 0.08689778298139572, "learning_rate": 3.506627561204045e-05, "loss": 0.017, "step": 250 }, { "epoch": 0.9267335871697242, "grad_norm": 0.07897453010082245, "learning_rate": 3.5009608501560585e-05, "loss": 0.01, "step": 251 }, { "epoch": 0.9304257528556594, "grad_norm": 0.12059691548347473, "learning_rate": 3.495266416525376e-05, "loss": 0.0279, "step": 252 }, { "epoch": 0.9341179185415945, "grad_norm": 0.06975448876619339, "learning_rate": 3.489544365487564e-05, "loss": 0.0054, "step": 253 }, { "epoch": 0.9378100842275298, "grad_norm": 0.11304624378681183, "learning_rate": 3.48379480272828e-05, "loss": 0.0117, "step": 254 }, { "epoch": 0.9415022499134649, "grad_norm": 0.07075347006320953, "learning_rate": 3.478017834441319e-05, "loss": 0.0136, "step": 255 }, { "epoch": 0.9451944155994, "grad_norm": 0.04639885574579239, "learning_rate": 3.472213567326652e-05, "loss": 0.0039, "step": 256 }, { "epoch": 0.9488865812853352, "grad_norm": 0.048733506351709366, "learning_rate": 3.4663821085884597e-05, "loss": 0.0044, "step": 257 }, { "epoch": 0.9525787469712703, "grad_norm": 0.06614992767572403, "learning_rate": 3.460523565933145e-05, "loss": 0.0044, "step": 258 }, { "epoch": 0.9562709126572055, "grad_norm": 0.16428013145923615, "learning_rate": 3.4546380475673514e-05, "loss": 0.0079, "step": 259 }, { "epoch": 0.9599630783431407, "grad_norm": 0.11396708339452744, "learning_rate": 3.448725662195959e-05, "loss": 0.024, "step": 260 }, { "epoch": 0.9636552440290758, "grad_norm": 0.10393685102462769, "learning_rate": 3.442786519020077e-05, "loss": 0.0238, "step": 261 }, { "epoch": 0.967347409715011, "grad_norm": 0.09232791513204575, "learning_rate": 3.436820727735031e-05, "loss": 0.0145, "step": 262 }, { "epoch": 0.9710395754009461, "grad_norm": 0.1006862074136734, "learning_rate": 3.430828398528336e-05, "loss": 0.0166, "step": 263 }, { "epoch": 0.9747317410868813, "grad_norm": 0.10068142414093018, "learning_rate": 3.4248096420776536e-05, "loss": 0.0088, "step": 264 }, { "epoch": 0.9784239067728164, "grad_norm": 0.05003920570015907, "learning_rate": 3.418764569548758e-05, "loss": 0.005, "step": 265 }, { "epoch": 0.9821160724587515, "grad_norm": 0.05870620906352997, "learning_rate": 3.412693292593478e-05, "loss": 0.006, "step": 266 }, { "epoch": 0.9858082381446868, "grad_norm": 0.09036832302808762, "learning_rate": 3.4065959233476334e-05, "loss": 0.0097, "step": 267 }, { "epoch": 0.9895004038306219, "grad_norm": 0.07893083244562149, "learning_rate": 3.4004725744289685e-05, "loss": 0.0068, "step": 268 }, { "epoch": 0.9931925695165571, "grad_norm": 0.08609715849161148, "learning_rate": 3.394323358935068e-05, "loss": 0.0101, "step": 269 }, { "epoch": 0.9968847352024922, "grad_norm": 0.07740162312984467, "learning_rate": 3.3881483904412685e-05, "loss": 0.0087, "step": 270 }, { "epoch": 1.0, "grad_norm": 0.0650748685002327, "learning_rate": 3.3819477829985624e-05, "loss": 0.0091, "step": 271 }, { "epoch": 1.0036921656859352, "grad_norm": 0.060567598789930344, "learning_rate": 3.3757216511314915e-05, "loss": 0.0158, "step": 272 }, { "epoch": 1.0073843313718702, "grad_norm": 0.08802874386310577, "learning_rate": 3.3694701098360295e-05, "loss": 0.0063, "step": 273 }, { "epoch": 1.0110764970578054, "grad_norm": 0.05286823958158493, "learning_rate": 3.363193274577461e-05, "loss": 0.0022, "step": 274 }, { "epoch": 1.0147686627437407, "grad_norm": 0.059792377054691315, "learning_rate": 3.356891261288247e-05, "loss": 0.0094, "step": 275 }, { "epoch": 1.018460828429676, "grad_norm": 0.03914966061711311, "learning_rate": 3.350564186365882e-05, "loss": 0.0027, "step": 276 }, { "epoch": 1.018460828429676, "eval_loss": 0.008386676199734211, "eval_runtime": 89.8024, "eval_samples_per_second": 10.167, "eval_steps_per_second": 5.089, "step": 276 }, { "epoch": 1.022152994115611, "grad_norm": 0.05939140170812607, "learning_rate": 3.344212166670748e-05, "loss": 0.0046, "step": 277 }, { "epoch": 1.0258451598015461, "grad_norm": 0.23599718511104584, "learning_rate": 3.3378353195239546e-05, "loss": 0.0088, "step": 278 }, { "epoch": 1.0295373254874813, "grad_norm": 0.04266897588968277, "learning_rate": 3.331433762705171e-05, "loss": 0.0025, "step": 279 }, { "epoch": 1.0332294911734163, "grad_norm": 0.07201959937810898, "learning_rate": 3.32500761445045e-05, "loss": 0.0092, "step": 280 }, { "epoch": 1.0369216568593516, "grad_norm": 0.38548168540000916, "learning_rate": 3.318556993450048e-05, "loss": 0.0054, "step": 281 }, { "epoch": 1.0406138225452868, "grad_norm": 0.09021361917257309, "learning_rate": 3.312082018846229e-05, "loss": 0.0038, "step": 282 }, { "epoch": 1.0443059882312218, "grad_norm": 0.1946287751197815, "learning_rate": 3.3055828102310656e-05, "loss": 0.0054, "step": 283 }, { "epoch": 1.047998153917157, "grad_norm": 0.08428189903497696, "learning_rate": 3.299059487644229e-05, "loss": 0.0153, "step": 284 }, { "epoch": 1.0516903196030922, "grad_norm": 0.06667447090148926, "learning_rate": 3.292512171570775e-05, "loss": 0.0066, "step": 285 }, { "epoch": 1.0553824852890274, "grad_norm": 0.04148027300834656, "learning_rate": 3.2859409829389146e-05, "loss": 0.0029, "step": 286 }, { "epoch": 1.0590746509749625, "grad_norm": 0.06811388581991196, "learning_rate": 3.2793460431177827e-05, "loss": 0.0058, "step": 287 }, { "epoch": 1.0627668166608977, "grad_norm": 0.04447786882519722, "learning_rate": 3.272727473915197e-05, "loss": 0.0034, "step": 288 }, { "epoch": 1.066458982346833, "grad_norm": 0.22201436758041382, "learning_rate": 3.266085397575406e-05, "loss": 0.0123, "step": 289 }, { "epoch": 1.070151148032768, "grad_norm": 0.0670245811343193, "learning_rate": 3.259419936776833e-05, "loss": 0.0088, "step": 290 }, { "epoch": 1.0738433137187031, "grad_norm": 0.06902515143156052, "learning_rate": 3.25273121462981e-05, "loss": 0.0025, "step": 291 }, { "epoch": 1.0775354794046383, "grad_norm": 0.1305130422115326, "learning_rate": 3.246019354674303e-05, "loss": 0.0093, "step": 292 }, { "epoch": 1.0812276450905733, "grad_norm": 0.11023421585559845, "learning_rate": 3.239284480877632e-05, "loss": 0.0085, "step": 293 }, { "epoch": 1.0849198107765086, "grad_norm": 0.07404778897762299, "learning_rate": 3.232526717632178e-05, "loss": 0.0045, "step": 294 }, { "epoch": 1.0886119764624438, "grad_norm": 0.054306432604789734, "learning_rate": 3.22574618975309e-05, "loss": 0.0032, "step": 295 }, { "epoch": 1.092304142148379, "grad_norm": 0.05058205872774124, "learning_rate": 3.218943022475975e-05, "loss": 0.0046, "step": 296 }, { "epoch": 1.095996307834314, "grad_norm": 0.04942139610648155, "learning_rate": 3.2121173414545886e-05, "loss": 0.004, "step": 297 }, { "epoch": 1.0996884735202492, "grad_norm": 0.04936928302049637, "learning_rate": 3.205269272758513e-05, "loss": 0.0038, "step": 298 }, { "epoch": 1.1033806392061845, "grad_norm": 0.06841211020946503, "learning_rate": 3.198398942870828e-05, "loss": 0.005, "step": 299 }, { "epoch": 1.1070728048921195, "grad_norm": 0.11358197033405304, "learning_rate": 3.1915064786857745e-05, "loss": 0.0189, "step": 300 }, { "epoch": 1.1107649705780547, "grad_norm": 0.047055114060640335, "learning_rate": 3.1845920075064115e-05, "loss": 0.0022, "step": 301 }, { "epoch": 1.11445713626399, "grad_norm": 0.06010272353887558, "learning_rate": 3.177655657042266e-05, "loss": 0.0042, "step": 302 }, { "epoch": 1.118149301949925, "grad_norm": 0.24772150814533234, "learning_rate": 3.170697555406972e-05, "loss": 0.0104, "step": 303 }, { "epoch": 1.1218414676358601, "grad_norm": 0.04649265855550766, "learning_rate": 3.163717831115906e-05, "loss": 0.0032, "step": 304 }, { "epoch": 1.1255336333217953, "grad_norm": 0.0447462759912014, "learning_rate": 3.156716613083811e-05, "loss": 0.0038, "step": 305 }, { "epoch": 1.1292257990077306, "grad_norm": 0.09475228935480118, "learning_rate": 3.1496940306224185e-05, "loss": 0.0107, "step": 306 }, { "epoch": 1.1329179646936656, "grad_norm": 0.08427289873361588, "learning_rate": 3.14265021343806e-05, "loss": 0.0174, "step": 307 }, { "epoch": 1.1366101303796008, "grad_norm": 0.05597711727023125, "learning_rate": 3.1355852916292654e-05, "loss": 0.0089, "step": 308 }, { "epoch": 1.140302296065536, "grad_norm": 0.0892227441072464, "learning_rate": 3.1284993956843685e-05, "loss": 0.0135, "step": 309 }, { "epoch": 1.143994461751471, "grad_norm": 0.05312884971499443, "learning_rate": 3.121392656479094e-05, "loss": 0.0043, "step": 310 }, { "epoch": 1.1476866274374062, "grad_norm": 0.047065041959285736, "learning_rate": 3.114265205274135e-05, "loss": 0.0031, "step": 311 }, { "epoch": 1.1513787931233415, "grad_norm": 0.05725245550274849, "learning_rate": 3.1071171737127375e-05, "loss": 0.0035, "step": 312 }, { "epoch": 1.1550709588092767, "grad_norm": 0.04488014802336693, "learning_rate": 3.0999486938182605e-05, "loss": 0.003, "step": 313 }, { "epoch": 1.1587631244952117, "grad_norm": 0.047577228397130966, "learning_rate": 3.0927598979917454e-05, "loss": 0.0031, "step": 314 }, { "epoch": 1.162455290181147, "grad_norm": 0.0874548926949501, "learning_rate": 3.085550919009464e-05, "loss": 0.0102, "step": 315 }, { "epoch": 1.1661474558670821, "grad_norm": 0.19847019016742706, "learning_rate": 3.078321890020469e-05, "loss": 0.008, "step": 316 }, { "epoch": 1.1698396215530171, "grad_norm": 0.052011147141456604, "learning_rate": 3.071072944544135e-05, "loss": 0.0033, "step": 317 }, { "epoch": 1.1735317872389524, "grad_norm": 0.12744103372097015, "learning_rate": 3.0638042164676915e-05, "loss": 0.0048, "step": 318 }, { "epoch": 1.1772239529248876, "grad_norm": 0.0666290894150734, "learning_rate": 3.0565158400437525e-05, "loss": 0.0083, "step": 319 }, { "epoch": 1.1809161186108226, "grad_norm": 0.06896385550498962, "learning_rate": 3.0492079498878318e-05, "loss": 0.0067, "step": 320 }, { "epoch": 1.1846082842967578, "grad_norm": 0.07686945050954819, "learning_rate": 3.041880680975861e-05, "loss": 0.0046, "step": 321 }, { "epoch": 1.188300449982693, "grad_norm": 0.08590448647737503, "learning_rate": 3.0345341686416955e-05, "loss": 0.0045, "step": 322 }, { "epoch": 1.191992615668628, "grad_norm": 0.08735356479883194, "learning_rate": 3.0271685485746154e-05, "loss": 0.0051, "step": 323 }, { "epoch": 1.1956847813545632, "grad_norm": 0.05842231586575508, "learning_rate": 3.0197839568168167e-05, "loss": 0.0035, "step": 324 }, { "epoch": 1.1993769470404985, "grad_norm": 0.05802956968545914, "learning_rate": 3.0123805297609005e-05, "loss": 0.0065, "step": 325 }, { "epoch": 1.2030691127264337, "grad_norm": 0.08226858079433441, "learning_rate": 3.004958404147356e-05, "loss": 0.0044, "step": 326 }, { "epoch": 1.2067612784123687, "grad_norm": 0.04885147139430046, "learning_rate": 2.9975177170620307e-05, "loss": 0.0049, "step": 327 }, { "epoch": 1.210453444098304, "grad_norm": 0.24907778203487396, "learning_rate": 2.9900586059336008e-05, "loss": 0.0248, "step": 328 }, { "epoch": 1.2141456097842391, "grad_norm": 0.06446171551942825, "learning_rate": 2.9825812085310327e-05, "loss": 0.0036, "step": 329 }, { "epoch": 1.2178377754701741, "grad_norm": 4.368242263793945, "learning_rate": 2.975085662961039e-05, "loss": 0.0103, "step": 330 }, { "epoch": 1.2215299411561094, "grad_norm": 0.06198897585272789, "learning_rate": 2.967572107665526e-05, "loss": 0.0035, "step": 331 }, { "epoch": 1.2252221068420446, "grad_norm": 0.13626688718795776, "learning_rate": 2.960040681419039e-05, "loss": 0.0053, "step": 332 }, { "epoch": 1.2289142725279798, "grad_norm": 0.19492781162261963, "learning_rate": 2.9524915233261944e-05, "loss": 0.0084, "step": 333 }, { "epoch": 1.2326064382139148, "grad_norm": 0.2215728908777237, "learning_rate": 2.944924772819119e-05, "loss": 0.0086, "step": 334 }, { "epoch": 1.23629860389985, "grad_norm": 0.2793082892894745, "learning_rate": 2.9373405696548656e-05, "loss": 0.005, "step": 335 }, { "epoch": 1.2399907695857852, "grad_norm": 0.04554106295108795, "learning_rate": 2.9297390539128364e-05, "loss": 0.004, "step": 336 }, { "epoch": 1.2436829352717202, "grad_norm": 0.05815259367227554, "learning_rate": 2.922120365992196e-05, "loss": 0.0052, "step": 337 }, { "epoch": 1.2473751009576555, "grad_norm": 0.07114019244909286, "learning_rate": 2.9144846466092773e-05, "loss": 0.0079, "step": 338 }, { "epoch": 1.2510672666435907, "grad_norm": 0.0655088722705841, "learning_rate": 2.9068320367949817e-05, "loss": 0.0057, "step": 339 }, { "epoch": 1.254759432329526, "grad_norm": 1.1501847505569458, "learning_rate": 2.899162677892175e-05, "loss": 0.0089, "step": 340 }, { "epoch": 1.258451598015461, "grad_norm": 0.06778527051210403, "learning_rate": 2.891476711553077e-05, "loss": 0.0065, "step": 341 }, { "epoch": 1.2621437637013961, "grad_norm": 0.12092837691307068, "learning_rate": 2.8837742797366454e-05, "loss": 0.0112, "step": 342 }, { "epoch": 1.2658359293873311, "grad_norm": 0.08216589689254761, "learning_rate": 2.876055524705953e-05, "loss": 0.0085, "step": 343 }, { "epoch": 1.2695280950732664, "grad_norm": 0.0777222067117691, "learning_rate": 2.8683205890255613e-05, "loss": 0.0037, "step": 344 }, { "epoch": 1.2732202607592016, "grad_norm": 0.08206533640623093, "learning_rate": 2.8605696155588855e-05, "loss": 0.0061, "step": 345 }, { "epoch": 1.2769124264451368, "grad_norm": 0.09279124438762665, "learning_rate": 2.852802747465558e-05, "loss": 0.0054, "step": 346 }, { "epoch": 1.2806045921310718, "grad_norm": 0.16200777888298035, "learning_rate": 2.845020128198782e-05, "loss": 0.0099, "step": 347 }, { "epoch": 1.284296757817007, "grad_norm": 0.2419300675392151, "learning_rate": 2.837221901502685e-05, "loss": 0.0171, "step": 348 }, { "epoch": 1.2879889235029423, "grad_norm": 0.11904854327440262, "learning_rate": 2.8294082114096607e-05, "loss": 0.0187, "step": 349 }, { "epoch": 1.2916810891888773, "grad_norm": 0.07375206053256989, "learning_rate": 2.8215792022377092e-05, "loss": 0.0154, "step": 350 }, { "epoch": 1.2953732548748125, "grad_norm": 0.060616642236709595, "learning_rate": 2.8137350185877744e-05, "loss": 0.0031, "step": 351 }, { "epoch": 1.2990654205607477, "grad_norm": 0.04948243498802185, "learning_rate": 2.8058758053410704e-05, "loss": 0.0025, "step": 352 }, { "epoch": 1.302757586246683, "grad_norm": 0.07605982571840286, "learning_rate": 2.7980017076564053e-05, "loss": 0.0045, "step": 353 }, { "epoch": 1.306449751932618, "grad_norm": 0.06741812080144882, "learning_rate": 2.7901128709675025e-05, "loss": 0.005, "step": 354 }, { "epoch": 1.3101419176185531, "grad_norm": 0.09975893050432205, "learning_rate": 2.782209440980312e-05, "loss": 0.0067, "step": 355 }, { "epoch": 1.3138340833044884, "grad_norm": 0.06588315218687057, "learning_rate": 2.774291563670322e-05, "loss": 0.0027, "step": 356 }, { "epoch": 1.3175262489904234, "grad_norm": 0.11582572758197784, "learning_rate": 2.766359385279859e-05, "loss": 0.0047, "step": 357 }, { "epoch": 1.3212184146763586, "grad_norm": 0.05676430091261864, "learning_rate": 2.7584130523153906e-05, "loss": 0.0022, "step": 358 }, { "epoch": 1.3249105803622938, "grad_norm": 0.07599082589149475, "learning_rate": 2.7504527115448176e-05, "loss": 0.0047, "step": 359 }, { "epoch": 1.328602746048229, "grad_norm": 0.053951650857925415, "learning_rate": 2.742478509994763e-05, "loss": 0.0031, "step": 360 }, { "epoch": 1.332294911734164, "grad_norm": 0.05379689112305641, "learning_rate": 2.7344905949478557e-05, "loss": 0.0034, "step": 361 }, { "epoch": 1.3359870774200993, "grad_norm": 0.08939212560653687, "learning_rate": 2.7264891139400155e-05, "loss": 0.0103, "step": 362 }, { "epoch": 1.3396792431060343, "grad_norm": 0.05766845494508743, "learning_rate": 2.718474214757719e-05, "loss": 0.0036, "step": 363 }, { "epoch": 1.3433714087919695, "grad_norm": 0.11903363466262817, "learning_rate": 2.710446045435278e-05, "loss": 0.0057, "step": 364 }, { "epoch": 1.3470635744779047, "grad_norm": 0.07542143017053604, "learning_rate": 2.7024047542521014e-05, "loss": 0.0085, "step": 365 }, { "epoch": 1.35075574016384, "grad_norm": 0.08536005765199661, "learning_rate": 2.694350489729958e-05, "loss": 0.0144, "step": 366 }, { "epoch": 1.3544479058497751, "grad_norm": 0.09188759326934814, "learning_rate": 2.6862834006302324e-05, "loss": 0.0083, "step": 367 }, { "epoch": 1.3581400715357101, "grad_norm": 0.1899387389421463, "learning_rate": 2.678203635951177e-05, "loss": 0.0084, "step": 368 }, { "epoch": 1.3581400715357101, "eval_loss": 0.008687354624271393, "eval_runtime": 90.5037, "eval_samples_per_second": 10.088, "eval_steps_per_second": 5.05, "step": 368 }, { "epoch": 1.3618322372216454, "grad_norm": 0.046323299407958984, "learning_rate": 2.6701113449251618e-05, "loss": 0.0044, "step": 369 }, { "epoch": 1.3655244029075804, "grad_norm": 0.06219512224197388, "learning_rate": 2.6620066770159178e-05, "loss": 0.0032, "step": 370 }, { "epoch": 1.3692165685935156, "grad_norm": 0.1851065307855606, "learning_rate": 2.6538897819157733e-05, "loss": 0.005, "step": 371 }, { "epoch": 1.3729087342794508, "grad_norm": 0.12302684038877487, "learning_rate": 2.6457608095428925e-05, "loss": 0.0056, "step": 372 }, { "epoch": 1.376600899965386, "grad_norm": 0.06654980778694153, "learning_rate": 2.6376199100385074e-05, "loss": 0.0049, "step": 373 }, { "epoch": 1.380293065651321, "grad_norm": 0.08494460582733154, "learning_rate": 2.62946723376414e-05, "loss": 0.011, "step": 374 }, { "epoch": 1.3839852313372563, "grad_norm": 0.08226186037063599, "learning_rate": 2.6213029312988294e-05, "loss": 0.008, "step": 375 }, { "epoch": 1.3876773970231915, "grad_norm": 0.06261271983385086, "learning_rate": 2.6131271534363497e-05, "loss": 0.0063, "step": 376 }, { "epoch": 1.3913695627091265, "grad_norm": 0.040595002472400665, "learning_rate": 2.604940051182422e-05, "loss": 0.0029, "step": 377 }, { "epoch": 1.3950617283950617, "grad_norm": 0.054169923067092896, "learning_rate": 2.596741775751931e-05, "loss": 0.0023, "step": 378 }, { "epoch": 1.398753894080997, "grad_norm": 0.06638327986001968, "learning_rate": 2.5885324785661263e-05, "loss": 0.0059, "step": 379 }, { "epoch": 1.4024460597669322, "grad_norm": 0.08735162764787674, "learning_rate": 2.580312311249828e-05, "loss": 0.0053, "step": 380 }, { "epoch": 1.4061382254528672, "grad_norm": 0.03515574708580971, "learning_rate": 2.572081425628628e-05, "loss": 0.0026, "step": 381 }, { "epoch": 1.4098303911388024, "grad_norm": 0.07844855636358261, "learning_rate": 2.5638399737260837e-05, "loss": 0.0071, "step": 382 }, { "epoch": 1.4135225568247374, "grad_norm": 0.05010690912604332, "learning_rate": 2.555588107760909e-05, "loss": 0.0032, "step": 383 }, { "epoch": 1.4172147225106726, "grad_norm": 0.048177916556596756, "learning_rate": 2.5473259801441663e-05, "loss": 0.0027, "step": 384 }, { "epoch": 1.4209068881966078, "grad_norm": 0.073530413210392, "learning_rate": 2.5390537434764483e-05, "loss": 0.0066, "step": 385 }, { "epoch": 1.424599053882543, "grad_norm": 0.13586187362670898, "learning_rate": 2.530771550545061e-05, "loss": 0.0111, "step": 386 }, { "epoch": 1.4282912195684783, "grad_norm": 0.08986911922693253, "learning_rate": 2.522479554321203e-05, "loss": 0.003, "step": 387 }, { "epoch": 1.4319833852544133, "grad_norm": 0.09543124586343765, "learning_rate": 2.5141779079571366e-05, "loss": 0.0058, "step": 388 }, { "epoch": 1.4356755509403485, "grad_norm": 0.06117438152432442, "learning_rate": 2.5058667647833615e-05, "loss": 0.0031, "step": 389 }, { "epoch": 1.4393677166262835, "grad_norm": 0.05431349202990532, "learning_rate": 2.4975462783057837e-05, "loss": 0.006, "step": 390 }, { "epoch": 1.4430598823122187, "grad_norm": 0.04814140498638153, "learning_rate": 2.4892166022028778e-05, "loss": 0.0026, "step": 391 }, { "epoch": 1.446752047998154, "grad_norm": 0.04245537519454956, "learning_rate": 2.4808778903228506e-05, "loss": 0.0024, "step": 392 }, { "epoch": 1.4504442136840892, "grad_norm": 0.027589252218604088, "learning_rate": 2.472530296680797e-05, "loss": 0.0014, "step": 393 }, { "epoch": 1.4541363793700242, "grad_norm": 0.053102582693099976, "learning_rate": 2.4641739754558594e-05, "loss": 0.0051, "step": 394 }, { "epoch": 1.4578285450559594, "grad_norm": 0.04420861601829529, "learning_rate": 2.4558090809883767e-05, "loss": 0.0053, "step": 395 }, { "epoch": 1.4615207107418946, "grad_norm": 0.07793322950601578, "learning_rate": 2.4474357677770336e-05, "loss": 0.013, "step": 396 }, { "epoch": 1.4652128764278296, "grad_norm": 0.08467935770750046, "learning_rate": 2.4390541904760105e-05, "loss": 0.0059, "step": 397 }, { "epoch": 1.4689050421137648, "grad_norm": 0.10742378234863281, "learning_rate": 2.430664503892122e-05, "loss": 0.0077, "step": 398 }, { "epoch": 1.4725972077997, "grad_norm": 0.04297586902976036, "learning_rate": 2.4222668629819622e-05, "loss": 0.0021, "step": 399 }, { "epoch": 1.4762893734856353, "grad_norm": 0.047660425305366516, "learning_rate": 2.4138614228490395e-05, "loss": 0.0048, "step": 400 }, { "epoch": 1.4799815391715703, "grad_norm": 0.046939097344875336, "learning_rate": 2.4054483387409135e-05, "loss": 0.0024, "step": 401 }, { "epoch": 1.4836737048575055, "grad_norm": 0.06429211050271988, "learning_rate": 2.3970277660463275e-05, "loss": 0.0032, "step": 402 }, { "epoch": 1.4873658705434407, "grad_norm": 0.053521350026130676, "learning_rate": 2.3885998602923387e-05, "loss": 0.0033, "step": 403 }, { "epoch": 1.4910580362293757, "grad_norm": 0.05599725991487503, "learning_rate": 2.380164777141443e-05, "loss": 0.0035, "step": 404 }, { "epoch": 1.494750201915311, "grad_norm": 0.055872391909360886, "learning_rate": 2.3717226723887037e-05, "loss": 0.0041, "step": 405 }, { "epoch": 1.4984423676012462, "grad_norm": 0.0680844634771347, "learning_rate": 2.363273701958873e-05, "loss": 0.0043, "step": 406 }, { "epoch": 1.5021345332871814, "grad_norm": 0.14843975007534027, "learning_rate": 2.35481802190351e-05, "loss": 0.0105, "step": 407 }, { "epoch": 1.5058266989731164, "grad_norm": 0.05503234639763832, "learning_rate": 2.3463557883980995e-05, "loss": 0.0046, "step": 408 }, { "epoch": 1.5095188646590516, "grad_norm": 0.11701611429452896, "learning_rate": 2.337887157739169e-05, "loss": 0.0338, "step": 409 }, { "epoch": 1.5132110303449866, "grad_norm": 0.08817101269960403, "learning_rate": 2.3294122863414e-05, "loss": 0.0129, "step": 410 }, { "epoch": 1.5169031960309218, "grad_norm": 0.08257415890693665, "learning_rate": 2.3209313307347413e-05, "loss": 0.0104, "step": 411 }, { "epoch": 1.520595361716857, "grad_norm": 0.05527732893824577, "learning_rate": 2.312444447561514e-05, "loss": 0.0038, "step": 412 }, { "epoch": 1.5242875274027923, "grad_norm": 0.06380025297403336, "learning_rate": 2.3039517935735215e-05, "loss": 0.0051, "step": 413 }, { "epoch": 1.5279796930887275, "grad_norm": 0.040247559547424316, "learning_rate": 2.2954535256291554e-05, "loss": 0.0027, "step": 414 }, { "epoch": 1.5316718587746625, "grad_norm": 0.04606495052576065, "learning_rate": 2.2869498006904934e-05, "loss": 0.0043, "step": 415 }, { "epoch": 1.5353640244605977, "grad_norm": 0.051601577550172806, "learning_rate": 2.2784407758204054e-05, "loss": 0.0039, "step": 416 }, { "epoch": 1.5390561901465327, "grad_norm": 0.06773068755865097, "learning_rate": 2.2699266081796493e-05, "loss": 0.0042, "step": 417 }, { "epoch": 1.542748355832468, "grad_norm": 0.06343487650156021, "learning_rate": 2.2614074550239707e-05, "loss": 0.0053, "step": 418 }, { "epoch": 1.5464405215184032, "grad_norm": 0.04785839468240738, "learning_rate": 2.2528834737011963e-05, "loss": 0.0057, "step": 419 }, { "epoch": 1.5501326872043384, "grad_norm": 0.06275342404842377, "learning_rate": 2.2443548216483292e-05, "loss": 0.0049, "step": 420 }, { "epoch": 1.5538248528902736, "grad_norm": 0.06469978392124176, "learning_rate": 2.235821656388638e-05, "loss": 0.0049, "step": 421 }, { "epoch": 1.5575170185762086, "grad_norm": 0.07403778284788132, "learning_rate": 2.2272841355287526e-05, "loss": 0.01, "step": 422 }, { "epoch": 1.5612091842621436, "grad_norm": 0.041104815900325775, "learning_rate": 2.2187424167557496e-05, "loss": 0.0027, "step": 423 }, { "epoch": 1.5649013499480788, "grad_norm": 0.09375158697366714, "learning_rate": 2.210196657834239e-05, "loss": 0.0037, "step": 424 }, { "epoch": 1.568593515634014, "grad_norm": 0.043952830135822296, "learning_rate": 2.2016470166034544e-05, "loss": 0.0034, "step": 425 }, { "epoch": 1.5722856813199493, "grad_norm": 0.07946529239416122, "learning_rate": 2.193093650974334e-05, "loss": 0.0103, "step": 426 }, { "epoch": 1.5759778470058845, "grad_norm": 0.09605729579925537, "learning_rate": 2.184536718926604e-05, "loss": 0.0063, "step": 427 }, { "epoch": 1.5796700126918195, "grad_norm": 0.04163216054439545, "learning_rate": 2.175976378505865e-05, "loss": 0.0036, "step": 428 }, { "epoch": 1.5833621783777547, "grad_norm": 0.036647167056798935, "learning_rate": 2.1674127878206664e-05, "loss": 0.0037, "step": 429 }, { "epoch": 1.5870543440636897, "grad_norm": 0.06975825875997543, "learning_rate": 2.1588461050395918e-05, "loss": 0.0077, "step": 430 }, { "epoch": 1.590746509749625, "grad_norm": 0.06405274569988251, "learning_rate": 2.1502764883883355e-05, "loss": 0.0085, "step": 431 }, { "epoch": 1.5944386754355602, "grad_norm": 0.09269712120294571, "learning_rate": 2.141704096146779e-05, "loss": 0.0192, "step": 432 }, { "epoch": 1.5981308411214954, "grad_norm": 0.07872427254915237, "learning_rate": 2.133129086646069e-05, "loss": 0.0115, "step": 433 }, { "epoch": 1.6018230068074306, "grad_norm": 0.053590744733810425, "learning_rate": 2.1245516182656938e-05, "loss": 0.0039, "step": 434 }, { "epoch": 1.6055151724933656, "grad_norm": 0.05257750675082207, "learning_rate": 2.1159718494305547e-05, "loss": 0.005, "step": 435 }, { "epoch": 1.6092073381793008, "grad_norm": 0.05436495319008827, "learning_rate": 2.107389938608045e-05, "loss": 0.0044, "step": 436 }, { "epoch": 1.6128995038652358, "grad_norm": 0.063501738011837, "learning_rate": 2.0988060443051165e-05, "loss": 0.0059, "step": 437 }, { "epoch": 1.616591669551171, "grad_norm": 0.06863530725240707, "learning_rate": 2.0902203250653596e-05, "loss": 0.0092, "step": 438 }, { "epoch": 1.6202838352371063, "grad_norm": 0.06638474762439728, "learning_rate": 2.0816329394660696e-05, "loss": 0.0031, "step": 439 }, { "epoch": 1.6239760009230415, "grad_norm": 0.03661806881427765, "learning_rate": 2.0730440461153183e-05, "loss": 0.0036, "step": 440 }, { "epoch": 1.6276681666089767, "grad_norm": 0.05380409210920334, "learning_rate": 2.0644538036490257e-05, "loss": 0.0062, "step": 441 }, { "epoch": 1.6313603322949117, "grad_norm": 0.0474727526307106, "learning_rate": 2.0558623707280313e-05, "loss": 0.0033, "step": 442 }, { "epoch": 1.6350524979808467, "grad_norm": 0.07232918590307236, "learning_rate": 2.0472699060351614e-05, "loss": 0.0035, "step": 443 }, { "epoch": 1.638744663666782, "grad_norm": 0.16932211816310883, "learning_rate": 2.038676568272298e-05, "loss": 0.0054, "step": 444 }, { "epoch": 1.6424368293527172, "grad_norm": 0.08939804881811142, "learning_rate": 2.03008251615745e-05, "loss": 0.0061, "step": 445 }, { "epoch": 1.6461289950386524, "grad_norm": 0.048010073602199554, "learning_rate": 2.0214879084218193e-05, "loss": 0.0033, "step": 446 }, { "epoch": 1.6498211607245876, "grad_norm": 0.08143167942762375, "learning_rate": 2.0128929038068716e-05, "loss": 0.0123, "step": 447 }, { "epoch": 1.6535133264105226, "grad_norm": 0.041366055607795715, "learning_rate": 2.0042976610614006e-05, "loss": 0.0022, "step": 448 }, { "epoch": 1.6572054920964578, "grad_norm": 0.06036004796624184, "learning_rate": 1.9957023389385998e-05, "loss": 0.0031, "step": 449 }, { "epoch": 1.6608976577823928, "grad_norm": 0.06090189516544342, "learning_rate": 1.9871070961931294e-05, "loss": 0.0046, "step": 450 }, { "epoch": 1.664589823468328, "grad_norm": 0.06394810229539871, "learning_rate": 1.9785120915781813e-05, "loss": 0.0055, "step": 451 }, { "epoch": 1.6682819891542633, "grad_norm": 0.05126480385661125, "learning_rate": 1.9699174838425502e-05, "loss": 0.0028, "step": 452 }, { "epoch": 1.6719741548401985, "grad_norm": 0.16779179871082306, "learning_rate": 1.961323431727703e-05, "loss": 0.0138, "step": 453 }, { "epoch": 1.6756663205261337, "grad_norm": 0.04399624094367027, "learning_rate": 1.9527300939648396e-05, "loss": 0.0028, "step": 454 }, { "epoch": 1.6793584862120687, "grad_norm": 0.07073678821325302, "learning_rate": 1.9441376292719687e-05, "loss": 0.0042, "step": 455 }, { "epoch": 1.683050651898004, "grad_norm": 0.06449951231479645, "learning_rate": 1.935546196350975e-05, "loss": 0.0047, "step": 456 }, { "epoch": 1.686742817583939, "grad_norm": 0.05503733456134796, "learning_rate": 1.9269559538846823e-05, "loss": 0.0054, "step": 457 }, { "epoch": 1.6904349832698742, "grad_norm": 0.0840897262096405, "learning_rate": 1.9183670605339314e-05, "loss": 0.0096, "step": 458 }, { "epoch": 1.6941271489558094, "grad_norm": 0.06032564863562584, "learning_rate": 1.909779674934641e-05, "loss": 0.0039, "step": 459 }, { "epoch": 1.6978193146417446, "grad_norm": 0.08506152033805847, "learning_rate": 1.9011939556948835e-05, "loss": 0.0061, "step": 460 }, { "epoch": 1.6978193146417446, "eval_loss": 0.007821443490684032, "eval_runtime": 90.0445, "eval_samples_per_second": 10.139, "eval_steps_per_second": 5.075, "step": 460 }, { "epoch": 1.7015114803276798, "grad_norm": 0.09359995275735855, "learning_rate": 1.8926100613919565e-05, "loss": 0.0137, "step": 461 }, { "epoch": 1.7052036460136148, "grad_norm": 0.050462689250707626, "learning_rate": 1.884028150569446e-05, "loss": 0.0026, "step": 462 }, { "epoch": 1.70889581169955, "grad_norm": 0.05139093101024628, "learning_rate": 1.8754483817343065e-05, "loss": 0.0038, "step": 463 }, { "epoch": 1.712587977385485, "grad_norm": 0.05640941858291626, "learning_rate": 1.8668709133539316e-05, "loss": 0.0048, "step": 464 }, { "epoch": 1.7162801430714203, "grad_norm": 0.0617087222635746, "learning_rate": 1.8582959038532216e-05, "loss": 0.0066, "step": 465 }, { "epoch": 1.7199723087573555, "grad_norm": 0.05858307704329491, "learning_rate": 1.8497235116116656e-05, "loss": 0.0042, "step": 466 }, { "epoch": 1.7236644744432907, "grad_norm": 0.06570729613304138, "learning_rate": 1.841153894960409e-05, "loss": 0.0069, "step": 467 }, { "epoch": 1.727356640129226, "grad_norm": 0.03923754021525383, "learning_rate": 1.8325872121793343e-05, "loss": 0.0038, "step": 468 }, { "epoch": 1.731048805815161, "grad_norm": 0.04980519786477089, "learning_rate": 1.824023621494136e-05, "loss": 0.0033, "step": 469 }, { "epoch": 1.734740971501096, "grad_norm": 0.0408487394452095, "learning_rate": 1.815463281073396e-05, "loss": 0.0025, "step": 470 }, { "epoch": 1.7384331371870312, "grad_norm": 0.07105151563882828, "learning_rate": 1.8069063490256668e-05, "loss": 0.0055, "step": 471 }, { "epoch": 1.7421253028729664, "grad_norm": 0.048975639045238495, "learning_rate": 1.7983529833965463e-05, "loss": 0.0036, "step": 472 }, { "epoch": 1.7458174685589016, "grad_norm": 0.034441813826560974, "learning_rate": 1.7898033421657616e-05, "loss": 0.0029, "step": 473 }, { "epoch": 1.7495096342448369, "grad_norm": 0.07011700421571732, "learning_rate": 1.7812575832442518e-05, "loss": 0.0097, "step": 474 }, { "epoch": 1.7532017999307719, "grad_norm": 0.058152489364147186, "learning_rate": 1.7727158644712484e-05, "loss": 0.0092, "step": 475 }, { "epoch": 1.756893965616707, "grad_norm": 0.049807388335466385, "learning_rate": 1.764178343611363e-05, "loss": 0.0036, "step": 476 }, { "epoch": 1.760586131302642, "grad_norm": 0.05702248960733414, "learning_rate": 1.755645178351672e-05, "loss": 0.0091, "step": 477 }, { "epoch": 1.7642782969885773, "grad_norm": 0.051516178995370865, "learning_rate": 1.747116526298804e-05, "loss": 0.0044, "step": 478 }, { "epoch": 1.7679704626745125, "grad_norm": 0.09747370332479477, "learning_rate": 1.7385925449760296e-05, "loss": 0.0065, "step": 479 }, { "epoch": 1.7716626283604477, "grad_norm": 0.05502758547663689, "learning_rate": 1.7300733918203514e-05, "loss": 0.0151, "step": 480 }, { "epoch": 1.775354794046383, "grad_norm": 0.05905942991375923, "learning_rate": 1.7215592241795956e-05, "loss": 0.0029, "step": 481 }, { "epoch": 1.779046959732318, "grad_norm": 0.07898251712322235, "learning_rate": 1.7130501993095076e-05, "loss": 0.0034, "step": 482 }, { "epoch": 1.7827391254182532, "grad_norm": 0.0482512004673481, "learning_rate": 1.7045464743708456e-05, "loss": 0.0065, "step": 483 }, { "epoch": 1.7864312911041882, "grad_norm": 0.07192537188529968, "learning_rate": 1.6960482064264788e-05, "loss": 0.0076, "step": 484 }, { "epoch": 1.7901234567901234, "grad_norm": 0.08650217205286026, "learning_rate": 1.687555552438487e-05, "loss": 0.0209, "step": 485 }, { "epoch": 1.7938156224760586, "grad_norm": 0.06443698704242706, "learning_rate": 1.679068669265259e-05, "loss": 0.0148, "step": 486 }, { "epoch": 1.7975077881619939, "grad_norm": 0.04942217096686363, "learning_rate": 1.6705877136586e-05, "loss": 0.0048, "step": 487 }, { "epoch": 1.801199953847929, "grad_norm": 0.04607919976115227, "learning_rate": 1.6621128422608318e-05, "loss": 0.0034, "step": 488 }, { "epoch": 1.804892119533864, "grad_norm": 0.06549520045518875, "learning_rate": 1.6536442116019012e-05, "loss": 0.0067, "step": 489 }, { "epoch": 1.808584285219799, "grad_norm": 0.1073460504412651, "learning_rate": 1.6451819780964912e-05, "loss": 0.0181, "step": 490 }, { "epoch": 1.8122764509057343, "grad_norm": 0.03819148242473602, "learning_rate": 1.6367262980411273e-05, "loss": 0.0031, "step": 491 }, { "epoch": 1.8159686165916695, "grad_norm": 0.05567432940006256, "learning_rate": 1.6282773276112963e-05, "loss": 0.0032, "step": 492 }, { "epoch": 1.8196607822776047, "grad_norm": 0.0490952767431736, "learning_rate": 1.619835222858558e-05, "loss": 0.0022, "step": 493 }, { "epoch": 1.82335294796354, "grad_norm": 0.06697966158390045, "learning_rate": 1.6114001397076623e-05, "loss": 0.0051, "step": 494 }, { "epoch": 1.827045113649475, "grad_norm": 0.06809218227863312, "learning_rate": 1.6029722339536725e-05, "loss": 0.0065, "step": 495 }, { "epoch": 1.8307372793354102, "grad_norm": 0.07054764032363892, "learning_rate": 1.5945516612590872e-05, "loss": 0.0051, "step": 496 }, { "epoch": 1.8344294450213452, "grad_norm": 0.03432456776499748, "learning_rate": 1.5861385771509612e-05, "loss": 0.0021, "step": 497 }, { "epoch": 1.8381216107072804, "grad_norm": 0.08520621806383133, "learning_rate": 1.5777331370180388e-05, "loss": 0.005, "step": 498 }, { "epoch": 1.8418137763932156, "grad_norm": 0.05386526510119438, "learning_rate": 1.5693354961078783e-05, "loss": 0.0048, "step": 499 }, { "epoch": 1.8455059420791509, "grad_norm": 0.07909571379423141, "learning_rate": 1.56094580952399e-05, "loss": 0.0068, "step": 500 }, { "epoch": 1.849198107765086, "grad_norm": 0.048736322671175, "learning_rate": 1.5525642322229667e-05, "loss": 0.0082, "step": 501 }, { "epoch": 1.852890273451021, "grad_norm": 0.05122867971658707, "learning_rate": 1.5441909190116237e-05, "loss": 0.0054, "step": 502 }, { "epoch": 1.8565824391369563, "grad_norm": 0.043926313519477844, "learning_rate": 1.535826024544141e-05, "loss": 0.0022, "step": 503 }, { "epoch": 1.8602746048228913, "grad_norm": 0.11847585439682007, "learning_rate": 1.5274697033192033e-05, "loss": 0.0228, "step": 504 }, { "epoch": 1.8639667705088265, "grad_norm": 0.06619201600551605, "learning_rate": 1.51912210967715e-05, "loss": 0.0038, "step": 505 }, { "epoch": 1.8676589361947618, "grad_norm": 0.11160582304000854, "learning_rate": 1.5107833977971227e-05, "loss": 0.0097, "step": 506 }, { "epoch": 1.871351101880697, "grad_norm": 0.053367406129837036, "learning_rate": 1.5024537216942166e-05, "loss": 0.0046, "step": 507 }, { "epoch": 1.8750432675666322, "grad_norm": 0.04304146394133568, "learning_rate": 1.4941332352166385e-05, "loss": 0.0038, "step": 508 }, { "epoch": 1.8787354332525672, "grad_norm": 0.06712479144334793, "learning_rate": 1.485822092042864e-05, "loss": 0.0094, "step": 509 }, { "epoch": 1.8824275989385022, "grad_norm": 0.07085831463336945, "learning_rate": 1.4775204456787973e-05, "loss": 0.0065, "step": 510 }, { "epoch": 1.8861197646244374, "grad_norm": 0.06869763880968094, "learning_rate": 1.469228449454939e-05, "loss": 0.0062, "step": 511 }, { "epoch": 1.8898119303103726, "grad_norm": 0.25166359543800354, "learning_rate": 1.4609462565235524e-05, "loss": 0.0039, "step": 512 }, { "epoch": 1.8935040959963079, "grad_norm": 0.05382119119167328, "learning_rate": 1.4526740198558345e-05, "loss": 0.0076, "step": 513 }, { "epoch": 1.897196261682243, "grad_norm": 0.03443971276283264, "learning_rate": 1.4444118922390921e-05, "loss": 0.0029, "step": 514 }, { "epoch": 1.9008884273681783, "grad_norm": 0.04824664443731308, "learning_rate": 1.4361600262739171e-05, "loss": 0.0034, "step": 515 }, { "epoch": 1.9045805930541133, "grad_norm": 0.05474744364619255, "learning_rate": 1.4279185743713721e-05, "loss": 0.0053, "step": 516 }, { "epoch": 1.9082727587400483, "grad_norm": 0.05808331444859505, "learning_rate": 1.419687688750173e-05, "loss": 0.0036, "step": 517 }, { "epoch": 1.9119649244259835, "grad_norm": 0.0522182323038578, "learning_rate": 1.4114675214338745e-05, "loss": 0.0029, "step": 518 }, { "epoch": 1.9156570901119188, "grad_norm": 0.04176926612854004, "learning_rate": 1.4032582242480692e-05, "loss": 0.0069, "step": 519 }, { "epoch": 1.919349255797854, "grad_norm": 0.3265964984893799, "learning_rate": 1.3950599488175783e-05, "loss": 0.0124, "step": 520 }, { "epoch": 1.9230414214837892, "grad_norm": 0.05882977694272995, "learning_rate": 1.3868728465636508e-05, "loss": 0.0048, "step": 521 }, { "epoch": 1.9267335871697242, "grad_norm": 0.04733727127313614, "learning_rate": 1.3786970687011713e-05, "loss": 0.0057, "step": 522 }, { "epoch": 1.9304257528556594, "grad_norm": 0.07429318130016327, "learning_rate": 1.3705327662358605e-05, "loss": 0.0154, "step": 523 }, { "epoch": 1.9341179185415944, "grad_norm": 0.04765889793634415, "learning_rate": 1.362380089961493e-05, "loss": 0.0025, "step": 524 }, { "epoch": 1.9378100842275297, "grad_norm": 0.11108744144439697, "learning_rate": 1.3542391904571082e-05, "loss": 0.013, "step": 525 }, { "epoch": 1.9415022499134649, "grad_norm": 0.05669174715876579, "learning_rate": 1.3461102180842274e-05, "loss": 0.0063, "step": 526 }, { "epoch": 1.9451944155994, "grad_norm": 0.04899504780769348, "learning_rate": 1.3379933229840827e-05, "loss": 0.0061, "step": 527 }, { "epoch": 1.9488865812853353, "grad_norm": 0.04838700219988823, "learning_rate": 1.3298886550748387e-05, "loss": 0.0059, "step": 528 }, { "epoch": 1.9525787469712703, "grad_norm": 0.06490358710289001, "learning_rate": 1.3217963640488232e-05, "loss": 0.0032, "step": 529 }, { "epoch": 1.9562709126572055, "grad_norm": 0.06235655024647713, "learning_rate": 1.3137165993697687e-05, "loss": 0.0052, "step": 530 }, { "epoch": 1.9599630783431405, "grad_norm": 0.06066396087408066, "learning_rate": 1.3056495102700426e-05, "loss": 0.0082, "step": 531 }, { "epoch": 1.9636552440290758, "grad_norm": 0.03929363191127777, "learning_rate": 1.2975952457478986e-05, "loss": 0.0035, "step": 532 }, { "epoch": 1.967347409715011, "grad_norm": 0.1256752759218216, "learning_rate": 1.2895539545647229e-05, "loss": 0.0051, "step": 533 }, { "epoch": 1.9710395754009462, "grad_norm": 0.04527255520224571, "learning_rate": 1.2815257852422818e-05, "loss": 0.0029, "step": 534 }, { "epoch": 1.9747317410868814, "grad_norm": 0.08779493719339371, "learning_rate": 1.2735108860599848e-05, "loss": 0.0051, "step": 535 }, { "epoch": 1.9784239067728164, "grad_norm": 0.061192456632852554, "learning_rate": 1.2655094050521447e-05, "loss": 0.0061, "step": 536 }, { "epoch": 1.9821160724587514, "grad_norm": 0.03906107693910599, "learning_rate": 1.2575214900052378e-05, "loss": 0.0035, "step": 537 }, { "epoch": 1.9858082381446867, "grad_norm": 0.05745285004377365, "learning_rate": 1.2495472884551836e-05, "loss": 0.0061, "step": 538 }, { "epoch": 1.9895004038306219, "grad_norm": 0.0978095754981041, "learning_rate": 1.2415869476846101e-05, "loss": 0.0058, "step": 539 }, { "epoch": 1.993192569516557, "grad_norm": 0.05764520913362503, "learning_rate": 1.2336406147201411e-05, "loss": 0.0038, "step": 540 }, { "epoch": 1.9968847352024923, "grad_norm": 0.04506031796336174, "learning_rate": 1.225708436329679e-05, "loss": 0.0066, "step": 541 }, { "epoch": 2.0, "grad_norm": 0.08254817128181458, "learning_rate": 1.2177905590196884e-05, "loss": 0.0048, "step": 542 }, { "epoch": 2.0036921656859352, "grad_norm": 0.050131332129240036, "learning_rate": 1.2098871290324974e-05, "loss": 0.0011, "step": 543 }, { "epoch": 2.0073843313718704, "grad_norm": 0.04068991169333458, "learning_rate": 1.2019982923435954e-05, "loss": 0.0048, "step": 544 }, { "epoch": 2.0110764970578057, "grad_norm": 0.04712502658367157, "learning_rate": 1.1941241946589299e-05, "loss": 0.0016, "step": 545 }, { "epoch": 2.0147686627437404, "grad_norm": 0.019259510561823845, "learning_rate": 1.1862649814122263e-05, "loss": 0.0013, "step": 546 }, { "epoch": 2.0184608284296757, "grad_norm": 0.06433498114347458, "learning_rate": 1.1784207977622914e-05, "loss": 0.0021, "step": 547 }, { "epoch": 2.022152994115611, "grad_norm": 0.054391320794820786, "learning_rate": 1.1705917885903402e-05, "loss": 0.0079, "step": 548 }, { "epoch": 2.025845159801546, "grad_norm": 0.04541629180312157, "learning_rate": 1.1627780984973153e-05, "loss": 0.0029, "step": 549 }, { "epoch": 2.0295373254874813, "grad_norm": 0.02488025464117527, "learning_rate": 1.1549798718012184e-05, "loss": 0.0013, "step": 550 }, { "epoch": 2.0332294911734166, "grad_norm": 0.06702622771263123, "learning_rate": 1.1471972525344421e-05, "loss": 0.0067, "step": 551 }, { "epoch": 2.036921656859352, "grad_norm": 0.04141972213983536, "learning_rate": 1.139430384441115e-05, "loss": 0.0037, "step": 552 }, { "epoch": 2.036921656859352, "eval_loss": 0.0076590548269450665, "eval_runtime": 89.9642, "eval_samples_per_second": 10.148, "eval_steps_per_second": 5.08, "step": 552 }, { "epoch": 2.0406138225452866, "grad_norm": 0.09934539347887039, "learning_rate": 1.1316794109744394e-05, "loss": 0.0017, "step": 553 }, { "epoch": 2.044305988231222, "grad_norm": 0.04188989847898483, "learning_rate": 1.1239444752940477e-05, "loss": 0.0017, "step": 554 }, { "epoch": 2.047998153917157, "grad_norm": 0.03835677355527878, "learning_rate": 1.1162257202633548e-05, "loss": 0.0054, "step": 555 }, { "epoch": 2.0516903196030922, "grad_norm": 0.04190275818109512, "learning_rate": 1.1085232884469236e-05, "loss": 0.0022, "step": 556 }, { "epoch": 2.0553824852890274, "grad_norm": 0.03239237144589424, "learning_rate": 1.1008373221078261e-05, "loss": 0.0018, "step": 557 }, { "epoch": 2.0590746509749627, "grad_norm": 0.022900836542248726, "learning_rate": 1.0931679632050186e-05, "loss": 0.0014, "step": 558 }, { "epoch": 2.0627668166608975, "grad_norm": 0.04315221309661865, "learning_rate": 1.085515353390723e-05, "loss": 0.001, "step": 559 }, { "epoch": 2.0664589823468327, "grad_norm": 0.03839712589979172, "learning_rate": 1.0778796340078043e-05, "loss": 0.0019, "step": 560 }, { "epoch": 2.070151148032768, "grad_norm": 0.04536756873130798, "learning_rate": 1.070260946087164e-05, "loss": 0.005, "step": 561 }, { "epoch": 2.073843313718703, "grad_norm": 0.0286678746342659, "learning_rate": 1.0626594303451359e-05, "loss": 0.0013, "step": 562 }, { "epoch": 2.0775354794046383, "grad_norm": 0.04558982327580452, "learning_rate": 1.0550752271808817e-05, "loss": 0.0028, "step": 563 }, { "epoch": 2.0812276450905736, "grad_norm": 0.08176779747009277, "learning_rate": 1.0475084766738051e-05, "loss": 0.0031, "step": 564 }, { "epoch": 2.084919810776509, "grad_norm": 0.07253772020339966, "learning_rate": 1.0399593185809625e-05, "loss": 0.0081, "step": 565 }, { "epoch": 2.0886119764624436, "grad_norm": 0.045510150492191315, "learning_rate": 1.0324278923344741e-05, "loss": 0.0022, "step": 566 }, { "epoch": 2.092304142148379, "grad_norm": 0.04695519059896469, "learning_rate": 1.0249143370389607e-05, "loss": 0.0029, "step": 567 }, { "epoch": 2.095996307834314, "grad_norm": 0.04093737155199051, "learning_rate": 1.0174187914689681e-05, "loss": 0.0038, "step": 568 }, { "epoch": 2.0996884735202492, "grad_norm": 0.026440149173140526, "learning_rate": 1.0099413940664e-05, "loss": 0.0011, "step": 569 }, { "epoch": 2.1033806392061845, "grad_norm": 0.040475890040397644, "learning_rate": 1.0024822829379701e-05, "loss": 0.0016, "step": 570 }, { "epoch": 2.1070728048921197, "grad_norm": 0.07037003338336945, "learning_rate": 9.950415958526449e-06, "loss": 0.0018, "step": 571 }, { "epoch": 2.110764970578055, "grad_norm": 0.04606116563081741, "learning_rate": 9.876194702390998e-06, "loss": 0.0018, "step": 572 }, { "epoch": 2.1144571362639897, "grad_norm": 0.03653557226061821, "learning_rate": 9.802160431831845e-06, "loss": 0.0011, "step": 573 }, { "epoch": 2.118149301949925, "grad_norm": 0.03161030635237694, "learning_rate": 9.728314514253856e-06, "loss": 0.0015, "step": 574 }, { "epoch": 2.12184146763586, "grad_norm": 0.054874520748853683, "learning_rate": 9.654658313583045e-06, "loss": 0.0084, "step": 575 }, { "epoch": 2.1255336333217953, "grad_norm": 0.07581143826246262, "learning_rate": 9.581193190241398e-06, "loss": 0.0031, "step": 576 }, { "epoch": 2.1292257990077306, "grad_norm": 0.049715541303157806, "learning_rate": 9.507920501121685e-06, "loss": 0.0018, "step": 577 }, { "epoch": 2.132917964693666, "grad_norm": 0.02939217910170555, "learning_rate": 9.434841599562487e-06, "loss": 0.0012, "step": 578 }, { "epoch": 2.1366101303796006, "grad_norm": 0.02201433666050434, "learning_rate": 9.361957835323088e-06, "loss": 0.0005, "step": 579 }, { "epoch": 2.140302296065536, "grad_norm": 0.03674834221601486, "learning_rate": 9.289270554558651e-06, "loss": 0.0018, "step": 580 }, { "epoch": 2.143994461751471, "grad_norm": 0.061158619821071625, "learning_rate": 9.216781099795322e-06, "loss": 0.0056, "step": 581 }, { "epoch": 2.1476866274374062, "grad_norm": 0.06538428366184235, "learning_rate": 9.144490809905365e-06, "loss": 0.0077, "step": 582 }, { "epoch": 2.1513787931233415, "grad_norm": 0.03936760872602463, "learning_rate": 9.072401020082542e-06, "loss": 0.0021, "step": 583 }, { "epoch": 2.1550709588092767, "grad_norm": 0.06418730318546295, "learning_rate": 9.0005130618174e-06, "loss": 0.0042, "step": 584 }, { "epoch": 2.158763124495212, "grad_norm": 0.027600156143307686, "learning_rate": 8.928828262872633e-06, "loss": 0.0008, "step": 585 }, { "epoch": 2.1624552901811467, "grad_norm": 0.02378762699663639, "learning_rate": 8.857347947258657e-06, "loss": 0.0008, "step": 586 }, { "epoch": 2.166147455867082, "grad_norm": 0.05525508150458336, "learning_rate": 8.786073435209072e-06, "loss": 0.003, "step": 587 }, { "epoch": 2.169839621553017, "grad_norm": 0.05668642744421959, "learning_rate": 8.715006043156315e-06, "loss": 0.0047, "step": 588 }, { "epoch": 2.1735317872389524, "grad_norm": 0.6010801196098328, "learning_rate": 8.644147083707354e-06, "loss": 0.0111, "step": 589 }, { "epoch": 2.1772239529248876, "grad_norm": 0.364096075296402, "learning_rate": 8.573497865619414e-06, "loss": 0.018, "step": 590 }, { "epoch": 2.180916118610823, "grad_norm": 0.03992763161659241, "learning_rate": 8.503059693775813e-06, "loss": 0.0023, "step": 591 }, { "epoch": 2.184608284296758, "grad_norm": 0.025853926315903664, "learning_rate": 8.432833869161893e-06, "loss": 0.0012, "step": 592 }, { "epoch": 2.188300449982693, "grad_norm": 0.02981944940984249, "learning_rate": 8.362821688840947e-06, "loss": 0.0015, "step": 593 }, { "epoch": 2.191992615668628, "grad_norm": 0.05791231989860535, "learning_rate": 8.293024445930287e-06, "loss": 0.0027, "step": 594 }, { "epoch": 2.1956847813545632, "grad_norm": 0.04732658341526985, "learning_rate": 8.223443429577343e-06, "loss": 0.0033, "step": 595 }, { "epoch": 2.1993769470404985, "grad_norm": 0.03762805834412575, "learning_rate": 8.154079924935892e-06, "loss": 0.0015, "step": 596 }, { "epoch": 2.2030691127264337, "grad_norm": 0.07153934240341187, "learning_rate": 8.084935213142269e-06, "loss": 0.0022, "step": 597 }, { "epoch": 2.206761278412369, "grad_norm": 0.04638079181313515, "learning_rate": 8.016010571291725e-06, "loss": 0.0018, "step": 598 }, { "epoch": 2.210453444098304, "grad_norm": 0.030643608421087265, "learning_rate": 7.947307272414874e-06, "loss": 0.0029, "step": 599 }, { "epoch": 2.214145609784239, "grad_norm": 0.02873793989419937, "learning_rate": 7.878826585454122e-06, "loss": 0.0021, "step": 600 }, { "epoch": 2.217837775470174, "grad_norm": 0.04478053003549576, "learning_rate": 7.810569775240257e-06, "loss": 0.003, "step": 601 }, { "epoch": 2.2215299411561094, "grad_norm": 0.046556588262319565, "learning_rate": 7.742538102469111e-06, "loss": 0.002, "step": 602 }, { "epoch": 2.2252221068420446, "grad_norm": 0.039592090994119644, "learning_rate": 7.674732823678228e-06, "loss": 0.0036, "step": 603 }, { "epoch": 2.22891427252798, "grad_norm": 0.031397104263305664, "learning_rate": 7.607155191223683e-06, "loss": 0.0013, "step": 604 }, { "epoch": 2.232606438213915, "grad_norm": 0.035691820085048676, "learning_rate": 7.539806453256973e-06, "loss": 0.0014, "step": 605 }, { "epoch": 2.23629860389985, "grad_norm": 0.0710548609495163, "learning_rate": 7.472687853701908e-06, "loss": 0.0078, "step": 606 }, { "epoch": 2.239990769585785, "grad_norm": 0.04906422272324562, "learning_rate": 7.405800632231672e-06, "loss": 0.0025, "step": 607 }, { "epoch": 2.2436829352717202, "grad_norm": 0.08130212128162384, "learning_rate": 7.339146024245947e-06, "loss": 0.006, "step": 608 }, { "epoch": 2.2473751009576555, "grad_norm": 0.0357891283929348, "learning_rate": 7.272725260848037e-06, "loss": 0.0018, "step": 609 }, { "epoch": 2.2510672666435907, "grad_norm": 0.05284830555319786, "learning_rate": 7.206539568822179e-06, "loss": 0.0017, "step": 610 }, { "epoch": 2.254759432329526, "grad_norm": 0.03607559949159622, "learning_rate": 7.140590170610857e-06, "loss": 0.0023, "step": 611 }, { "epoch": 2.258451598015461, "grad_norm": 0.043198052793741226, "learning_rate": 7.0748782842922545e-06, "loss": 0.0016, "step": 612 }, { "epoch": 2.262143763701396, "grad_norm": 0.07694031298160553, "learning_rate": 7.0094051235577155e-06, "loss": 0.0025, "step": 613 }, { "epoch": 2.265835929387331, "grad_norm": 0.040816958993673325, "learning_rate": 6.944171897689349e-06, "loss": 0.0043, "step": 614 }, { "epoch": 2.2695280950732664, "grad_norm": 0.02998993545770645, "learning_rate": 6.879179811537715e-06, "loss": 0.001, "step": 615 }, { "epoch": 2.2732202607592016, "grad_norm": 0.0377618744969368, "learning_rate": 6.814430065499526e-06, "loss": 0.0017, "step": 616 }, { "epoch": 2.276912426445137, "grad_norm": 0.05830111727118492, "learning_rate": 6.749923855495502e-06, "loss": 0.0038, "step": 617 }, { "epoch": 2.280604592131072, "grad_norm": 0.042703039944171906, "learning_rate": 6.685662372948298e-06, "loss": 0.004, "step": 618 }, { "epoch": 2.284296757817007, "grad_norm": 0.0343906469643116, "learning_rate": 6.62164680476046e-06, "loss": 0.0024, "step": 619 }, { "epoch": 2.287988923502942, "grad_norm": 0.030185390263795853, "learning_rate": 6.55787833329252e-06, "loss": 0.001, "step": 620 }, { "epoch": 2.2916810891888773, "grad_norm": 0.03131229057908058, "learning_rate": 6.4943581363411855e-06, "loss": 0.0015, "step": 621 }, { "epoch": 2.2953732548748125, "grad_norm": 0.03044010140001774, "learning_rate": 6.431087387117538e-06, "loss": 0.001, "step": 622 }, { "epoch": 2.2990654205607477, "grad_norm": 0.028354087844491005, "learning_rate": 6.368067254225387e-06, "loss": 0.0009, "step": 623 }, { "epoch": 2.302757586246683, "grad_norm": 0.06797152012586594, "learning_rate": 6.305298901639704e-06, "loss": 0.0081, "step": 624 }, { "epoch": 2.306449751932618, "grad_norm": 0.035257063806056976, "learning_rate": 6.242783488685091e-06, "loss": 0.0013, "step": 625 }, { "epoch": 2.3101419176185534, "grad_norm": 0.09349807351827621, "learning_rate": 6.1805221700143844e-06, "loss": 0.017, "step": 626 }, { "epoch": 2.313834083304488, "grad_norm": 0.07014710456132889, "learning_rate": 6.118516095587321e-06, "loss": 0.0109, "step": 627 }, { "epoch": 2.3175262489904234, "grad_norm": 0.044707559049129486, "learning_rate": 6.056766410649329e-06, "loss": 0.0013, "step": 628 }, { "epoch": 2.3212184146763586, "grad_norm": 0.1256485879421234, "learning_rate": 5.99527425571032e-06, "loss": 0.0066, "step": 629 }, { "epoch": 2.324910580362294, "grad_norm": 0.03149070963263512, "learning_rate": 5.934040766523668e-06, "loss": 0.0026, "step": 630 }, { "epoch": 2.328602746048229, "grad_norm": 0.029719380661845207, "learning_rate": 5.873067074065229e-06, "loss": 0.001, "step": 631 }, { "epoch": 2.3322949117341643, "grad_norm": 0.02905251644551754, "learning_rate": 5.8123543045124285e-06, "loss": 0.0015, "step": 632 }, { "epoch": 2.335987077420099, "grad_norm": 0.030503999441862106, "learning_rate": 5.751903579223468e-06, "loss": 0.001, "step": 633 }, { "epoch": 2.3396792431060343, "grad_norm": 0.039289508014917374, "learning_rate": 5.6917160147166525e-06, "loss": 0.0019, "step": 634 }, { "epoch": 2.3433714087919695, "grad_norm": 0.04832150787115097, "learning_rate": 5.6317927226496875e-06, "loss": 0.0016, "step": 635 }, { "epoch": 2.3470635744779047, "grad_norm": 0.03619923070073128, "learning_rate": 5.572134809799235e-06, "loss": 0.0023, "step": 636 }, { "epoch": 2.35075574016384, "grad_norm": 0.036724645644426346, "learning_rate": 5.512743378040428e-06, "loss": 0.001, "step": 637 }, { "epoch": 2.354447905849775, "grad_norm": 0.026730459183454514, "learning_rate": 5.453619524326495e-06, "loss": 0.0013, "step": 638 }, { "epoch": 2.3581400715357104, "grad_norm": 0.036537256091833115, "learning_rate": 5.39476434066855e-06, "loss": 0.0011, "step": 639 }, { "epoch": 2.361832237221645, "grad_norm": 0.0313183031976223, "learning_rate": 5.3361789141154085e-06, "loss": 0.0011, "step": 640 }, { "epoch": 2.3655244029075804, "grad_norm": 0.036056943237781525, "learning_rate": 5.277864326733484e-06, "loss": 0.0015, "step": 641 }, { "epoch": 2.3692165685935156, "grad_norm": 0.06026415154337883, "learning_rate": 5.219821655586821e-06, "loss": 0.0081, "step": 642 }, { "epoch": 2.372908734279451, "grad_norm": 0.04079107567667961, "learning_rate": 5.162051972717204e-06, "loss": 0.0028, "step": 643 }, { "epoch": 2.376600899965386, "grad_norm": 0.06928452104330063, "learning_rate": 5.104556345124363e-06, "loss": 0.0027, "step": 644 }, { "epoch": 2.376600899965386, "eval_loss": 0.008511913008987904, "eval_runtime": 89.9081, "eval_samples_per_second": 10.155, "eval_steps_per_second": 5.083, "step": 644 }, { "epoch": 2.3802930656513213, "grad_norm": 0.0545664019882679, "learning_rate": 5.04733583474625e-06, "loss": 0.0037, "step": 645 }, { "epoch": 2.383985231337256, "grad_norm": 0.033127326518297195, "learning_rate": 4.9903914984394195e-06, "loss": 0.0016, "step": 646 }, { "epoch": 2.3876773970231913, "grad_norm": 0.044179655611515045, "learning_rate": 4.933724387959549e-06, "loss": 0.0026, "step": 647 }, { "epoch": 2.3913695627091265, "grad_norm": 0.032938070595264435, "learning_rate": 4.877335549941962e-06, "loss": 0.001, "step": 648 }, { "epoch": 2.3950617283950617, "grad_norm": 0.02591692842543125, "learning_rate": 4.8212260258823216e-06, "loss": 0.001, "step": 649 }, { "epoch": 2.398753894080997, "grad_norm": 0.03672531247138977, "learning_rate": 4.765396852117417e-06, "loss": 0.0016, "step": 650 }, { "epoch": 2.402446059766932, "grad_norm": 0.0361509770154953, "learning_rate": 4.709849059805956e-06, "loss": 0.001, "step": 651 }, { "epoch": 2.4061382254528674, "grad_norm": 0.06867770105600357, "learning_rate": 4.654583674909598e-06, "loss": 0.007, "step": 652 }, { "epoch": 2.4098303911388026, "grad_norm": 0.06737197190523148, "learning_rate": 4.5996017181739675e-06, "loss": 0.0075, "step": 653 }, { "epoch": 2.4135225568247374, "grad_norm": 0.09367132931947708, "learning_rate": 4.5449042051097655e-06, "loss": 0.0175, "step": 654 }, { "epoch": 2.4172147225106726, "grad_norm": 0.02626173198223114, "learning_rate": 4.49049214597409e-06, "loss": 0.0009, "step": 655 }, { "epoch": 2.420906888196608, "grad_norm": 0.03810608759522438, "learning_rate": 4.436366545751727e-06, "loss": 0.0009, "step": 656 }, { "epoch": 2.424599053882543, "grad_norm": 0.06305629014968872, "learning_rate": 4.382528404136572e-06, "loss": 0.0017, "step": 657 }, { "epoch": 2.4282912195684783, "grad_norm": 0.030049487948417664, "learning_rate": 4.3289787155132236e-06, "loss": 0.0011, "step": 658 }, { "epoch": 2.431983385254413, "grad_norm": 0.12249593436717987, "learning_rate": 4.275718468938572e-06, "loss": 0.0012, "step": 659 }, { "epoch": 2.4356755509403483, "grad_norm": 0.05436545982956886, "learning_rate": 4.2227486481235335e-06, "loss": 0.0063, "step": 660 }, { "epoch": 2.4393677166262835, "grad_norm": 0.057109009474515915, "learning_rate": 4.17007023141492e-06, "loss": 0.0024, "step": 661 }, { "epoch": 2.4430598823122187, "grad_norm": 0.5980064868927002, "learning_rate": 4.117684191777316e-06, "loss": 0.0244, "step": 662 }, { "epoch": 2.446752047998154, "grad_norm": 0.0350472554564476, "learning_rate": 4.065591496775139e-06, "loss": 0.0016, "step": 663 }, { "epoch": 2.450444213684089, "grad_norm": 0.037017323076725006, "learning_rate": 4.013793108554776e-06, "loss": 0.0023, "step": 664 }, { "epoch": 2.4541363793700244, "grad_norm": 0.03902212157845497, "learning_rate": 3.962289983826784e-06, "loss": 0.0012, "step": 665 }, { "epoch": 2.4578285450559596, "grad_norm": 0.06350851058959961, "learning_rate": 3.9110830738482455e-06, "loss": 0.0037, "step": 666 }, { "epoch": 2.4615207107418944, "grad_norm": 0.04751058667898178, "learning_rate": 3.860173324405178e-06, "loss": 0.0025, "step": 667 }, { "epoch": 2.4652128764278296, "grad_norm": 0.05800960212945938, "learning_rate": 3.809561675795097e-06, "loss": 0.0093, "step": 668 }, { "epoch": 2.468905042113765, "grad_norm": 0.04538734257221222, "learning_rate": 3.759249062809609e-06, "loss": 0.005, "step": 669 }, { "epoch": 2.4725972077997, "grad_norm": 0.05560098960995674, "learning_rate": 3.7092364147171656e-06, "loss": 0.0023, "step": 670 }, { "epoch": 2.4762893734856353, "grad_norm": 0.04338544234633446, "learning_rate": 3.659524655245916e-06, "loss": 0.0019, "step": 671 }, { "epoch": 2.4799815391715705, "grad_norm": 0.023049261420965195, "learning_rate": 3.610114702566614e-06, "loss": 0.0006, "step": 672 }, { "epoch": 2.4836737048575053, "grad_norm": 0.0627702996134758, "learning_rate": 3.561007469275677e-06, "loss": 0.0024, "step": 673 }, { "epoch": 2.4873658705434405, "grad_norm": 0.049440622329711914, "learning_rate": 3.512203862378338e-06, "loss": 0.002, "step": 674 }, { "epoch": 2.4910580362293757, "grad_norm": 0.06725075840950012, "learning_rate": 3.463704783271875e-06, "loss": 0.006, "step": 675 }, { "epoch": 2.494750201915311, "grad_norm": 0.039713066071271896, "learning_rate": 3.4155111277289675e-06, "loss": 0.0013, "step": 676 }, { "epoch": 2.498442367601246, "grad_norm": 0.03898831084370613, "learning_rate": 3.3676237858811713e-06, "loss": 0.0018, "step": 677 }, { "epoch": 2.5021345332871814, "grad_norm": 0.037710677832365036, "learning_rate": 3.320043642202444e-06, "loss": 0.0011, "step": 678 }, { "epoch": 2.5058266989731166, "grad_norm": 0.03683096170425415, "learning_rate": 3.2727715754928323e-06, "loss": 0.0012, "step": 679 }, { "epoch": 2.509518864659052, "grad_norm": 0.05158952623605728, "learning_rate": 3.2258084588622453e-06, "loss": 0.002, "step": 680 }, { "epoch": 2.5132110303449866, "grad_norm": 0.058341093361377716, "learning_rate": 3.1791551597143046e-06, "loss": 0.0043, "step": 681 }, { "epoch": 2.516903196030922, "grad_norm": 0.029107490554451942, "learning_rate": 3.132812539730341e-06, "loss": 0.0018, "step": 682 }, { "epoch": 2.520595361716857, "grad_norm": 0.07059961557388306, "learning_rate": 3.0867814548534713e-06, "loss": 0.0073, "step": 683 }, { "epoch": 2.5242875274027923, "grad_norm": 0.053354717791080475, "learning_rate": 3.0410627552728057e-06, "loss": 0.0078, "step": 684 }, { "epoch": 2.5279796930887275, "grad_norm": 0.030938267707824707, "learning_rate": 2.9956572854077205e-06, "loss": 0.0012, "step": 685 }, { "epoch": 2.5316718587746623, "grad_norm": 0.06863964349031448, "learning_rate": 2.9505658838922715e-06, "loss": 0.0028, "step": 686 }, { "epoch": 2.5353640244605975, "grad_norm": 0.04640622437000275, "learning_rate": 2.9057893835597205e-06, "loss": 0.0012, "step": 687 }, { "epoch": 2.5390561901465327, "grad_norm": 0.0366484560072422, "learning_rate": 2.8613286114271275e-06, "loss": 0.0019, "step": 688 }, { "epoch": 2.542748355832468, "grad_norm": 0.04071936756372452, "learning_rate": 2.817184388680083e-06, "loss": 0.0015, "step": 689 }, { "epoch": 2.546440521518403, "grad_norm": 0.06161284074187279, "learning_rate": 2.773357530657561e-06, "loss": 0.0058, "step": 690 }, { "epoch": 2.5501326872043384, "grad_norm": 0.11831287294626236, "learning_rate": 2.7298488468368313e-06, "loss": 0.0204, "step": 691 }, { "epoch": 2.5538248528902736, "grad_norm": 0.03312596306204796, "learning_rate": 2.6866591408185172e-06, "loss": 0.0027, "step": 692 }, { "epoch": 2.557517018576209, "grad_norm": 0.07281254231929779, "learning_rate": 2.6437892103117734e-06, "loss": 0.0122, "step": 693 }, { "epoch": 2.5612091842621436, "grad_norm": 0.024078702554106712, "learning_rate": 2.6012398471195257e-06, "loss": 0.0007, "step": 694 }, { "epoch": 2.564901349948079, "grad_norm": 0.028001535683870316, "learning_rate": 2.559011837123846e-06, "loss": 0.0013, "step": 695 }, { "epoch": 2.568593515634014, "grad_norm": 0.037758968770504, "learning_rate": 2.5171059602714754e-06, "loss": 0.0026, "step": 696 }, { "epoch": 2.5722856813199493, "grad_norm": 0.02078225091099739, "learning_rate": 2.475522990559365e-06, "loss": 0.0006, "step": 697 }, { "epoch": 2.5759778470058845, "grad_norm": 0.049048252403736115, "learning_rate": 2.434263696020418e-06, "loss": 0.0061, "step": 698 }, { "epoch": 2.5796700126918193, "grad_norm": 0.060004621744155884, "learning_rate": 2.393328838709288e-06, "loss": 0.0016, "step": 699 }, { "epoch": 2.5833621783777545, "grad_norm": 0.033074602484703064, "learning_rate": 2.352719174688316e-06, "loss": 0.0009, "step": 700 }, { "epoch": 2.5870543440636897, "grad_norm": 0.031975001096725464, "learning_rate": 2.312435454013553e-06, "loss": 0.0014, "step": 701 }, { "epoch": 2.590746509749625, "grad_norm": 0.03675561025738716, "learning_rate": 2.272478420720907e-06, "loss": 0.002, "step": 702 }, { "epoch": 2.59443867543556, "grad_norm": 0.05581916868686676, "learning_rate": 2.2328488128124203e-06, "loss": 0.0041, "step": 703 }, { "epoch": 2.5981308411214954, "grad_norm": 0.0367700457572937, "learning_rate": 2.1935473622426096e-06, "loss": 0.0024, "step": 704 }, { "epoch": 2.6018230068074306, "grad_norm": 0.08924701809883118, "learning_rate": 2.1545747949049665e-06, "loss": 0.0091, "step": 705 }, { "epoch": 2.605515172493366, "grad_norm": 0.04543719068169594, "learning_rate": 2.1159318306185517e-06, "loss": 0.0021, "step": 706 }, { "epoch": 2.609207338179301, "grad_norm": 0.05079701170325279, "learning_rate": 2.0776191831146853e-06, "loss": 0.0028, "step": 707 }, { "epoch": 2.612899503865236, "grad_norm": 0.03997986763715744, "learning_rate": 2.0396375600237684e-06, "loss": 0.0032, "step": 708 }, { "epoch": 2.616591669551171, "grad_norm": 0.1283845454454422, "learning_rate": 2.001987662862237e-06, "loss": 0.006, "step": 709 }, { "epoch": 2.6202838352371063, "grad_norm": 0.04783840477466583, "learning_rate": 1.964670187019564e-06, "loss": 0.0052, "step": 710 }, { "epoch": 2.6239760009230415, "grad_norm": 0.04014943912625313, "learning_rate": 1.9276858217454465e-06, "loss": 0.0017, "step": 711 }, { "epoch": 2.6276681666089767, "grad_norm": 0.11507362872362137, "learning_rate": 1.8910352501370677e-06, "loss": 0.009, "step": 712 }, { "epoch": 2.6313603322949115, "grad_norm": 0.03147709369659424, "learning_rate": 1.854719149126476e-06, "loss": 0.0017, "step": 713 }, { "epoch": 2.6350524979808467, "grad_norm": 0.05146815627813339, "learning_rate": 1.8187381894680812e-06, "loss": 0.0028, "step": 714 }, { "epoch": 2.638744663666782, "grad_norm": 0.03048107400536537, "learning_rate": 1.7830930357262689e-06, "loss": 0.0022, "step": 715 }, { "epoch": 2.642436829352717, "grad_norm": 0.03846743330359459, "learning_rate": 1.7477843462631371e-06, "loss": 0.0034, "step": 716 }, { "epoch": 2.6461289950386524, "grad_norm": 0.04522532969713211, "learning_rate": 1.7128127732263134e-06, "loss": 0.0015, "step": 717 }, { "epoch": 2.6498211607245876, "grad_norm": 0.023895738646388054, "learning_rate": 1.6781789625369204e-06, "loss": 0.001, "step": 718 }, { "epoch": 2.653513326410523, "grad_norm": 0.028686054050922394, "learning_rate": 1.6438835538776631e-06, "loss": 0.0015, "step": 719 }, { "epoch": 2.657205492096458, "grad_norm": 0.056154992431402206, "learning_rate": 1.6099271806809814e-06, "loss": 0.0013, "step": 720 }, { "epoch": 2.660897657782393, "grad_norm": 0.06218738853931427, "learning_rate": 1.5763104701173726e-06, "loss": 0.0017, "step": 721 }, { "epoch": 2.664589823468328, "grad_norm": 0.05347761884331703, "learning_rate": 1.5430340430838086e-06, "loss": 0.0018, "step": 722 }, { "epoch": 2.6682819891542633, "grad_norm": 0.043504808098077774, "learning_rate": 1.5100985141922486e-06, "loss": 0.006, "step": 723 }, { "epoch": 2.6719741548401985, "grad_norm": 0.04124554246664047, "learning_rate": 1.4775044917583104e-06, "loss": 0.0015, "step": 724 }, { "epoch": 2.6756663205261337, "grad_norm": 0.06463329493999481, "learning_rate": 1.4452525777900306e-06, "loss": 0.0009, "step": 725 }, { "epoch": 2.6793584862120685, "grad_norm": 0.09179668873548508, "learning_rate": 1.4133433679767183e-06, "loss": 0.0138, "step": 726 }, { "epoch": 2.6830506518980037, "grad_norm": 0.0477369949221611, "learning_rate": 1.3817774516779947e-06, "loss": 0.0019, "step": 727 }, { "epoch": 2.686742817583939, "grad_norm": 0.041714612394571304, "learning_rate": 1.3505554119128861e-06, "loss": 0.0008, "step": 728 }, { "epoch": 2.690434983269874, "grad_norm": 0.02698429860174656, "learning_rate": 1.3196778253490417e-06, "loss": 0.0008, "step": 729 }, { "epoch": 2.6941271489558094, "grad_norm": 0.03678148612380028, "learning_rate": 1.2891452622921107e-06, "loss": 0.002, "step": 730 }, { "epoch": 2.6978193146417446, "grad_norm": 0.024919135496020317, "learning_rate": 1.258958286675196e-06, "loss": 0.0009, "step": 731 }, { "epoch": 2.70151148032768, "grad_norm": 0.05177600681781769, "learning_rate": 1.2291174560484276e-06, "loss": 0.0018, "step": 732 }, { "epoch": 2.705203646013615, "grad_norm": 0.0748058333992958, "learning_rate": 1.199623321568688e-06, "loss": 0.0034, "step": 733 }, { "epoch": 2.7088958116995503, "grad_norm": 0.03387774899601936, "learning_rate": 1.1704764279894154e-06, "loss": 0.0056, "step": 734 }, { "epoch": 2.712587977385485, "grad_norm": 0.08370117098093033, "learning_rate": 1.1416773136505376e-06, "loss": 0.0054, "step": 735 }, { "epoch": 2.7162801430714203, "grad_norm": 0.06413291394710541, "learning_rate": 1.1132265104685568e-06, "loss": 0.0041, "step": 736 }, { "epoch": 2.7162801430714203, "eval_loss": 0.008520028553903103, "eval_runtime": 89.8859, "eval_samples_per_second": 10.157, "eval_steps_per_second": 5.084, "step": 736 }, { "epoch": 2.7199723087573555, "grad_norm": 0.14114509522914886, "learning_rate": 1.0851245439266856e-06, "loss": 0.0028, "step": 737 }, { "epoch": 2.7236644744432907, "grad_norm": 0.11610425263643265, "learning_rate": 1.057371933065181e-06, "loss": 0.0151, "step": 738 }, { "epoch": 2.727356640129226, "grad_norm": 0.039616964757442474, "learning_rate": 1.0299691904717179e-06, "loss": 0.0018, "step": 739 }, { "epoch": 2.7310488058151607, "grad_norm": 0.043560516089200974, "learning_rate": 1.0029168222719622e-06, "loss": 0.0022, "step": 740 }, { "epoch": 2.734740971501096, "grad_norm": 0.051236286759376526, "learning_rate": 9.762153281201959e-07, "loss": 0.0036, "step": 741 }, { "epoch": 2.738433137187031, "grad_norm": 0.0456005297601223, "learning_rate": 9.498652011900899e-07, "loss": 0.0023, "step": 742 }, { "epoch": 2.7421253028729664, "grad_norm": 0.04226566106081009, "learning_rate": 9.238669281656043e-07, "loss": 0.0045, "step": 743 }, { "epoch": 2.7458174685589016, "grad_norm": 0.04904725402593613, "learning_rate": 8.982209892320104e-07, "loss": 0.0034, "step": 744 }, { "epoch": 2.749509634244837, "grad_norm": 0.043459225445985794, "learning_rate": 8.729278580669765e-07, "loss": 0.0024, "step": 745 }, { "epoch": 2.753201799930772, "grad_norm": 0.031117543578147888, "learning_rate": 8.479880018318831e-07, "loss": 0.0024, "step": 746 }, { "epoch": 2.7568939656167073, "grad_norm": 0.05448920652270317, "learning_rate": 8.234018811631372e-07, "loss": 0.0022, "step": 747 }, { "epoch": 2.760586131302642, "grad_norm": 0.05770432949066162, "learning_rate": 7.991699501637007e-07, "loss": 0.0059, "step": 748 }, { "epoch": 2.7642782969885773, "grad_norm": 0.04619462043046951, "learning_rate": 7.752926563946905e-07, "loss": 0.0054, "step": 749 }, { "epoch": 2.7679704626745125, "grad_norm": 0.029614942148327827, "learning_rate": 7.517704408671078e-07, "loss": 0.0009, "step": 750 }, { "epoch": 2.7716626283604477, "grad_norm": 0.02850547805428505, "learning_rate": 7.286037380336974e-07, "loss": 0.0008, "step": 751 }, { "epoch": 2.775354794046383, "grad_norm": 0.02556842938065529, "learning_rate": 7.057929757809323e-07, "loss": 0.001, "step": 752 }, { "epoch": 2.7790469597323177, "grad_norm": 0.06311103701591492, "learning_rate": 6.833385754210931e-07, "loss": 0.0045, "step": 753 }, { "epoch": 2.782739125418253, "grad_norm": 0.05822919309139252, "learning_rate": 6.612409516844987e-07, "loss": 0.0038, "step": 754 }, { "epoch": 2.786431291104188, "grad_norm": 0.03946372866630554, "learning_rate": 6.39500512711837e-07, "loss": 0.0014, "step": 755 }, { "epoch": 2.7901234567901234, "grad_norm": 0.02774471417069435, "learning_rate": 6.181176600466443e-07, "loss": 0.0009, "step": 756 }, { "epoch": 2.7938156224760586, "grad_norm": 0.05857662111520767, "learning_rate": 5.970927886278644e-07, "loss": 0.0032, "step": 757 }, { "epoch": 2.797507788161994, "grad_norm": 0.03707636892795563, "learning_rate": 5.764262867825766e-07, "loss": 0.0015, "step": 758 }, { "epoch": 2.801199953847929, "grad_norm": 0.04190952703356743, "learning_rate": 5.561185362188059e-07, "loss": 0.0012, "step": 759 }, { "epoch": 2.8048921195338643, "grad_norm": 0.05025167390704155, "learning_rate": 5.361699120184871e-07, "loss": 0.002, "step": 760 }, { "epoch": 2.808584285219799, "grad_norm": 0.06911665946245193, "learning_rate": 5.165807826305224e-07, "loss": 0.0029, "step": 761 }, { "epoch": 2.8122764509057343, "grad_norm": 0.05155842751264572, "learning_rate": 4.973515098639903e-07, "loss": 0.0025, "step": 762 }, { "epoch": 2.8159686165916695, "grad_norm": 0.042222995311021805, "learning_rate": 4.784824488814588e-07, "loss": 0.0021, "step": 763 }, { "epoch": 2.8196607822776047, "grad_norm": 0.03629340976476669, "learning_rate": 4.5997394819241146e-07, "loss": 0.0024, "step": 764 }, { "epoch": 2.82335294796354, "grad_norm": 0.039383579045534134, "learning_rate": 4.4182634964684113e-07, "loss": 0.0025, "step": 765 }, { "epoch": 2.8270451136494748, "grad_norm": 0.0424778051674366, "learning_rate": 4.2403998842890147e-07, "loss": 0.0016, "step": 766 }, { "epoch": 2.83073727933541, "grad_norm": 0.050856415182352066, "learning_rate": 4.066151930507367e-07, "loss": 0.0016, "step": 767 }, { "epoch": 2.834429445021345, "grad_norm": 0.060999441891908646, "learning_rate": 3.8955228534641733e-07, "loss": 0.0047, "step": 768 }, { "epoch": 2.8381216107072804, "grad_norm": 0.07913260906934738, "learning_rate": 3.7285158046597826e-07, "loss": 0.0029, "step": 769 }, { "epoch": 2.8418137763932156, "grad_norm": 0.03891456499695778, "learning_rate": 3.5651338686961247e-07, "loss": 0.0027, "step": 770 }, { "epoch": 2.845505942079151, "grad_norm": 0.04553454741835594, "learning_rate": 3.4053800632196434e-07, "loss": 0.0017, "step": 771 }, { "epoch": 2.849198107765086, "grad_norm": 0.02823561057448387, "learning_rate": 3.249257338865719e-07, "loss": 0.0031, "step": 772 }, { "epoch": 2.8528902734510213, "grad_norm": 0.02780543640255928, "learning_rate": 3.096768579203935e-07, "loss": 0.001, "step": 773 }, { "epoch": 2.8565824391369565, "grad_norm": 0.030648574233055115, "learning_rate": 2.947916600685008e-07, "loss": 0.0011, "step": 774 }, { "epoch": 2.8602746048228913, "grad_norm": 0.04621976613998413, "learning_rate": 2.802704152588742e-07, "loss": 0.0031, "step": 775 }, { "epoch": 2.8639667705088265, "grad_norm": 0.052076127380132675, "learning_rate": 2.661133916973113e-07, "loss": 0.0125, "step": 776 }, { "epoch": 2.8676589361947618, "grad_norm": 0.028537949547171593, "learning_rate": 2.523208508624908e-07, "loss": 0.0012, "step": 777 }, { "epoch": 2.871351101880697, "grad_norm": 0.04475348815321922, "learning_rate": 2.3889304750113193e-07, "loss": 0.0032, "step": 778 }, { "epoch": 2.875043267566632, "grad_norm": 0.03786703944206238, "learning_rate": 2.2583022962329614e-07, "loss": 0.0016, "step": 779 }, { "epoch": 2.878735433252567, "grad_norm": 0.03621987998485565, "learning_rate": 2.1313263849779498e-07, "loss": 0.0015, "step": 780 }, { "epoch": 2.882427598938502, "grad_norm": 0.03176356479525566, "learning_rate": 2.0080050864775602e-07, "loss": 0.0033, "step": 781 }, { "epoch": 2.8861197646244374, "grad_norm": 0.058462027460336685, "learning_rate": 1.888340678462619e-07, "loss": 0.0029, "step": 782 }, { "epoch": 2.8898119303103726, "grad_norm": 0.09863194078207016, "learning_rate": 1.7723353711216474e-07, "loss": 0.0059, "step": 783 }, { "epoch": 2.893504095996308, "grad_norm": 0.04548930376768112, "learning_rate": 1.659991307060027e-07, "loss": 0.0039, "step": 784 }, { "epoch": 2.897196261682243, "grad_norm": 0.059715621173381805, "learning_rate": 1.5513105612602996e-07, "loss": 0.0051, "step": 785 }, { "epoch": 2.9008884273681783, "grad_norm": 0.03995850309729576, "learning_rate": 1.446295141043974e-07, "loss": 0.0026, "step": 786 }, { "epoch": 2.9045805930541135, "grad_norm": 0.04538688436150551, "learning_rate": 1.3449469860343123e-07, "loss": 0.002, "step": 787 }, { "epoch": 2.9082727587400483, "grad_norm": 0.03328185901045799, "learning_rate": 1.2472679681207355e-07, "loss": 0.0015, "step": 788 }, { "epoch": 2.9119649244259835, "grad_norm": 0.0314176119863987, "learning_rate": 1.1532598914239635e-07, "loss": 0.0012, "step": 789 }, { "epoch": 2.9156570901119188, "grad_norm": 0.038788117468357086, "learning_rate": 1.0629244922628845e-07, "loss": 0.0014, "step": 790 }, { "epoch": 2.919349255797854, "grad_norm": 0.07843354344367981, "learning_rate": 9.762634391224713e-08, "loss": 0.0105, "step": 791 }, { "epoch": 2.923041421483789, "grad_norm": 0.1690308153629303, "learning_rate": 8.932783326228711e-08, "loss": 0.0026, "step": 792 }, { "epoch": 2.926733587169724, "grad_norm": 0.04448361694812775, "learning_rate": 8.139707054899192e-08, "loss": 0.0024, "step": 793 }, { "epoch": 2.930425752855659, "grad_norm": 0.03807828575372696, "learning_rate": 7.383420225268278e-08, "loss": 0.0011, "step": 794 }, { "epoch": 2.9341179185415944, "grad_norm": 0.04406026750802994, "learning_rate": 6.663936805870963e-08, "loss": 0.0031, "step": 795 }, { "epoch": 2.9378100842275297, "grad_norm": 0.07986247539520264, "learning_rate": 5.981270085487101e-08, "loss": 0.0026, "step": 796 }, { "epoch": 2.941502249913465, "grad_norm": 0.03615804389119148, "learning_rate": 5.335432672896712e-08, "loss": 0.0022, "step": 797 }, { "epoch": 2.9451944155994, "grad_norm": 0.047268688678741455, "learning_rate": 4.7264364966457257e-08, "loss": 0.0031, "step": 798 }, { "epoch": 2.9488865812853353, "grad_norm": 0.023875901475548744, "learning_rate": 4.154292804827042e-08, "loss": 0.0009, "step": 799 }, { "epoch": 2.9525787469712705, "grad_norm": 0.047933872789144516, "learning_rate": 3.61901216487226e-08, "loss": 0.001, "step": 800 }, { "epoch": 2.9562709126572058, "grad_norm": 0.034800831228494644, "learning_rate": 3.120604463356047e-08, "loss": 0.0013, "step": 801 }, { "epoch": 2.9599630783431405, "grad_norm": 0.037260983139276505, "learning_rate": 2.659078905814072e-08, "loss": 0.0012, "step": 802 }, { "epoch": 2.9636552440290758, "grad_norm": 0.052072469145059586, "learning_rate": 2.234444016572912e-08, "loss": 0.0021, "step": 803 }, { "epoch": 2.967347409715011, "grad_norm": 0.056816283613443375, "learning_rate": 1.8467076385926263e-08, "loss": 0.0037, "step": 804 }, { "epoch": 2.971039575400946, "grad_norm": 0.05605953931808472, "learning_rate": 1.4958769333217604e-08, "loss": 0.0047, "step": 805 }, { "epoch": 2.9747317410868814, "grad_norm": 0.0329916849732399, "learning_rate": 1.181958380564785e-08, "loss": 0.0015, "step": 806 }, { "epoch": 2.978423906772816, "grad_norm": 0.056854039430618286, "learning_rate": 9.049577783633023e-09, "loss": 0.0011, "step": 807 }, { "epoch": 2.9821160724587514, "grad_norm": 0.029551850631833076, "learning_rate": 6.648802428879109e-09, "loss": 0.001, "step": 808 }, { "epoch": 2.9858082381446867, "grad_norm": 0.05778481811285019, "learning_rate": 4.6173020834450186e-09, "loss": 0.004, "step": 809 }, { "epoch": 2.989500403830622, "grad_norm": 0.07338174432516098, "learning_rate": 2.9551142689210334e-09, "loss": 0.0092, "step": 810 }, { "epoch": 2.993192569516557, "grad_norm": 0.04888184741139412, "learning_rate": 1.6622696857315768e-09, "loss": 0.0059, "step": 811 }, { "epoch": 2.9968847352024923, "grad_norm": 0.06601458787918091, "learning_rate": 7.387922125778879e-10, "loss": 0.0035, "step": 812 }, { "epoch": 3.0, "grad_norm": 0.05915842577815056, "learning_rate": 1.8469890598726837e-10, "loss": 0.0014, "step": 813 } ], "logging_steps": 1, "max_steps": 813, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 92, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.396172477162324e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }