{ "best_global_step": 250, "best_metric": 1.4088929891586304, "best_model_checkpoint": "saves/qwen3-1.7B/medical-o1-sft-full/checkpoint-250", "epoch": 3.0, "eval_steps": 50, "global_step": 441, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006837606837606838, "grad_norm": 83.15293884277344, "learning_rate": 0.0, "loss": 2.8199405670166016, "step": 1 }, { "epoch": 0.013675213675213675, "grad_norm": 81.79350280761719, "learning_rate": 8.695652173913044e-07, "loss": 2.7888758182525635, "step": 2 }, { "epoch": 0.020512820512820513, "grad_norm": 83.25151824951172, "learning_rate": 1.7391304347826088e-06, "loss": 2.820769786834717, "step": 3 }, { "epoch": 0.02735042735042735, "grad_norm": 75.52108001708984, "learning_rate": 2.6086956521739132e-06, "loss": 2.734041690826416, "step": 4 }, { "epoch": 0.03418803418803419, "grad_norm": 72.11664581298828, "learning_rate": 3.4782608695652175e-06, "loss": 2.7135212421417236, "step": 5 }, { "epoch": 0.041025641025641026, "grad_norm": 55.534324645996094, "learning_rate": 4.347826086956522e-06, "loss": 2.4443650245666504, "step": 6 }, { "epoch": 0.04786324786324787, "grad_norm": 48.14010238647461, "learning_rate": 5.2173913043478265e-06, "loss": 2.3162710666656494, "step": 7 }, { "epoch": 0.0547008547008547, "grad_norm": 20.861207962036133, "learning_rate": 6.086956521739132e-06, "loss": 2.0038950443267822, "step": 8 }, { "epoch": 0.06153846153846154, "grad_norm": 15.49008846282959, "learning_rate": 6.956521739130435e-06, "loss": 1.8993940353393555, "step": 9 }, { "epoch": 0.06837606837606838, "grad_norm": 5.190984725952148, "learning_rate": 7.82608695652174e-06, "loss": 1.7324286699295044, "step": 10 }, { "epoch": 0.07521367521367521, "grad_norm": 4.630637168884277, "learning_rate": 8.695652173913044e-06, "loss": 1.654750943183899, "step": 11 }, { "epoch": 0.08205128205128205, "grad_norm": 3.784055233001709, "learning_rate": 9.565217391304349e-06, "loss": 1.7394911050796509, "step": 12 }, { "epoch": 0.08888888888888889, "grad_norm": 3.4299561977386475, "learning_rate": 1.0434782608695653e-05, "loss": 1.6633565425872803, "step": 13 }, { "epoch": 0.09572649572649573, "grad_norm": 4.693484306335449, "learning_rate": 1.1304347826086957e-05, "loss": 1.670560359954834, "step": 14 }, { "epoch": 0.10256410256410256, "grad_norm": 5.14279317855835, "learning_rate": 1.2173913043478263e-05, "loss": 1.647332787513733, "step": 15 }, { "epoch": 0.1094017094017094, "grad_norm": 3.8385608196258545, "learning_rate": 1.3043478260869566e-05, "loss": 1.6399732828140259, "step": 16 }, { "epoch": 0.11623931623931624, "grad_norm": 2.6695456504821777, "learning_rate": 1.391304347826087e-05, "loss": 1.5681482553482056, "step": 17 }, { "epoch": 0.12307692307692308, "grad_norm": 2.117490291595459, "learning_rate": 1.4782608695652174e-05, "loss": 1.6053783893585205, "step": 18 }, { "epoch": 0.12991452991452992, "grad_norm": 1.9541882276535034, "learning_rate": 1.565217391304348e-05, "loss": 1.5954205989837646, "step": 19 }, { "epoch": 0.13675213675213677, "grad_norm": 2.011003255844116, "learning_rate": 1.6521739130434785e-05, "loss": 1.5820363759994507, "step": 20 }, { "epoch": 0.14358974358974358, "grad_norm": 1.9789162874221802, "learning_rate": 1.739130434782609e-05, "loss": 1.532997727394104, "step": 21 }, { "epoch": 0.15042735042735042, "grad_norm": 1.8961035013198853, "learning_rate": 1.8260869565217393e-05, "loss": 1.5475587844848633, "step": 22 }, { "epoch": 0.15726495726495726, "grad_norm": 1.5811997652053833, "learning_rate": 1.9130434782608697e-05, "loss": 1.580260992050171, "step": 23 }, { "epoch": 0.1641025641025641, "grad_norm": 1.4591213464736938, "learning_rate": 2e-05, "loss": 1.5463660955429077, "step": 24 }, { "epoch": 0.17094017094017094, "grad_norm": 1.4459729194641113, "learning_rate": 1.999971756719333e-05, "loss": 1.5187675952911377, "step": 25 }, { "epoch": 0.17777777777777778, "grad_norm": 1.4411983489990234, "learning_rate": 1.9998870284726968e-05, "loss": 1.529025673866272, "step": 26 }, { "epoch": 0.18461538461538463, "grad_norm": 1.3215960264205933, "learning_rate": 1.9997458200460994e-05, "loss": 1.513730525970459, "step": 27 }, { "epoch": 0.19145299145299147, "grad_norm": 1.324648141860962, "learning_rate": 1.999548139415919e-05, "loss": 1.5576432943344116, "step": 28 }, { "epoch": 0.19829059829059828, "grad_norm": 1.1139763593673706, "learning_rate": 1.999293997748454e-05, "loss": 1.5223976373672485, "step": 29 }, { "epoch": 0.20512820512820512, "grad_norm": 1.175620675086975, "learning_rate": 1.9989834093992945e-05, "loss": 1.529496431350708, "step": 30 }, { "epoch": 0.21196581196581196, "grad_norm": 1.2628631591796875, "learning_rate": 1.9986163919125077e-05, "loss": 1.5556331872940063, "step": 31 }, { "epoch": 0.2188034188034188, "grad_norm": 1.121780276298523, "learning_rate": 1.9981929660196492e-05, "loss": 1.522382140159607, "step": 32 }, { "epoch": 0.22564102564102564, "grad_norm": 1.057112693786621, "learning_rate": 1.997713155638592e-05, "loss": 1.5269778966903687, "step": 33 }, { "epoch": 0.23247863247863249, "grad_norm": 1.1212079524993896, "learning_rate": 1.9971769878721747e-05, "loss": 1.5179802179336548, "step": 34 }, { "epoch": 0.23931623931623933, "grad_norm": 1.1053107976913452, "learning_rate": 1.99658449300667e-05, "loss": 1.4600404500961304, "step": 35 }, { "epoch": 0.24615384615384617, "grad_norm": 1.0344611406326294, "learning_rate": 1.9959357045100764e-05, "loss": 1.4895355701446533, "step": 36 }, { "epoch": 0.252991452991453, "grad_norm": 1.0998711585998535, "learning_rate": 1.9952306590302247e-05, "loss": 1.498748779296875, "step": 37 }, { "epoch": 0.25982905982905985, "grad_norm": 1.0810974836349487, "learning_rate": 1.9944693963927092e-05, "loss": 1.4847540855407715, "step": 38 }, { "epoch": 0.26666666666666666, "grad_norm": 1.0349794626235962, "learning_rate": 1.9936519595986395e-05, "loss": 1.4850821495056152, "step": 39 }, { "epoch": 0.27350427350427353, "grad_norm": 0.9509456157684326, "learning_rate": 1.9927783948222084e-05, "loss": 1.4879685640335083, "step": 40 }, { "epoch": 0.28034188034188035, "grad_norm": 0.9873176217079163, "learning_rate": 1.9918487514080867e-05, "loss": 1.5055975914001465, "step": 41 }, { "epoch": 0.28717948717948716, "grad_norm": 0.9554620385169983, "learning_rate": 1.990863081868634e-05, "loss": 1.4576541185379028, "step": 42 }, { "epoch": 0.294017094017094, "grad_norm": 0.915795087814331, "learning_rate": 1.989821441880933e-05, "loss": 1.469474196434021, "step": 43 }, { "epoch": 0.30085470085470084, "grad_norm": 1.006457805633545, "learning_rate": 1.988723890283645e-05, "loss": 1.5073033571243286, "step": 44 }, { "epoch": 0.3076923076923077, "grad_norm": 0.9496122598648071, "learning_rate": 1.9875704890736853e-05, "loss": 1.496271014213562, "step": 45 }, { "epoch": 0.3145299145299145, "grad_norm": 0.9319558143615723, "learning_rate": 1.9863613034027224e-05, "loss": 1.4825000762939453, "step": 46 }, { "epoch": 0.3213675213675214, "grad_norm": 0.9389411807060242, "learning_rate": 1.985096401573497e-05, "loss": 1.4443243741989136, "step": 47 }, { "epoch": 0.3282051282051282, "grad_norm": 0.9735950827598572, "learning_rate": 1.9837758550359637e-05, "loss": 1.4762128591537476, "step": 48 }, { "epoch": 0.335042735042735, "grad_norm": 0.9494331479072571, "learning_rate": 1.982399738383255e-05, "loss": 1.5045385360717773, "step": 49 }, { "epoch": 0.3418803418803419, "grad_norm": 0.9520753026008606, "learning_rate": 1.9809681293474693e-05, "loss": 1.496164321899414, "step": 50 }, { "epoch": 0.3418803418803419, "eval_loss": 1.4685521125793457, "eval_runtime": 14.1604, "eval_samples_per_second": 69.631, "eval_steps_per_second": 8.757, "step": 50 }, { "epoch": 0.3487179487179487, "grad_norm": 0.9688102602958679, "learning_rate": 1.979481108795278e-05, "loss": 1.4734501838684082, "step": 51 }, { "epoch": 0.35555555555555557, "grad_norm": 0.9477071166038513, "learning_rate": 1.9779387607233587e-05, "loss": 1.4600017070770264, "step": 52 }, { "epoch": 0.3623931623931624, "grad_norm": 0.9507799744606018, "learning_rate": 1.9763411722536503e-05, "loss": 1.455001711845398, "step": 53 }, { "epoch": 0.36923076923076925, "grad_norm": 0.9292111992835999, "learning_rate": 1.9746884336284316e-05, "loss": 1.4742114543914795, "step": 54 }, { "epoch": 0.37606837606837606, "grad_norm": 0.9916467666625977, "learning_rate": 1.972980638205225e-05, "loss": 1.5147836208343506, "step": 55 }, { "epoch": 0.38290598290598293, "grad_norm": 0.9744175672531128, "learning_rate": 1.971217882451521e-05, "loss": 1.4713977575302124, "step": 56 }, { "epoch": 0.38974358974358975, "grad_norm": 1.0033540725708008, "learning_rate": 1.9694002659393306e-05, "loss": 1.4538943767547607, "step": 57 }, { "epoch": 0.39658119658119656, "grad_norm": 0.946854293346405, "learning_rate": 1.9675278913395605e-05, "loss": 1.4287432432174683, "step": 58 }, { "epoch": 0.40341880341880343, "grad_norm": 1.0013198852539062, "learning_rate": 1.9656008644162134e-05, "loss": 1.4492701292037964, "step": 59 }, { "epoch": 0.41025641025641024, "grad_norm": 1.0438623428344727, "learning_rate": 1.9636192940204134e-05, "loss": 1.4924561977386475, "step": 60 }, { "epoch": 0.4170940170940171, "grad_norm": 0.9705636501312256, "learning_rate": 1.961583292084259e-05, "loss": 1.4596234560012817, "step": 61 }, { "epoch": 0.4239316239316239, "grad_norm": 0.9079157114028931, "learning_rate": 1.9594929736144978e-05, "loss": 1.44952392578125, "step": 62 }, { "epoch": 0.4307692307692308, "grad_norm": 0.9640805125236511, "learning_rate": 1.957348456686032e-05, "loss": 1.4430960416793823, "step": 63 }, { "epoch": 0.4376068376068376, "grad_norm": 0.9475866556167603, "learning_rate": 1.9551498624352497e-05, "loss": 1.446009635925293, "step": 64 }, { "epoch": 0.4444444444444444, "grad_norm": 0.948258638381958, "learning_rate": 1.9528973150531787e-05, "loss": 1.4411481618881226, "step": 65 }, { "epoch": 0.4512820512820513, "grad_norm": 0.9805014133453369, "learning_rate": 1.9505909417784758e-05, "loss": 1.4417314529418945, "step": 66 }, { "epoch": 0.4581196581196581, "grad_norm": 0.9225365519523621, "learning_rate": 1.9482308728902358e-05, "loss": 1.480376958847046, "step": 67 }, { "epoch": 0.46495726495726497, "grad_norm": 0.9221044182777405, "learning_rate": 1.9458172417006347e-05, "loss": 1.4625794887542725, "step": 68 }, { "epoch": 0.4717948717948718, "grad_norm": 0.9901456832885742, "learning_rate": 1.9433501845473996e-05, "loss": 1.4856598377227783, "step": 69 }, { "epoch": 0.47863247863247865, "grad_norm": 0.9551020860671997, "learning_rate": 1.9408298407861045e-05, "loss": 1.4896745681762695, "step": 70 }, { "epoch": 0.48547008547008547, "grad_norm": 0.9381822943687439, "learning_rate": 1.9382563527823026e-05, "loss": 1.4343875646591187, "step": 71 }, { "epoch": 0.49230769230769234, "grad_norm": 0.8770731091499329, "learning_rate": 1.935629865903482e-05, "loss": 1.4482182264328003, "step": 72 }, { "epoch": 0.49914529914529915, "grad_norm": 0.934929609298706, "learning_rate": 1.9329505285108544e-05, "loss": 1.4524080753326416, "step": 73 }, { "epoch": 0.505982905982906, "grad_norm": 0.9203254580497742, "learning_rate": 1.9302184919509758e-05, "loss": 1.4096636772155762, "step": 74 }, { "epoch": 0.5128205128205128, "grad_norm": 0.9084986448287964, "learning_rate": 1.927433910547197e-05, "loss": 1.423622488975525, "step": 75 }, { "epoch": 0.5196581196581197, "grad_norm": 0.8734993934631348, "learning_rate": 1.9245969415909464e-05, "loss": 1.4265828132629395, "step": 76 }, { "epoch": 0.5264957264957265, "grad_norm": 0.8964496850967407, "learning_rate": 1.921707745332845e-05, "loss": 1.4725595712661743, "step": 77 }, { "epoch": 0.5333333333333333, "grad_norm": 0.9096109867095947, "learning_rate": 1.9187664849736542e-05, "loss": 1.457470417022705, "step": 78 }, { "epoch": 0.5401709401709401, "grad_norm": 0.8932516574859619, "learning_rate": 1.9157733266550577e-05, "loss": 1.454951286315918, "step": 79 }, { "epoch": 0.5470085470085471, "grad_norm": 0.8940214514732361, "learning_rate": 1.9127284394502765e-05, "loss": 1.4776511192321777, "step": 80 }, { "epoch": 0.5538461538461539, "grad_norm": 0.8789263963699341, "learning_rate": 1.9096319953545186e-05, "loss": 1.4376585483551025, "step": 81 }, { "epoch": 0.5606837606837607, "grad_norm": 0.9395255446434021, "learning_rate": 1.906484169275263e-05, "loss": 1.4360781908035278, "step": 82 }, { "epoch": 0.5675213675213675, "grad_norm": 0.8618428707122803, "learning_rate": 1.903285139022381e-05, "loss": 1.4329712390899658, "step": 83 }, { "epoch": 0.5743589743589743, "grad_norm": 0.9313262104988098, "learning_rate": 1.900035085298091e-05, "loss": 1.446253776550293, "step": 84 }, { "epoch": 0.5811965811965812, "grad_norm": 0.8763355016708374, "learning_rate": 1.896734191686752e-05, "loss": 1.4160209894180298, "step": 85 }, { "epoch": 0.588034188034188, "grad_norm": 0.8777135610580444, "learning_rate": 1.8933826446444933e-05, "loss": 1.449493408203125, "step": 86 }, { "epoch": 0.5948717948717949, "grad_norm": 0.8737928867340088, "learning_rate": 1.889980633488683e-05, "loss": 1.377128005027771, "step": 87 }, { "epoch": 0.6017094017094017, "grad_norm": 0.923620343208313, "learning_rate": 1.8865283503872325e-05, "loss": 1.422142505645752, "step": 88 }, { "epoch": 0.6085470085470085, "grad_norm": 0.9419258832931519, "learning_rate": 1.8830259903477427e-05, "loss": 1.4897931814193726, "step": 89 }, { "epoch": 0.6153846153846154, "grad_norm": 0.9292656779289246, "learning_rate": 1.879473751206489e-05, "loss": 1.4244943857192993, "step": 90 }, { "epoch": 0.6222222222222222, "grad_norm": 0.9174057841300964, "learning_rate": 1.8758718336172462e-05, "loss": 1.432208776473999, "step": 91 }, { "epoch": 0.629059829059829, "grad_norm": 0.9447773694992065, "learning_rate": 1.8722204410399524e-05, "loss": 1.4501725435256958, "step": 92 }, { "epoch": 0.6358974358974359, "grad_norm": 0.8907484412193298, "learning_rate": 1.868519779729218e-05, "loss": 1.4563168287277222, "step": 93 }, { "epoch": 0.6427350427350428, "grad_norm": 0.8975157141685486, "learning_rate": 1.864770058722676e-05, "loss": 1.4320740699768066, "step": 94 }, { "epoch": 0.6495726495726496, "grad_norm": 0.9034259915351868, "learning_rate": 1.8609714898291716e-05, "loss": 1.4002689123153687, "step": 95 }, { "epoch": 0.6564102564102564, "grad_norm": 0.9356617331504822, "learning_rate": 1.8571242876167995e-05, "loss": 1.4669139385223389, "step": 96 }, { "epoch": 0.6632478632478632, "grad_norm": 0.9355176091194153, "learning_rate": 1.853228669400784e-05, "loss": 1.4444191455841064, "step": 97 }, { "epoch": 0.67008547008547, "grad_norm": 0.8931655883789062, "learning_rate": 1.8492848552312016e-05, "loss": 1.4415756464004517, "step": 98 }, { "epoch": 0.676923076923077, "grad_norm": 0.8951373100280762, "learning_rate": 1.8452930678805536e-05, "loss": 1.4061449766159058, "step": 99 }, { "epoch": 0.6837606837606838, "grad_norm": 0.9179074168205261, "learning_rate": 1.8412535328311813e-05, "loss": 1.4215387105941772, "step": 100 }, { "epoch": 0.6837606837606838, "eval_loss": 1.4336893558502197, "eval_runtime": 13.7947, "eval_samples_per_second": 71.477, "eval_steps_per_second": 8.989, "step": 100 }, { "epoch": 0.6905982905982906, "grad_norm": 0.977781355381012, "learning_rate": 1.8371664782625287e-05, "loss": 1.4540152549743652, "step": 101 }, { "epoch": 0.6974358974358974, "grad_norm": 0.9076094627380371, "learning_rate": 1.8330321350382545e-05, "loss": 1.415886640548706, "step": 102 }, { "epoch": 0.7042735042735043, "grad_norm": 0.8912188410758972, "learning_rate": 1.8288507366931907e-05, "loss": 1.4277691841125488, "step": 103 }, { "epoch": 0.7111111111111111, "grad_norm": 0.8660780787467957, "learning_rate": 1.8246225194201517e-05, "loss": 1.39166259765625, "step": 104 }, { "epoch": 0.717948717948718, "grad_norm": 0.9204691648483276, "learning_rate": 1.8203477220565912e-05, "loss": 1.4161370992660522, "step": 105 }, { "epoch": 0.7247863247863248, "grad_norm": 0.9661011695861816, "learning_rate": 1.8160265860711134e-05, "loss": 1.4492610692977905, "step": 106 }, { "epoch": 0.7316239316239316, "grad_norm": 0.9005808234214783, "learning_rate": 1.8116593555498308e-05, "loss": 1.4389468431472778, "step": 107 }, { "epoch": 0.7384615384615385, "grad_norm": 0.9088156223297119, "learning_rate": 1.807246277182578e-05, "loss": 1.4940838813781738, "step": 108 }, { "epoch": 0.7452991452991453, "grad_norm": 0.9402887225151062, "learning_rate": 1.802787600248977e-05, "loss": 1.4154539108276367, "step": 109 }, { "epoch": 0.7521367521367521, "grad_norm": 0.9380722045898438, "learning_rate": 1.798283576604356e-05, "loss": 1.4318289756774902, "step": 110 }, { "epoch": 0.7589743589743589, "grad_norm": 0.9319474101066589, "learning_rate": 1.7937344606655228e-05, "loss": 1.4192531108856201, "step": 111 }, { "epoch": 0.7658119658119659, "grad_norm": 0.9068304896354675, "learning_rate": 1.789140509396394e-05, "loss": 1.4170390367507935, "step": 112 }, { "epoch": 0.7726495726495727, "grad_norm": 0.8808281421661377, "learning_rate": 1.784501982293479e-05, "loss": 1.432860016822815, "step": 113 }, { "epoch": 0.7794871794871795, "grad_norm": 0.8805544376373291, "learning_rate": 1.7798191413712244e-05, "loss": 1.4037058353424072, "step": 114 }, { "epoch": 0.7863247863247863, "grad_norm": 0.8959332704544067, "learning_rate": 1.775092251147211e-05, "loss": 1.4175316095352173, "step": 115 }, { "epoch": 0.7931623931623931, "grad_norm": 0.8379173278808594, "learning_rate": 1.770321578627213e-05, "loss": 1.404625654220581, "step": 116 }, { "epoch": 0.8, "grad_norm": 0.8591132164001465, "learning_rate": 1.765507393290117e-05, "loss": 1.4534145593643188, "step": 117 }, { "epoch": 0.8068376068376069, "grad_norm": 0.8517522215843201, "learning_rate": 1.7606499670726972e-05, "loss": 1.4170221090316772, "step": 118 }, { "epoch": 0.8136752136752137, "grad_norm": 0.8700085282325745, "learning_rate": 1.7557495743542586e-05, "loss": 1.4001213312149048, "step": 119 }, { "epoch": 0.8205128205128205, "grad_norm": 0.8774170875549316, "learning_rate": 1.7508064919411344e-05, "loss": 1.418135643005371, "step": 120 }, { "epoch": 0.8273504273504273, "grad_norm": 0.8984478116035461, "learning_rate": 1.745820999051053e-05, "loss": 1.4195680618286133, "step": 121 }, { "epoch": 0.8341880341880342, "grad_norm": 0.8648718595504761, "learning_rate": 1.7407933772973638e-05, "loss": 1.383607029914856, "step": 122 }, { "epoch": 0.841025641025641, "grad_norm": 0.9336929321289062, "learning_rate": 1.735723910673132e-05, "loss": 1.4406161308288574, "step": 123 }, { "epoch": 0.8478632478632478, "grad_norm": 0.8780763149261475, "learning_rate": 1.730612885535094e-05, "loss": 1.4191570281982422, "step": 124 }, { "epoch": 0.8547008547008547, "grad_norm": 0.8674494624137878, "learning_rate": 1.7254605905874862e-05, "loss": 1.437395691871643, "step": 125 }, { "epoch": 0.8615384615384616, "grad_norm": 0.9440014958381653, "learning_rate": 1.7202673168657318e-05, "loss": 1.4250893592834473, "step": 126 }, { "epoch": 0.8683760683760684, "grad_norm": 0.9403019547462463, "learning_rate": 1.7150333577200062e-05, "loss": 1.435499906539917, "step": 127 }, { "epoch": 0.8752136752136752, "grad_norm": 0.863822877407074, "learning_rate": 1.709759008798663e-05, "loss": 1.409804105758667, "step": 128 }, { "epoch": 0.882051282051282, "grad_norm": 0.9274973273277283, "learning_rate": 1.7044445680315374e-05, "loss": 1.433601975440979, "step": 129 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9369088411331177, "learning_rate": 1.6990903356131125e-05, "loss": 1.4320355653762817, "step": 130 }, { "epoch": 0.8957264957264958, "grad_norm": 0.8703179955482483, "learning_rate": 1.6936966139855664e-05, "loss": 1.4167561531066895, "step": 131 }, { "epoch": 0.9025641025641026, "grad_norm": 0.9144904017448425, "learning_rate": 1.6882637078216867e-05, "loss": 1.4223415851593018, "step": 132 }, { "epoch": 0.9094017094017094, "grad_norm": 0.9126601219177246, "learning_rate": 1.6827919240076612e-05, "loss": 1.4480727910995483, "step": 133 }, { "epoch": 0.9162393162393162, "grad_norm": 0.8591611981391907, "learning_rate": 1.6772815716257414e-05, "loss": 1.40584135055542, "step": 134 }, { "epoch": 0.9230769230769231, "grad_norm": 0.8316404223442078, "learning_rate": 1.671732961936785e-05, "loss": 1.449837565422058, "step": 135 }, { "epoch": 0.9299145299145299, "grad_norm": 0.8785284757614136, "learning_rate": 1.6661464083626734e-05, "loss": 1.440337061882019, "step": 136 }, { "epoch": 0.9367521367521368, "grad_norm": 0.8786150813102722, "learning_rate": 1.6605222264686085e-05, "loss": 1.440657138824463, "step": 137 }, { "epoch": 0.9435897435897436, "grad_norm": 0.8501399159431458, "learning_rate": 1.6548607339452853e-05, "loss": 1.397615671157837, "step": 138 }, { "epoch": 0.9504273504273504, "grad_norm": 0.8737369775772095, "learning_rate": 1.6491622505909483e-05, "loss": 1.4285824298858643, "step": 139 }, { "epoch": 0.9572649572649573, "grad_norm": 0.8369284868240356, "learning_rate": 1.6434270982933272e-05, "loss": 1.3992527723312378, "step": 140 }, { "epoch": 0.9641025641025641, "grad_norm": 0.8740672469139099, "learning_rate": 1.637655601011454e-05, "loss": 1.4451634883880615, "step": 141 }, { "epoch": 0.9709401709401709, "grad_norm": 0.873289942741394, "learning_rate": 1.631848084757364e-05, "loss": 1.3965365886688232, "step": 142 }, { "epoch": 0.9777777777777777, "grad_norm": 0.9107730984687805, "learning_rate": 1.6260048775776804e-05, "loss": 1.4110256433486938, "step": 143 }, { "epoch": 0.9846153846153847, "grad_norm": 0.8785021305084229, "learning_rate": 1.6201263095350833e-05, "loss": 1.4294975996017456, "step": 144 }, { "epoch": 0.9914529914529915, "grad_norm": 0.8321818113327026, "learning_rate": 1.6142127126896682e-05, "loss": 1.4016475677490234, "step": 145 }, { "epoch": 0.9982905982905983, "grad_norm": 0.8866358399391174, "learning_rate": 1.6082644210801846e-05, "loss": 1.3802778720855713, "step": 146 }, { "epoch": 1.0, "grad_norm": 1.623956561088562, "learning_rate": 1.602281770705172e-05, "loss": 1.4806468486785889, "step": 147 }, { "epoch": 1.0068376068376068, "grad_norm": 1.1759995222091675, "learning_rate": 1.5962650995039783e-05, "loss": 1.3020893335342407, "step": 148 }, { "epoch": 1.0136752136752136, "grad_norm": 1.0619325637817383, "learning_rate": 1.5902147473376695e-05, "loss": 1.2844979763031006, "step": 149 }, { "epoch": 1.0205128205128204, "grad_norm": 0.9689248204231262, "learning_rate": 1.5841310559698346e-05, "loss": 1.3303570747375488, "step": 150 }, { "epoch": 1.0205128205128204, "eval_loss": 1.4194111824035645, "eval_runtime": 13.7873, "eval_samples_per_second": 71.515, "eval_steps_per_second": 8.994, "step": 150 }, { "epoch": 1.0273504273504273, "grad_norm": 0.9153519868850708, "learning_rate": 1.578014369047279e-05, "loss": 1.3417026996612549, "step": 151 }, { "epoch": 1.0341880341880343, "grad_norm": 0.9799442887306213, "learning_rate": 1.5718650320806145e-05, "loss": 1.293771743774414, "step": 152 }, { "epoch": 1.041025641025641, "grad_norm": 1.0599641799926758, "learning_rate": 1.56568339242474e-05, "loss": 1.3117493391036987, "step": 153 }, { "epoch": 1.047863247863248, "grad_norm": 0.9470742344856262, "learning_rate": 1.5594697992592232e-05, "loss": 1.2798222303390503, "step": 154 }, { "epoch": 1.0547008547008547, "grad_norm": 0.9936373829841614, "learning_rate": 1.5532246035685755e-05, "loss": 1.3070576190948486, "step": 155 }, { "epoch": 1.0615384615384615, "grad_norm": 0.9454049468040466, "learning_rate": 1.5469481581224274e-05, "loss": 1.3386294841766357, "step": 156 }, { "epoch": 1.0683760683760684, "grad_norm": 0.9544969797134399, "learning_rate": 1.5406408174555978e-05, "loss": 1.302185297012329, "step": 157 }, { "epoch": 1.0752136752136752, "grad_norm": 0.9065172076225281, "learning_rate": 1.5343029378480733e-05, "loss": 1.3039960861206055, "step": 158 }, { "epoch": 1.082051282051282, "grad_norm": 0.867220938205719, "learning_rate": 1.527934877304879e-05, "loss": 1.3006991147994995, "step": 159 }, { "epoch": 1.0888888888888888, "grad_norm": 0.9097728133201599, "learning_rate": 1.5215369955358568e-05, "loss": 1.2785807847976685, "step": 160 }, { "epoch": 1.0957264957264958, "grad_norm": 0.9294711351394653, "learning_rate": 1.5151096539353481e-05, "loss": 1.3051520586013794, "step": 161 }, { "epoch": 1.1025641025641026, "grad_norm": 0.9427935481071472, "learning_rate": 1.5086532155617785e-05, "loss": 1.3146125078201294, "step": 162 }, { "epoch": 1.1094017094017095, "grad_norm": 0.9104812741279602, "learning_rate": 1.5021680451171499e-05, "loss": 1.2878390550613403, "step": 163 }, { "epoch": 1.1162393162393163, "grad_norm": 0.8972042202949524, "learning_rate": 1.4956545089264408e-05, "loss": 1.3068175315856934, "step": 164 }, { "epoch": 1.123076923076923, "grad_norm": 0.9040313959121704, "learning_rate": 1.489112974916912e-05, "loss": 1.2897545099258423, "step": 165 }, { "epoch": 1.12991452991453, "grad_norm": 0.9337772727012634, "learning_rate": 1.4825438125973263e-05, "loss": 1.301710844039917, "step": 166 }, { "epoch": 1.1367521367521367, "grad_norm": 0.8870652914047241, "learning_rate": 1.4759473930370738e-05, "loss": 1.3163543939590454, "step": 167 }, { "epoch": 1.1435897435897435, "grad_norm": 0.8637550473213196, "learning_rate": 1.4693240888452121e-05, "loss": 1.3200492858886719, "step": 168 }, { "epoch": 1.1504273504273503, "grad_norm": 0.8388293981552124, "learning_rate": 1.4626742741494207e-05, "loss": 1.307487964630127, "step": 169 }, { "epoch": 1.1572649572649572, "grad_norm": 0.9050071835517883, "learning_rate": 1.4559983245748639e-05, "loss": 1.2808455228805542, "step": 170 }, { "epoch": 1.1641025641025642, "grad_norm": 0.965691089630127, "learning_rate": 1.449296617222978e-05, "loss": 1.332348346710205, "step": 171 }, { "epoch": 1.170940170940171, "grad_norm": 0.8704518675804138, "learning_rate": 1.4425695306501656e-05, "loss": 1.306895136833191, "step": 172 }, { "epoch": 1.1777777777777778, "grad_norm": 0.8741139769554138, "learning_rate": 1.4358174448464155e-05, "loss": 1.2980892658233643, "step": 173 }, { "epoch": 1.1846153846153846, "grad_norm": 0.9941467642784119, "learning_rate": 1.4290407412138365e-05, "loss": 1.2821602821350098, "step": 174 }, { "epoch": 1.1914529914529914, "grad_norm": 0.9268296957015991, "learning_rate": 1.4222398025451137e-05, "loss": 1.302233338356018, "step": 175 }, { "epoch": 1.1982905982905983, "grad_norm": 0.8978403806686401, "learning_rate": 1.4154150130018867e-05, "loss": 1.265356421470642, "step": 176 }, { "epoch": 1.205128205128205, "grad_norm": 0.9328585267066956, "learning_rate": 1.4085667580930482e-05, "loss": 1.320369005203247, "step": 177 }, { "epoch": 1.2119658119658119, "grad_norm": 0.9113616943359375, "learning_rate": 1.4016954246529697e-05, "loss": 1.2897846698760986, "step": 178 }, { "epoch": 1.218803418803419, "grad_norm": 0.9257543087005615, "learning_rate": 1.3948014008196486e-05, "loss": 1.3368397951126099, "step": 179 }, { "epoch": 1.2256410256410257, "grad_norm": 0.8960409164428711, "learning_rate": 1.3878850760127848e-05, "loss": 1.3266628980636597, "step": 180 }, { "epoch": 1.2324786324786325, "grad_norm": 0.9111725687980652, "learning_rate": 1.3809468409117845e-05, "loss": 1.2674126625061035, "step": 181 }, { "epoch": 1.2393162393162394, "grad_norm": 0.9564438462257385, "learning_rate": 1.3739870874336898e-05, "loss": 1.2953293323516846, "step": 182 }, { "epoch": 1.2461538461538462, "grad_norm": 1.0268452167510986, "learning_rate": 1.3670062087110423e-05, "loss": 1.3054559230804443, "step": 183 }, { "epoch": 1.252991452991453, "grad_norm": 0.8995468020439148, "learning_rate": 1.3600045990696762e-05, "loss": 1.3053619861602783, "step": 184 }, { "epoch": 1.2598290598290598, "grad_norm": 0.8805936574935913, "learning_rate": 1.352982654006444e-05, "loss": 1.3140225410461426, "step": 185 }, { "epoch": 1.2666666666666666, "grad_norm": 0.9060247540473938, "learning_rate": 1.3459407701668762e-05, "loss": 1.3046287298202515, "step": 186 }, { "epoch": 1.2735042735042734, "grad_norm": 0.8805747628211975, "learning_rate": 1.3388793453227766e-05, "loss": 1.3128578662872314, "step": 187 }, { "epoch": 1.2803418803418802, "grad_norm": 0.8997815847396851, "learning_rate": 1.331798778349752e-05, "loss": 1.3107125759124756, "step": 188 }, { "epoch": 1.287179487179487, "grad_norm": 0.9592490792274475, "learning_rate": 1.3246994692046837e-05, "loss": 1.3269885778427124, "step": 189 }, { "epoch": 1.294017094017094, "grad_norm": 0.9726372957229614, "learning_rate": 1.3175818189031326e-05, "loss": 1.337971806526184, "step": 190 }, { "epoch": 1.300854700854701, "grad_norm": 0.9480524659156799, "learning_rate": 1.3104462294966895e-05, "loss": 1.287239670753479, "step": 191 }, { "epoch": 1.3076923076923077, "grad_norm": 0.9071521162986755, "learning_rate": 1.3032931040502627e-05, "loss": 1.2962584495544434, "step": 192 }, { "epoch": 1.3145299145299145, "grad_norm": 0.9058794379234314, "learning_rate": 1.2961228466193116e-05, "loss": 1.280348300933838, "step": 193 }, { "epoch": 1.3213675213675213, "grad_norm": 0.9048560261726379, "learning_rate": 1.2889358622270225e-05, "loss": 1.3330844640731812, "step": 194 }, { "epoch": 1.3282051282051281, "grad_norm": 0.945749819278717, "learning_rate": 1.2817325568414299e-05, "loss": 1.3170994520187378, "step": 195 }, { "epoch": 1.335042735042735, "grad_norm": 0.9457980394363403, "learning_rate": 1.2745133373524855e-05, "loss": 1.3166072368621826, "step": 196 }, { "epoch": 1.341880341880342, "grad_norm": 0.9297810196876526, "learning_rate": 1.267278611549073e-05, "loss": 1.3273459672927856, "step": 197 }, { "epoch": 1.3487179487179488, "grad_norm": 0.9370136260986328, "learning_rate": 1.2600287880959762e-05, "loss": 1.3432742357254028, "step": 198 }, { "epoch": 1.3555555555555556, "grad_norm": 0.904547393321991, "learning_rate": 1.2527642765107919e-05, "loss": 1.3275690078735352, "step": 199 }, { "epoch": 1.3623931623931624, "grad_norm": 0.9034311175346375, "learning_rate": 1.2454854871407993e-05, "loss": 1.3097259998321533, "step": 200 }, { "epoch": 1.3623931623931624, "eval_loss": 1.4159187078475952, "eval_runtime": 13.7977, "eval_samples_per_second": 71.461, "eval_steps_per_second": 8.987, "step": 200 }, { "epoch": 1.3692307692307693, "grad_norm": 0.8713945150375366, "learning_rate": 1.2381928311397806e-05, "loss": 1.2865114212036133, "step": 201 }, { "epoch": 1.376068376068376, "grad_norm": 0.8947977423667908, "learning_rate": 1.2308867204447958e-05, "loss": 1.277376651763916, "step": 202 }, { "epoch": 1.3829059829059829, "grad_norm": 0.9047794342041016, "learning_rate": 1.2235675677529158e-05, "loss": 1.288478970527649, "step": 203 }, { "epoch": 1.3897435897435897, "grad_norm": 0.8953425884246826, "learning_rate": 1.2162357864979073e-05, "loss": 1.2861666679382324, "step": 204 }, { "epoch": 1.3965811965811965, "grad_norm": 0.9369704723358154, "learning_rate": 1.2088917908268822e-05, "loss": 1.2857511043548584, "step": 205 }, { "epoch": 1.4034188034188033, "grad_norm": 0.887296736240387, "learning_rate": 1.2015359955769021e-05, "loss": 1.2925364971160889, "step": 206 }, { "epoch": 1.4102564102564101, "grad_norm": 0.875452995300293, "learning_rate": 1.1941688162515468e-05, "loss": 1.3017300367355347, "step": 207 }, { "epoch": 1.4170940170940172, "grad_norm": 0.8836603760719299, "learning_rate": 1.186790668997443e-05, "loss": 1.2731754779815674, "step": 208 }, { "epoch": 1.423931623931624, "grad_norm": 0.8866926431655884, "learning_rate": 1.1794019705807584e-05, "loss": 1.3009804487228394, "step": 209 }, { "epoch": 1.4307692307692308, "grad_norm": 0.8414238095283508, "learning_rate": 1.1720031383636585e-05, "loss": 1.3082433938980103, "step": 210 }, { "epoch": 1.4376068376068376, "grad_norm": 0.8662127256393433, "learning_rate": 1.164594590280734e-05, "loss": 1.2641851902008057, "step": 211 }, { "epoch": 1.4444444444444444, "grad_norm": 0.9151703119277954, "learning_rate": 1.15717674481539e-05, "loss": 1.3064939975738525, "step": 212 }, { "epoch": 1.4512820512820512, "grad_norm": 0.9086518883705139, "learning_rate": 1.1497500209762102e-05, "loss": 1.3118016719818115, "step": 213 }, { "epoch": 1.458119658119658, "grad_norm": 0.9340091347694397, "learning_rate": 1.1423148382732854e-05, "loss": 1.3228766918182373, "step": 214 }, { "epoch": 1.464957264957265, "grad_norm": 0.865403950214386, "learning_rate": 1.1348716166945195e-05, "loss": 1.2863235473632812, "step": 215 }, { "epoch": 1.471794871794872, "grad_norm": 0.8879923224449158, "learning_rate": 1.127420776681905e-05, "loss": 1.3169306516647339, "step": 216 }, { "epoch": 1.4786324786324787, "grad_norm": 0.8761537075042725, "learning_rate": 1.1199627391077732e-05, "loss": 1.2758698463439941, "step": 217 }, { "epoch": 1.4854700854700855, "grad_norm": 0.905274510383606, "learning_rate": 1.1124979252510209e-05, "loss": 1.3158073425292969, "step": 218 }, { "epoch": 1.4923076923076923, "grad_norm": 0.9052457213401794, "learning_rate": 1.105026756773314e-05, "loss": 1.3242114782333374, "step": 219 }, { "epoch": 1.4991452991452991, "grad_norm": 0.8539809584617615, "learning_rate": 1.0975496556952683e-05, "loss": 1.295405387878418, "step": 220 }, { "epoch": 1.505982905982906, "grad_norm": 0.9171442985534668, "learning_rate": 1.0900670443726136e-05, "loss": 1.3160406351089478, "step": 221 }, { "epoch": 1.5128205128205128, "grad_norm": 0.877983570098877, "learning_rate": 1.0825793454723325e-05, "loss": 1.315245509147644, "step": 222 }, { "epoch": 1.5196581196581196, "grad_norm": 0.8745649456977844, "learning_rate": 1.0750869819487884e-05, "loss": 1.3248393535614014, "step": 223 }, { "epoch": 1.5264957264957264, "grad_norm": 0.8661232590675354, "learning_rate": 1.0675903770198333e-05, "loss": 1.2788147926330566, "step": 224 }, { "epoch": 1.5333333333333332, "grad_norm": 0.8793037533760071, "learning_rate": 1.0600899541429004e-05, "loss": 1.288352608680725, "step": 225 }, { "epoch": 1.54017094017094, "grad_norm": 0.9148133397102356, "learning_rate": 1.0525861369910877e-05, "loss": 1.3211514949798584, "step": 226 }, { "epoch": 1.547008547008547, "grad_norm": 0.9006965160369873, "learning_rate": 1.0450793494292223e-05, "loss": 1.3327584266662598, "step": 227 }, { "epoch": 1.5538461538461539, "grad_norm": 0.8701738119125366, "learning_rate": 1.0375700154899208e-05, "loss": 1.3010832071304321, "step": 228 }, { "epoch": 1.5606837606837607, "grad_norm": 0.880436360836029, "learning_rate": 1.0300585593496348e-05, "loss": 1.3152333498001099, "step": 229 }, { "epoch": 1.5675213675213675, "grad_norm": 0.8781545758247375, "learning_rate": 1.0225454053046922e-05, "loss": 1.2808175086975098, "step": 230 }, { "epoch": 1.5743589743589743, "grad_norm": 0.8630225658416748, "learning_rate": 1.0150309777473305e-05, "loss": 1.2873480319976807, "step": 231 }, { "epoch": 1.5811965811965814, "grad_norm": 0.8928260803222656, "learning_rate": 1.007515701141722e-05, "loss": 1.28458571434021, "step": 232 }, { "epoch": 1.5880341880341882, "grad_norm": 0.8699108958244324, "learning_rate": 1e-05, "loss": 1.2885918617248535, "step": 233 }, { "epoch": 1.594871794871795, "grad_norm": 0.8759332895278931, "learning_rate": 9.924842988582783e-06, "loss": 1.2787448167800903, "step": 234 }, { "epoch": 1.6017094017094018, "grad_norm": 0.8956566452980042, "learning_rate": 9.849690222526698e-06, "loss": 1.304962158203125, "step": 235 }, { "epoch": 1.6085470085470086, "grad_norm": 0.8675941824913025, "learning_rate": 9.77454594695308e-06, "loss": 1.2871266603469849, "step": 236 }, { "epoch": 1.6153846153846154, "grad_norm": 0.9092246294021606, "learning_rate": 9.699414406503655e-06, "loss": 1.327986240386963, "step": 237 }, { "epoch": 1.6222222222222222, "grad_norm": 0.8909919857978821, "learning_rate": 9.624299845100795e-06, "loss": 1.2647631168365479, "step": 238 }, { "epoch": 1.629059829059829, "grad_norm": 0.8657082915306091, "learning_rate": 9.549206505707778e-06, "loss": 1.294311761856079, "step": 239 }, { "epoch": 1.6358974358974359, "grad_norm": 0.8618515133857727, "learning_rate": 9.474138630089124e-06, "loss": 1.3014901876449585, "step": 240 }, { "epoch": 1.6427350427350427, "grad_norm": 0.8630589246749878, "learning_rate": 9.399100458570998e-06, "loss": 1.293131709098816, "step": 241 }, { "epoch": 1.6495726495726495, "grad_norm": 0.8735710978507996, "learning_rate": 9.324096229801673e-06, "loss": 1.290333867073059, "step": 242 }, { "epoch": 1.6564102564102563, "grad_norm": 0.8574416041374207, "learning_rate": 9.249130180512118e-06, "loss": 1.3111311197280884, "step": 243 }, { "epoch": 1.6632478632478631, "grad_norm": 0.9102303981781006, "learning_rate": 9.174206545276678e-06, "loss": 1.271691083908081, "step": 244 }, { "epoch": 1.67008547008547, "grad_norm": 0.867579996585846, "learning_rate": 9.099329556273866e-06, "loss": 1.3228224515914917, "step": 245 }, { "epoch": 1.676923076923077, "grad_norm": 0.8179166316986084, "learning_rate": 9.024503443047318e-06, "loss": 1.3084717988967896, "step": 246 }, { "epoch": 1.6837606837606838, "grad_norm": 0.8923108577728271, "learning_rate": 8.949732432266867e-06, "loss": 1.2903640270233154, "step": 247 }, { "epoch": 1.6905982905982906, "grad_norm": 0.9241410493850708, "learning_rate": 8.875020747489795e-06, "loss": 1.302449345588684, "step": 248 }, { "epoch": 1.6974358974358974, "grad_norm": 0.8430485129356384, "learning_rate": 8.800372608922272e-06, "loss": 1.2765015363693237, "step": 249 }, { "epoch": 1.7042735042735044, "grad_norm": 0.8592954874038696, "learning_rate": 8.72579223318095e-06, "loss": 1.317484736442566, "step": 250 }, { "epoch": 1.7042735042735044, "eval_loss": 1.4088929891586304, "eval_runtime": 13.7993, "eval_samples_per_second": 71.453, "eval_steps_per_second": 8.986, "step": 250 }, { "epoch": 1.7111111111111112, "grad_norm": 0.916032612323761, "learning_rate": 8.65128383305481e-06, "loss": 1.300941824913025, "step": 251 }, { "epoch": 1.717948717948718, "grad_norm": 0.8675019145011902, "learning_rate": 8.576851617267151e-06, "loss": 1.3122076988220215, "step": 252 }, { "epoch": 1.7247863247863249, "grad_norm": 0.8310043811798096, "learning_rate": 8.5024997902379e-06, "loss": 1.3160263299942017, "step": 253 }, { "epoch": 1.7316239316239317, "grad_norm": 0.8706823587417603, "learning_rate": 8.428232551846101e-06, "loss": 1.2773703336715698, "step": 254 }, { "epoch": 1.7384615384615385, "grad_norm": 0.8875864744186401, "learning_rate": 8.35405409719266e-06, "loss": 1.288883090019226, "step": 255 }, { "epoch": 1.7452991452991453, "grad_norm": 0.9055056571960449, "learning_rate": 8.279968616363417e-06, "loss": 1.3028110265731812, "step": 256 }, { "epoch": 1.7521367521367521, "grad_norm": 0.905623197555542, "learning_rate": 8.205980294192421e-06, "loss": 1.3112901449203491, "step": 257 }, { "epoch": 1.758974358974359, "grad_norm": 0.847100555896759, "learning_rate": 8.132093310025572e-06, "loss": 1.311500906944275, "step": 258 }, { "epoch": 1.7658119658119658, "grad_norm": 0.8671444058418274, "learning_rate": 8.058311837484537e-06, "loss": 1.308862566947937, "step": 259 }, { "epoch": 1.7726495726495726, "grad_norm": 0.844569742679596, "learning_rate": 7.984640044230984e-06, "loss": 1.3032524585723877, "step": 260 }, { "epoch": 1.7794871794871794, "grad_norm": 0.9013960957527161, "learning_rate": 7.911082091731182e-06, "loss": 1.2791337966918945, "step": 261 }, { "epoch": 1.7863247863247862, "grad_norm": 0.8714650869369507, "learning_rate": 7.837642135020929e-06, "loss": 1.2602317333221436, "step": 262 }, { "epoch": 1.793162393162393, "grad_norm": 0.9024747014045715, "learning_rate": 7.764324322470842e-06, "loss": 1.279998540878296, "step": 263 }, { "epoch": 1.8, "grad_norm": 0.8714993596076965, "learning_rate": 7.691132795552044e-06, "loss": 1.284783959388733, "step": 264 }, { "epoch": 1.8068376068376069, "grad_norm": 0.8371661305427551, "learning_rate": 7.618071688602199e-06, "loss": 1.3234297037124634, "step": 265 }, { "epoch": 1.8136752136752137, "grad_norm": 0.8943991661071777, "learning_rate": 7.545145128592009e-06, "loss": 1.2969616651535034, "step": 266 }, { "epoch": 1.8205128205128205, "grad_norm": 0.8753275275230408, "learning_rate": 7.472357234892083e-06, "loss": 1.2795380353927612, "step": 267 }, { "epoch": 1.8273504273504273, "grad_norm": 0.8614721894264221, "learning_rate": 7.3997121190402375e-06, "loss": 1.3064361810684204, "step": 268 }, { "epoch": 1.8341880341880343, "grad_norm": 0.853656530380249, "learning_rate": 7.3272138845092725e-06, "loss": 1.3017405271530151, "step": 269 }, { "epoch": 1.8410256410256411, "grad_norm": 0.8655431866645813, "learning_rate": 7.254866626475152e-06, "loss": 1.304486632347107, "step": 270 }, { "epoch": 1.847863247863248, "grad_norm": 0.87064528465271, "learning_rate": 7.182674431585703e-06, "loss": 1.2795239686965942, "step": 271 }, { "epoch": 1.8547008547008548, "grad_norm": 0.8889244198799133, "learning_rate": 7.110641377729778e-06, "loss": 1.294914960861206, "step": 272 }, { "epoch": 1.8615384615384616, "grad_norm": 0.9096329212188721, "learning_rate": 7.038771533806884e-06, "loss": 1.2885854244232178, "step": 273 }, { "epoch": 1.8683760683760684, "grad_norm": 0.8873443007469177, "learning_rate": 6.967068959497376e-06, "loss": 1.297377347946167, "step": 274 }, { "epoch": 1.8752136752136752, "grad_norm": 0.8182293772697449, "learning_rate": 6.895537705033108e-06, "loss": 1.3091909885406494, "step": 275 }, { "epoch": 1.882051282051282, "grad_norm": 0.849620521068573, "learning_rate": 6.824181810968675e-06, "loss": 1.2712843418121338, "step": 276 }, { "epoch": 1.8888888888888888, "grad_norm": 0.8953171372413635, "learning_rate": 6.7530053079531664e-06, "loss": 1.305629849433899, "step": 277 }, { "epoch": 1.8957264957264957, "grad_norm": 0.8743292689323425, "learning_rate": 6.6820122165024845e-06, "loss": 1.3009774684906006, "step": 278 }, { "epoch": 1.9025641025641025, "grad_norm": 0.8852370977401733, "learning_rate": 6.6112065467722375e-06, "loss": 1.2898852825164795, "step": 279 }, { "epoch": 1.9094017094017093, "grad_norm": 0.8812291026115417, "learning_rate": 6.540592298331239e-06, "loss": 1.3161499500274658, "step": 280 }, { "epoch": 1.916239316239316, "grad_norm": 0.8949340581893921, "learning_rate": 6.4701734599355605e-06, "loss": 1.2947360277175903, "step": 281 }, { "epoch": 1.9230769230769231, "grad_norm": 0.8372949957847595, "learning_rate": 6.3999540093032396e-06, "loss": 1.263576626777649, "step": 282 }, { "epoch": 1.92991452991453, "grad_norm": 0.8882158398628235, "learning_rate": 6.329937912889582e-06, "loss": 1.2893450260162354, "step": 283 }, { "epoch": 1.9367521367521368, "grad_norm": 0.838527500629425, "learning_rate": 6.260129125663106e-06, "loss": 1.2985213994979858, "step": 284 }, { "epoch": 1.9435897435897436, "grad_norm": 0.8823593258857727, "learning_rate": 6.1905315908821584e-06, "loss": 1.306897521018982, "step": 285 }, { "epoch": 1.9504273504273504, "grad_norm": 0.8618027567863464, "learning_rate": 6.121149239872151e-06, "loss": 1.2990589141845703, "step": 286 }, { "epoch": 1.9572649572649574, "grad_norm": 0.8389527797698975, "learning_rate": 6.051985991803517e-06, "loss": 1.2886924743652344, "step": 287 }, { "epoch": 1.9641025641025642, "grad_norm": 0.8738916516304016, "learning_rate": 5.983045753470308e-06, "loss": 1.3003113269805908, "step": 288 }, { "epoch": 1.970940170940171, "grad_norm": 0.8567415475845337, "learning_rate": 5.91433241906952e-06, "loss": 1.285038948059082, "step": 289 }, { "epoch": 1.9777777777777779, "grad_norm": 0.8555871248245239, "learning_rate": 5.845849869981137e-06, "loss": 1.2825312614440918, "step": 290 }, { "epoch": 1.9846153846153847, "grad_norm": 0.8524548411369324, "learning_rate": 5.7776019745488665e-06, "loss": 1.3078036308288574, "step": 291 }, { "epoch": 1.9914529914529915, "grad_norm": 0.8610931634902954, "learning_rate": 5.709592587861637e-06, "loss": 1.2933144569396973, "step": 292 }, { "epoch": 1.9982905982905983, "grad_norm": 0.8547428250312805, "learning_rate": 5.641825551535849e-06, "loss": 1.2723497152328491, "step": 293 }, { "epoch": 2.0, "grad_norm": 1.6815301179885864, "learning_rate": 5.574304693498346e-06, "loss": 1.260840892791748, "step": 294 }, { "epoch": 2.006837606837607, "grad_norm": 1.1894463300704956, "learning_rate": 5.507033827770225e-06, "loss": 1.2158567905426025, "step": 295 }, { "epoch": 2.0136752136752136, "grad_norm": 1.1574074029922485, "learning_rate": 5.440016754251364e-06, "loss": 1.188340663909912, "step": 296 }, { "epoch": 2.0205128205128204, "grad_norm": 0.9981362819671631, "learning_rate": 5.373257258505798e-06, "loss": 1.1729332208633423, "step": 297 }, { "epoch": 2.0273504273504273, "grad_norm": 1.0496586561203003, "learning_rate": 5.306759111547881e-06, "loss": 1.1735312938690186, "step": 298 }, { "epoch": 2.034188034188034, "grad_norm": 0.9409749507904053, "learning_rate": 5.240526069629265e-06, "loss": 1.198318600654602, "step": 299 }, { "epoch": 2.041025641025641, "grad_norm": 0.9382721781730652, "learning_rate": 5.174561874026741e-06, "loss": 1.2194828987121582, "step": 300 }, { "epoch": 2.041025641025641, "eval_loss": 1.4175776243209839, "eval_runtime": 13.7699, "eval_samples_per_second": 71.606, "eval_steps_per_second": 9.005, "step": 300 }, { "epoch": 2.0478632478632477, "grad_norm": 0.936610996723175, "learning_rate": 5.1088702508308815e-06, "loss": 1.2439236640930176, "step": 301 }, { "epoch": 2.0547008547008545, "grad_norm": 0.9476950764656067, "learning_rate": 5.043454910735595e-06, "loss": 1.2119914293289185, "step": 302 }, { "epoch": 2.0615384615384613, "grad_norm": 0.975143313407898, "learning_rate": 4.978319548828504e-06, "loss": 1.1766479015350342, "step": 303 }, { "epoch": 2.0683760683760686, "grad_norm": 0.9535344243049622, "learning_rate": 4.913467844382217e-06, "loss": 1.2154781818389893, "step": 304 }, { "epoch": 2.0752136752136754, "grad_norm": 0.9839100241661072, "learning_rate": 4.848903460646522e-06, "loss": 1.1973791122436523, "step": 305 }, { "epoch": 2.082051282051282, "grad_norm": 0.9296822547912598, "learning_rate": 4.784630044641435e-06, "loss": 1.2077343463897705, "step": 306 }, { "epoch": 2.088888888888889, "grad_norm": 0.9518297910690308, "learning_rate": 4.720651226951213e-06, "loss": 1.2044742107391357, "step": 307 }, { "epoch": 2.095726495726496, "grad_norm": 0.9024590253829956, "learning_rate": 4.65697062151927e-06, "loss": 1.2214324474334717, "step": 308 }, { "epoch": 2.1025641025641026, "grad_norm": 0.8939958214759827, "learning_rate": 4.593591825444028e-06, "loss": 1.230959177017212, "step": 309 }, { "epoch": 2.1094017094017095, "grad_norm": 0.9565759301185608, "learning_rate": 4.530518418775734e-06, "loss": 1.2308049201965332, "step": 310 }, { "epoch": 2.1162393162393163, "grad_norm": 0.8952397704124451, "learning_rate": 4.467753964314245e-06, "loss": 1.2218645811080933, "step": 311 }, { "epoch": 2.123076923076923, "grad_norm": 0.9192137122154236, "learning_rate": 4.40530200740777e-06, "loss": 1.1945393085479736, "step": 312 }, { "epoch": 2.12991452991453, "grad_norm": 0.9151750206947327, "learning_rate": 4.343166075752605e-06, "loss": 1.1909265518188477, "step": 313 }, { "epoch": 2.1367521367521367, "grad_norm": 0.912064790725708, "learning_rate": 4.281349679193862e-06, "loss": 1.176002860069275, "step": 314 }, { "epoch": 2.1435897435897435, "grad_norm": 0.9001777172088623, "learning_rate": 4.219856309527212e-06, "loss": 1.2102347612380981, "step": 315 }, { "epoch": 2.1504273504273503, "grad_norm": 0.9100410342216492, "learning_rate": 4.1586894403016576e-06, "loss": 1.2215776443481445, "step": 316 }, { "epoch": 2.157264957264957, "grad_norm": 0.8823668360710144, "learning_rate": 4.097852526623307e-06, "loss": 1.1972424983978271, "step": 317 }, { "epoch": 2.164102564102564, "grad_norm": 0.8945139050483704, "learning_rate": 4.03734900496022e-06, "loss": 1.2440537214279175, "step": 318 }, { "epoch": 2.1709401709401708, "grad_norm": 0.858863890171051, "learning_rate": 3.9771822929482825e-06, "loss": 1.2240134477615356, "step": 319 }, { "epoch": 2.1777777777777776, "grad_norm": 0.9579023122787476, "learning_rate": 3.917355789198157e-06, "loss": 1.1975905895233154, "step": 320 }, { "epoch": 2.184615384615385, "grad_norm": 0.8992065191268921, "learning_rate": 3.857872873103322e-06, "loss": 1.2251243591308594, "step": 321 }, { "epoch": 2.1914529914529917, "grad_norm": 0.8930969834327698, "learning_rate": 3.7987369046491684e-06, "loss": 1.1994602680206299, "step": 322 }, { "epoch": 2.1982905982905985, "grad_norm": 0.8879907727241516, "learning_rate": 3.7399512242231994e-06, "loss": 1.2023355960845947, "step": 323 }, { "epoch": 2.2051282051282053, "grad_norm": 0.8827998638153076, "learning_rate": 3.6815191524263628e-06, "loss": 1.1980074644088745, "step": 324 }, { "epoch": 2.211965811965812, "grad_norm": 0.9081103801727295, "learning_rate": 3.623443989885462e-06, "loss": 1.2123109102249146, "step": 325 }, { "epoch": 2.218803418803419, "grad_norm": 0.8658437132835388, "learning_rate": 3.565729017066729e-06, "loss": 1.1860473155975342, "step": 326 }, { "epoch": 2.2256410256410257, "grad_norm": 0.8716210722923279, "learning_rate": 3.508377494090521e-06, "loss": 1.246274471282959, "step": 327 }, { "epoch": 2.2324786324786325, "grad_norm": 0.8930105566978455, "learning_rate": 3.4513926605471504e-06, "loss": 1.2249618768692017, "step": 328 }, { "epoch": 2.2393162393162394, "grad_norm": 0.8859133720397949, "learning_rate": 3.3947777353139188e-06, "loss": 1.2300435304641724, "step": 329 }, { "epoch": 2.246153846153846, "grad_norm": 0.876879096031189, "learning_rate": 3.338535916373267e-06, "loss": 1.226067066192627, "step": 330 }, { "epoch": 2.252991452991453, "grad_norm": 0.8582764863967896, "learning_rate": 3.2826703806321526e-06, "loss": 1.2141978740692139, "step": 331 }, { "epoch": 2.25982905982906, "grad_norm": 0.9050947427749634, "learning_rate": 3.2271842837425917e-06, "loss": 1.199479103088379, "step": 332 }, { "epoch": 2.2666666666666666, "grad_norm": 0.8743166923522949, "learning_rate": 3.1720807599233903e-06, "loss": 1.2526406049728394, "step": 333 }, { "epoch": 2.2735042735042734, "grad_norm": 0.9142019152641296, "learning_rate": 3.1173629217831345e-06, "loss": 1.1963285207748413, "step": 334 }, { "epoch": 2.2803418803418802, "grad_norm": 0.8888209462165833, "learning_rate": 3.063033860144339e-06, "loss": 1.209120512008667, "step": 335 }, { "epoch": 2.287179487179487, "grad_norm": 0.8925624489784241, "learning_rate": 3.0090966438688774e-06, "loss": 1.1804795265197754, "step": 336 }, { "epoch": 2.294017094017094, "grad_norm": 0.9087634682655334, "learning_rate": 2.9555543196846293e-06, "loss": 1.2147403955459595, "step": 337 }, { "epoch": 2.3008547008547007, "grad_norm": 0.9099950194358826, "learning_rate": 2.9024099120133674e-06, "loss": 1.2237548828125, "step": 338 }, { "epoch": 2.3076923076923075, "grad_norm": 0.8658971786499023, "learning_rate": 2.8496664227999417e-06, "loss": 1.2072890996932983, "step": 339 }, { "epoch": 2.3145299145299143, "grad_norm": 0.8897408843040466, "learning_rate": 2.7973268313426836e-06, "loss": 1.2147533893585205, "step": 340 }, { "epoch": 2.3213675213675216, "grad_norm": 0.8564779758453369, "learning_rate": 2.745394094125141e-06, "loss": 1.2456395626068115, "step": 341 }, { "epoch": 2.3282051282051284, "grad_norm": 0.8652287125587463, "learning_rate": 2.6938711446490607e-06, "loss": 1.2109252214431763, "step": 342 }, { "epoch": 2.335042735042735, "grad_norm": 0.8643552660942078, "learning_rate": 2.642760893268684e-06, "loss": 1.1878920793533325, "step": 343 }, { "epoch": 2.341880341880342, "grad_norm": 0.8824043869972229, "learning_rate": 2.5920662270263653e-06, "loss": 1.1911319494247437, "step": 344 }, { "epoch": 2.348717948717949, "grad_norm": 0.8898422122001648, "learning_rate": 2.541790009489474e-06, "loss": 1.193242073059082, "step": 345 }, { "epoch": 2.3555555555555556, "grad_norm": 0.8772786259651184, "learning_rate": 2.491935080588658e-06, "loss": 1.1836318969726562, "step": 346 }, { "epoch": 2.3623931623931624, "grad_norm": 0.8587839603424072, "learning_rate": 2.4425042564574186e-06, "loss": 1.2118480205535889, "step": 347 }, { "epoch": 2.3692307692307693, "grad_norm": 0.8739367127418518, "learning_rate": 2.3935003292730295e-06, "loss": 1.2201834917068481, "step": 348 }, { "epoch": 2.376068376068376, "grad_norm": 0.8904187679290771, "learning_rate": 2.344926067098836e-06, "loss": 1.1912821531295776, "step": 349 }, { "epoch": 2.382905982905983, "grad_norm": 0.8717731237411499, "learning_rate": 2.2967842137278706e-06, "loss": 1.2726080417633057, "step": 350 }, { "epoch": 2.382905982905983, "eval_loss": 1.422935962677002, "eval_runtime": 13.7932, "eval_samples_per_second": 71.484, "eval_steps_per_second": 8.99, "step": 350 }, { "epoch": 2.3897435897435897, "grad_norm": 0.8623640537261963, "learning_rate": 2.249077488527891e-06, "loss": 1.1917917728424072, "step": 351 }, { "epoch": 2.3965811965811965, "grad_norm": 0.9295298457145691, "learning_rate": 2.201808586287757e-06, "loss": 1.195438027381897, "step": 352 }, { "epoch": 2.4034188034188033, "grad_norm": 0.8726212382316589, "learning_rate": 2.15498017706521e-06, "loss": 1.1993173360824585, "step": 353 }, { "epoch": 2.41025641025641, "grad_norm": 0.8750997185707092, "learning_rate": 2.1085949060360654e-06, "loss": 1.2198253870010376, "step": 354 }, { "epoch": 2.417094017094017, "grad_norm": 0.8799977898597717, "learning_rate": 2.0626553933447734e-06, "loss": 1.1714023351669312, "step": 355 }, { "epoch": 2.4239316239316238, "grad_norm": 0.9106065034866333, "learning_rate": 2.01716423395644e-06, "loss": 1.2285724878311157, "step": 356 }, { "epoch": 2.430769230769231, "grad_norm": 0.8555257320404053, "learning_rate": 1.9721239975102313e-06, "loss": 1.1813218593597412, "step": 357 }, { "epoch": 2.437606837606838, "grad_norm": 0.8696889877319336, "learning_rate": 1.9275372281742242e-06, "loss": 1.2316478490829468, "step": 358 }, { "epoch": 2.4444444444444446, "grad_norm": 0.9041836857795715, "learning_rate": 1.8834064445016952e-06, "loss": 1.2227892875671387, "step": 359 }, { "epoch": 2.4512820512820515, "grad_norm": 0.8697716593742371, "learning_rate": 1.8397341392888679e-06, "loss": 1.224617600440979, "step": 360 }, { "epoch": 2.4581196581196583, "grad_norm": 0.8882873058319092, "learning_rate": 1.7965227794340879e-06, "loss": 1.1995422840118408, "step": 361 }, { "epoch": 2.464957264957265, "grad_norm": 0.8834539651870728, "learning_rate": 1.7537748057984861e-06, "loss": 1.2222732305526733, "step": 362 }, { "epoch": 2.471794871794872, "grad_norm": 0.899989128112793, "learning_rate": 1.7114926330680958e-06, "loss": 1.2143341302871704, "step": 363 }, { "epoch": 2.4786324786324787, "grad_norm": 0.8635477423667908, "learning_rate": 1.6696786496174578e-06, "loss": 1.2323466539382935, "step": 364 }, { "epoch": 2.4854700854700855, "grad_norm": 0.8827865719795227, "learning_rate": 1.6283352173747148e-06, "loss": 1.1907907724380493, "step": 365 }, { "epoch": 2.4923076923076923, "grad_norm": 0.8702190518379211, "learning_rate": 1.587464671688187e-06, "loss": 1.201211929321289, "step": 366 }, { "epoch": 2.499145299145299, "grad_norm": 0.8626653552055359, "learning_rate": 1.5470693211944643e-06, "loss": 1.1894201040267944, "step": 367 }, { "epoch": 2.505982905982906, "grad_norm": 0.879705011844635, "learning_rate": 1.5071514476879878e-06, "loss": 1.2102407217025757, "step": 368 }, { "epoch": 2.5128205128205128, "grad_norm": 0.8780226707458496, "learning_rate": 1.4677133059921634e-06, "loss": 1.235593557357788, "step": 369 }, { "epoch": 2.5196581196581196, "grad_norm": 0.8804551362991333, "learning_rate": 1.4287571238320053e-06, "loss": 1.2265985012054443, "step": 370 }, { "epoch": 2.5264957264957264, "grad_norm": 0.8670660257339478, "learning_rate": 1.3902851017082863e-06, "loss": 1.1925873756408691, "step": 371 }, { "epoch": 2.533333333333333, "grad_norm": 0.8729323744773865, "learning_rate": 1.3522994127732415e-06, "loss": 1.20308518409729, "step": 372 }, { "epoch": 2.54017094017094, "grad_norm": 0.8794763088226318, "learning_rate": 1.3148022027078223e-06, "loss": 1.2204805612564087, "step": 373 }, { "epoch": 2.547008547008547, "grad_norm": 0.870823323726654, "learning_rate": 1.2777955896004812e-06, "loss": 1.2257260084152222, "step": 374 }, { "epoch": 2.5538461538461537, "grad_norm": 0.8570955991744995, "learning_rate": 1.2412816638275406e-06, "loss": 1.2166708707809448, "step": 375 }, { "epoch": 2.5606837606837605, "grad_norm": 0.8496021628379822, "learning_rate": 1.2052624879351105e-06, "loss": 1.1956825256347656, "step": 376 }, { "epoch": 2.5675213675213673, "grad_norm": 0.8563467860221863, "learning_rate": 1.1697400965225746e-06, "loss": 1.2383781671524048, "step": 377 }, { "epoch": 2.574358974358974, "grad_norm": 0.8653855919837952, "learning_rate": 1.134716496127679e-06, "loss": 1.218265414237976, "step": 378 }, { "epoch": 2.5811965811965814, "grad_norm": 0.8653165698051453, "learning_rate": 1.1001936651131717e-06, "loss": 1.226462483406067, "step": 379 }, { "epoch": 2.588034188034188, "grad_norm": 0.8810314536094666, "learning_rate": 1.0661735535550666e-06, "loss": 1.176276445388794, "step": 380 }, { "epoch": 2.594871794871795, "grad_norm": 0.8538199663162231, "learning_rate": 1.0326580831324816e-06, "loss": 1.2393090724945068, "step": 381 }, { "epoch": 2.601709401709402, "grad_norm": 0.849739134311676, "learning_rate": 9.996491470190917e-07, "loss": 1.2231508493423462, "step": 382 }, { "epoch": 2.6085470085470086, "grad_norm": 0.891149640083313, "learning_rate": 9.671486097761918e-07, "loss": 1.2225626707077026, "step": 383 }, { "epoch": 2.6153846153846154, "grad_norm": 0.8668763637542725, "learning_rate": 9.351583072473713e-07, "loss": 1.2182505130767822, "step": 384 }, { "epoch": 2.6222222222222222, "grad_norm": 0.8931220173835754, "learning_rate": 9.036800464548157e-07, "loss": 1.1996538639068604, "step": 385 }, { "epoch": 2.629059829059829, "grad_norm": 0.923690140247345, "learning_rate": 8.727156054972374e-07, "loss": 1.238417148590088, "step": 386 }, { "epoch": 2.635897435897436, "grad_norm": 0.9119179844856262, "learning_rate": 8.42266733449425e-07, "loss": 1.226833462715149, "step": 387 }, { "epoch": 2.6427350427350427, "grad_norm": 0.8686037659645081, "learning_rate": 8.123351502634625e-07, "loss": 1.1834110021591187, "step": 388 }, { "epoch": 2.6495726495726495, "grad_norm": 0.8596007823944092, "learning_rate": 7.829225466715551e-07, "loss": 1.1922662258148193, "step": 389 }, { "epoch": 2.6564102564102563, "grad_norm": 0.8411397337913513, "learning_rate": 7.540305840905371e-07, "loss": 1.2220802307128906, "step": 390 }, { "epoch": 2.663247863247863, "grad_norm": 0.8473320007324219, "learning_rate": 7.256608945280319e-07, "loss": 1.176034688949585, "step": 391 }, { "epoch": 2.67008547008547, "grad_norm": 0.8465791940689087, "learning_rate": 6.978150804902451e-07, "loss": 1.2118513584136963, "step": 392 }, { "epoch": 2.676923076923077, "grad_norm": 0.8556994199752808, "learning_rate": 6.704947148914608e-07, "loss": 1.2035595178604126, "step": 393 }, { "epoch": 2.683760683760684, "grad_norm": 0.8603663444519043, "learning_rate": 6.437013409651849e-07, "loss": 1.2043513059616089, "step": 394 }, { "epoch": 2.690598290598291, "grad_norm": 0.8347552418708801, "learning_rate": 6.174364721769744e-07, "loss": 1.260666847229004, "step": 395 }, { "epoch": 2.6974358974358976, "grad_norm": 0.867624044418335, "learning_rate": 5.917015921389569e-07, "loss": 1.2071622610092163, "step": 396 }, { "epoch": 2.7042735042735044, "grad_norm": 0.8668217062950134, "learning_rate": 5.664981545260073e-07, "loss": 1.197313904762268, "step": 397 }, { "epoch": 2.7111111111111112, "grad_norm": 0.8758941292762756, "learning_rate": 5.418275829936537e-07, "loss": 1.1844048500061035, "step": 398 }, { "epoch": 2.717948717948718, "grad_norm": 0.866844892501831, "learning_rate": 5.176912710976467e-07, "loss": 1.1971948146820068, "step": 399 }, { "epoch": 2.724786324786325, "grad_norm": 0.8587160110473633, "learning_rate": 4.940905822152454e-07, "loss": 1.1895333528518677, "step": 400 }, { "epoch": 2.724786324786325, "eval_loss": 1.4216117858886719, "eval_runtime": 13.782, "eval_samples_per_second": 71.543, "eval_steps_per_second": 8.997, "step": 400 }, { "epoch": 2.7316239316239317, "grad_norm": 0.8763930201530457, "learning_rate": 4.710268494682146e-07, "loss": 1.1914920806884766, "step": 401 }, { "epoch": 2.7384615384615385, "grad_norm": 0.8831557035446167, "learning_rate": 4.485013756475076e-07, "loss": 1.1900079250335693, "step": 402 }, { "epoch": 2.7452991452991453, "grad_norm": 0.866532027721405, "learning_rate": 4.265154331396815e-07, "loss": 1.1844745874404907, "step": 403 }, { "epoch": 2.752136752136752, "grad_norm": 0.8787288069725037, "learning_rate": 4.0507026385502747e-07, "loss": 1.2126126289367676, "step": 404 }, { "epoch": 2.758974358974359, "grad_norm": 0.8669936060905457, "learning_rate": 3.841670791574137e-07, "loss": 1.229267954826355, "step": 405 }, { "epoch": 2.7658119658119658, "grad_norm": 0.8436914086341858, "learning_rate": 3.638070597958665e-07, "loss": 1.1994611024856567, "step": 406 }, { "epoch": 2.7726495726495726, "grad_norm": 0.8477561473846436, "learning_rate": 3.439913558378705e-07, "loss": 1.2160733938217163, "step": 407 }, { "epoch": 2.7794871794871794, "grad_norm": 0.9217561483383179, "learning_rate": 3.2472108660439706e-07, "loss": 1.1882672309875488, "step": 408 }, { "epoch": 2.786324786324786, "grad_norm": 0.8692064881324768, "learning_rate": 3.059973406066963e-07, "loss": 1.186108112335205, "step": 409 }, { "epoch": 2.793162393162393, "grad_norm": 0.8593800067901611, "learning_rate": 2.878211754847926e-07, "loss": 1.2128371000289917, "step": 410 }, { "epoch": 2.8, "grad_norm": 0.8875913023948669, "learning_rate": 2.701936179477516e-07, "loss": 1.1906311511993408, "step": 411 }, { "epoch": 2.8068376068376066, "grad_norm": 0.8833599090576172, "learning_rate": 2.5311566371568505e-07, "loss": 1.1937415599822998, "step": 412 }, { "epoch": 2.8136752136752134, "grad_norm": 0.8523573279380798, "learning_rate": 2.3658827746349976e-07, "loss": 1.1862268447875977, "step": 413 }, { "epoch": 2.8205128205128203, "grad_norm": 0.8653656244277954, "learning_rate": 2.206123927664161e-07, "loss": 1.2255483865737915, "step": 414 }, { "epoch": 2.827350427350427, "grad_norm": 0.874724805355072, "learning_rate": 2.0518891204722169e-07, "loss": 1.2177876234054565, "step": 415 }, { "epoch": 2.8341880341880343, "grad_norm": 0.8411559462547302, "learning_rate": 1.903187065253076e-07, "loss": 1.2034833431243896, "step": 416 }, { "epoch": 2.841025641025641, "grad_norm": 0.8371963500976562, "learning_rate": 1.7600261616745106e-07, "loss": 1.1710231304168701, "step": 417 }, { "epoch": 2.847863247863248, "grad_norm": 0.8555141687393188, "learning_rate": 1.622414496403668e-07, "loss": 1.2024474143981934, "step": 418 }, { "epoch": 2.8547008547008548, "grad_norm": 0.8661652207374573, "learning_rate": 1.490359842650324e-07, "loss": 1.2498114109039307, "step": 419 }, { "epoch": 2.8615384615384616, "grad_norm": 0.8592333197593689, "learning_rate": 1.3638696597277678e-07, "loss": 1.2100580930709839, "step": 420 }, { "epoch": 2.8683760683760684, "grad_norm": 0.8594926595687866, "learning_rate": 1.2429510926314835e-07, "loss": 1.1787865161895752, "step": 421 }, { "epoch": 2.875213675213675, "grad_norm": 0.8879026174545288, "learning_rate": 1.1276109716355288e-07, "loss": 1.2315534353256226, "step": 422 }, { "epoch": 2.882051282051282, "grad_norm": 0.8497971892356873, "learning_rate": 1.0178558119067316e-07, "loss": 1.2027359008789062, "step": 423 }, { "epoch": 2.888888888888889, "grad_norm": 0.8838421106338501, "learning_rate": 9.136918131366412e-08, "loss": 1.2284358739852905, "step": 424 }, { "epoch": 2.8957264957264957, "grad_norm": 0.8940805196762085, "learning_rate": 8.151248591913519e-08, "loss": 1.2018911838531494, "step": 425 }, { "epoch": 2.9025641025641025, "grad_norm": 0.8463784456253052, "learning_rate": 7.22160517779169e-08, "loss": 1.2137906551361084, "step": 426 }, { "epoch": 2.9094017094017093, "grad_norm": 0.8508373498916626, "learning_rate": 6.348040401360833e-08, "loss": 1.2048455476760864, "step": 427 }, { "epoch": 2.916239316239316, "grad_norm": 0.8702911138534546, "learning_rate": 5.530603607290852e-08, "loss": 1.216880202293396, "step": 428 }, { "epoch": 2.9230769230769234, "grad_norm": 0.8441773653030396, "learning_rate": 4.7693409697756596e-08, "loss": 1.2449169158935547, "step": 429 }, { "epoch": 2.92991452991453, "grad_norm": 0.8643396496772766, "learning_rate": 4.0642954899238196e-08, "loss": 1.2154898643493652, "step": 430 }, { "epoch": 2.936752136752137, "grad_norm": 0.8390621542930603, "learning_rate": 3.4155069933301535e-08, "loss": 1.1894207000732422, "step": 431 }, { "epoch": 2.943589743589744, "grad_norm": 0.8889386057853699, "learning_rate": 2.823012127825764e-08, "loss": 1.2449326515197754, "step": 432 }, { "epoch": 2.9504273504273506, "grad_norm": 0.8431465029716492, "learning_rate": 2.2868443614082468e-08, "loss": 1.1918964385986328, "step": 433 }, { "epoch": 2.9572649572649574, "grad_norm": 0.859993577003479, "learning_rate": 1.8070339803509805e-08, "loss": 1.1882524490356445, "step": 434 }, { "epoch": 2.9641025641025642, "grad_norm": 0.8584935069084167, "learning_rate": 1.383608087492605e-08, "loss": 1.2315739393234253, "step": 435 }, { "epoch": 2.970940170940171, "grad_norm": 0.8648282289505005, "learning_rate": 1.0165906007056914e-08, "loss": 1.235274314880371, "step": 436 }, { "epoch": 2.977777777777778, "grad_norm": 0.8602524399757385, "learning_rate": 7.060022515460452e-09, "loss": 1.1928036212921143, "step": 437 }, { "epoch": 2.9846153846153847, "grad_norm": 0.8722023367881775, "learning_rate": 4.5186058408153156e-09, "loss": 1.2146607637405396, "step": 438 }, { "epoch": 2.9914529914529915, "grad_norm": 0.8878926038742065, "learning_rate": 2.5417995390086824e-09, "loss": 1.1910994052886963, "step": 439 }, { "epoch": 2.9982905982905983, "grad_norm": 0.8773415088653564, "learning_rate": 1.129715273033849e-09, "loss": 1.1811952590942383, "step": 440 }, { "epoch": 3.0, "grad_norm": 1.841178059577942, "learning_rate": 2.8243280667306084e-10, "loss": 1.14687180519104, "step": 441 }, { "epoch": 3.0, "step": 441, "total_flos": 5.3379040973665075e+17, "train_loss": 1.3470534840408637, "train_runtime": 3006.8003, "train_samples_per_second": 18.676, "train_steps_per_second": 0.147 } ], "logging_steps": 1.0, "max_steps": 441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.3379040973665075e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }