{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030807147258163892, "grad_norm": 0.5412173867225647, "learning_rate": 9.756097560975611e-06, "loss": 1.1518, "step": 5 }, { "epoch": 0.0061614294516327784, "grad_norm": 0.4266219735145569, "learning_rate": 2.1951219512195124e-05, "loss": 1.2475, "step": 10 }, { "epoch": 0.009242144177449169, "grad_norm": 0.28109025955200195, "learning_rate": 3.414634146341464e-05, "loss": 1.2543, "step": 15 }, { "epoch": 0.012322858903265557, "grad_norm": 0.4827917814254761, "learning_rate": 4.634146341463415e-05, "loss": 1.2007, "step": 20 }, { "epoch": 0.015403573629081947, "grad_norm": 1.1757270097732544, "learning_rate": 5.853658536585366e-05, "loss": 1.0335, "step": 25 }, { "epoch": 0.018484288354898338, "grad_norm": 0.5378533601760864, "learning_rate": 7.073170731707317e-05, "loss": 1.0887, "step": 30 }, { "epoch": 0.021565003080714726, "grad_norm": 0.32407647371292114, "learning_rate": 8.292682926829268e-05, "loss": 1.1421, "step": 35 }, { "epoch": 0.024645717806531114, "grad_norm": 0.310761034488678, "learning_rate": 9.51219512195122e-05, "loss": 1.1532, "step": 40 }, { "epoch": 0.027726432532347505, "grad_norm": 0.770730197429657, "learning_rate": 0.00010731707317073172, "loss": 1.1098, "step": 45 }, { "epoch": 0.030807147258163893, "grad_norm": 0.7011957764625549, "learning_rate": 0.00011951219512195122, "loss": 0.9678, "step": 50 }, { "epoch": 0.033887861983980284, "grad_norm": 0.37764063477516174, "learning_rate": 0.00013170731707317076, "loss": 1.0811, "step": 55 }, { "epoch": 0.036968576709796676, "grad_norm": 0.36831024289131165, "learning_rate": 0.00014390243902439025, "loss": 1.1093, "step": 60 }, { "epoch": 0.04004929143561306, "grad_norm": 0.26355060935020447, "learning_rate": 0.00015609756097560978, "loss": 1.0733, "step": 65 }, { "epoch": 0.04313000616142945, "grad_norm": 0.656220018863678, "learning_rate": 0.00016829268292682927, "loss": 1.1062, "step": 70 }, { "epoch": 0.04621072088724584, "grad_norm": 0.7080119252204895, "learning_rate": 0.0001804878048780488, "loss": 0.9404, "step": 75 }, { "epoch": 0.04929143561306223, "grad_norm": 0.3640540838241577, "learning_rate": 0.0001926829268292683, "loss": 0.9997, "step": 80 }, { "epoch": 0.05237215033887862, "grad_norm": 0.327878475189209, "learning_rate": 0.0001999991687649223, "loss": 1.038, "step": 85 }, { "epoch": 0.05545286506469501, "grad_norm": 0.2545090317726135, "learning_rate": 0.00019998981752900036, "loss": 1.1127, "step": 90 }, { "epoch": 0.0585335797905114, "grad_norm": 0.729415237903595, "learning_rate": 0.00019997007698817557, "loss": 1.1212, "step": 95 }, { "epoch": 0.061614294516327786, "grad_norm": 0.5619292259216309, "learning_rate": 0.00019993994919356167, "loss": 0.8807, "step": 100 }, { "epoch": 0.06469500924214418, "grad_norm": 0.7847259640693665, "learning_rate": 0.00019989943727554598, "loss": 0.9791, "step": 105 }, { "epoch": 0.06777572396796057, "grad_norm": 0.2865511178970337, "learning_rate": 0.00019984854544346367, "loss": 1.0313, "step": 110 }, { "epoch": 0.07085643869377696, "grad_norm": 0.27198705077171326, "learning_rate": 0.00019978727898516086, "loss": 1.073, "step": 115 }, { "epoch": 0.07393715341959335, "grad_norm": 0.7417078018188477, "learning_rate": 0.0001997156442664449, "loss": 0.9766, "step": 120 }, { "epoch": 0.07701786814540973, "grad_norm": 0.5739508271217346, "learning_rate": 0.00019963364873042298, "loss": 0.8606, "step": 125 }, { "epoch": 0.08009858287122612, "grad_norm": 0.28192320466041565, "learning_rate": 0.0001995413008967289, "loss": 1.0665, "step": 130 }, { "epoch": 0.08317929759704251, "grad_norm": 0.28555136919021606, "learning_rate": 0.00019943861036063768, "loss": 1.0262, "step": 135 }, { "epoch": 0.0862600123228589, "grad_norm": 0.2717398405075073, "learning_rate": 0.00019932558779206874, "loss": 1.0675, "step": 140 }, { "epoch": 0.0893407270486753, "grad_norm": 0.6025347113609314, "learning_rate": 0.00019920224493447702, "loss": 1.069, "step": 145 }, { "epoch": 0.09242144177449169, "grad_norm": 0.5110194683074951, "learning_rate": 0.00019906859460363307, "loss": 0.8611, "step": 150 }, { "epoch": 0.09550215650030808, "grad_norm": 0.3318012058734894, "learning_rate": 0.00019892465068629131, "loss": 0.9906, "step": 155 }, { "epoch": 0.09858287122612445, "grad_norm": 0.25904572010040283, "learning_rate": 0.0001987704281387471, "loss": 1.1415, "step": 160 }, { "epoch": 0.10166358595194085, "grad_norm": 0.24928759038448334, "learning_rate": 0.00019860594298528282, "loss": 1.1192, "step": 165 }, { "epoch": 0.10474430067775724, "grad_norm": 0.724487841129303, "learning_rate": 0.0001984312123165028, "loss": 1.1348, "step": 170 }, { "epoch": 0.10782501540357363, "grad_norm": 0.4745875597000122, "learning_rate": 0.0001982462542875576, "loss": 0.8411, "step": 175 }, { "epoch": 0.11090573012939002, "grad_norm": 0.2991827726364136, "learning_rate": 0.00019805108811625773, "loss": 1.0013, "step": 180 }, { "epoch": 0.11398644485520641, "grad_norm": 0.2740406394004822, "learning_rate": 0.00019784573408107657, "loss": 0.9905, "step": 185 }, { "epoch": 0.1170671595810228, "grad_norm": 0.2254003882408142, "learning_rate": 0.00019763021351904358, "loss": 0.9899, "step": 190 }, { "epoch": 0.12014787430683918, "grad_norm": 0.4834403991699219, "learning_rate": 0.00019740454882352732, "loss": 1.0253, "step": 195 }, { "epoch": 0.12322858903265557, "grad_norm": 0.5589045882225037, "learning_rate": 0.0001971687634419086, "loss": 0.8803, "step": 200 }, { "epoch": 0.12630930375847196, "grad_norm": 0.36558014154434204, "learning_rate": 0.0001969228818731442, "loss": 0.9944, "step": 205 }, { "epoch": 0.12939001848428835, "grad_norm": 0.2908712327480316, "learning_rate": 0.00019666692966522145, "loss": 1.0408, "step": 210 }, { "epoch": 0.13247073321010475, "grad_norm": 0.260887086391449, "learning_rate": 0.00019640093341250357, "loss": 0.9837, "step": 215 }, { "epoch": 0.13555144793592114, "grad_norm": 0.5799285769462585, "learning_rate": 0.0001961249207529665, "loss": 1.0473, "step": 220 }, { "epoch": 0.13863216266173753, "grad_norm": 0.6132811903953552, "learning_rate": 0.00019583892036532726, "loss": 0.872, "step": 225 }, { "epoch": 0.14171287738755392, "grad_norm": 0.28405216336250305, "learning_rate": 0.00019554296196606395, "loss": 0.9982, "step": 230 }, { "epoch": 0.1447935921133703, "grad_norm": 0.2680262327194214, "learning_rate": 0.00019523707630632835, "loss": 0.9808, "step": 235 }, { "epoch": 0.1478743068391867, "grad_norm": 0.22932539880275726, "learning_rate": 0.00019492129516875055, "loss": 1.045, "step": 240 }, { "epoch": 0.15095502156500307, "grad_norm": 0.5834919810295105, "learning_rate": 0.00019459565136413666, "loss": 1.0869, "step": 245 }, { "epoch": 0.15403573629081946, "grad_norm": 0.5460705161094666, "learning_rate": 0.0001942601787280598, "loss": 0.8503, "step": 250 }, { "epoch": 0.15711645101663585, "grad_norm": 0.3478216230869293, "learning_rate": 0.00019391491211734425, "loss": 0.9952, "step": 255 }, { "epoch": 0.16019716574245224, "grad_norm": 0.2494744062423706, "learning_rate": 0.0001935598874064438, "loss": 0.9793, "step": 260 }, { "epoch": 0.16327788046826863, "grad_norm": 0.23242679238319397, "learning_rate": 0.00019319514148371435, "loss": 0.9427, "step": 265 }, { "epoch": 0.16635859519408502, "grad_norm": 0.5289666056632996, "learning_rate": 0.00019282071224758091, "loss": 1.0259, "step": 270 }, { "epoch": 0.16943930991990142, "grad_norm": 0.5127719640731812, "learning_rate": 0.00019243663860259993, "loss": 0.8559, "step": 275 }, { "epoch": 0.1725200246457178, "grad_norm": 0.3006535768508911, "learning_rate": 0.00019204296045541685, "loss": 0.9995, "step": 280 }, { "epoch": 0.1756007393715342, "grad_norm": 0.2744085192680359, "learning_rate": 0.0001916397187106199, "loss": 0.9409, "step": 285 }, { "epoch": 0.1786814540973506, "grad_norm": 0.2169232964515686, "learning_rate": 0.00019122695526648968, "loss": 0.9847, "step": 290 }, { "epoch": 0.18176216882316698, "grad_norm": 0.5880517959594727, "learning_rate": 0.00019080471301064598, "loss": 1.0982, "step": 295 }, { "epoch": 0.18484288354898337, "grad_norm": 0.5316351056098938, "learning_rate": 0.00019037303581559143, "loss": 0.8299, "step": 300 }, { "epoch": 0.18792359827479976, "grad_norm": 0.31043657660484314, "learning_rate": 0.00018993196853415317, "loss": 0.9737, "step": 305 }, { "epoch": 0.19100431300061615, "grad_norm": 0.3109757602214813, "learning_rate": 0.00018948155699482244, "loss": 0.9551, "step": 310 }, { "epoch": 0.19408502772643252, "grad_norm": 0.24996116757392883, "learning_rate": 0.00018902184799699263, "loss": 1.057, "step": 315 }, { "epoch": 0.1971657424522489, "grad_norm": 0.4340403974056244, "learning_rate": 0.00018855288930609692, "loss": 1.0065, "step": 320 }, { "epoch": 0.2002464571780653, "grad_norm": 0.47125184535980225, "learning_rate": 0.00018807472964864515, "loss": 0.8492, "step": 325 }, { "epoch": 0.2033271719038817, "grad_norm": 0.335504949092865, "learning_rate": 0.00018758741870716092, "loss": 1.0248, "step": 330 }, { "epoch": 0.20640788662969808, "grad_norm": 0.24601280689239502, "learning_rate": 0.00018709100711501955, "loss": 1.0095, "step": 335 }, { "epoch": 0.20948860135551448, "grad_norm": 0.2395162731409073, "learning_rate": 0.0001865855464511869, "loss": 0.9469, "step": 340 }, { "epoch": 0.21256931608133087, "grad_norm": 0.4389413893222809, "learning_rate": 0.00018607108923486025, "loss": 0.8772, "step": 345 }, { "epoch": 0.21565003080714726, "grad_norm": 0.5434815287590027, "learning_rate": 0.00018554768892001136, "loss": 0.8309, "step": 350 }, { "epoch": 0.21873074553296365, "grad_norm": 0.3031887114048004, "learning_rate": 0.00018501539988983234, "loss": 0.8526, "step": 355 }, { "epoch": 0.22181146025878004, "grad_norm": 0.2740094065666199, "learning_rate": 0.0001844742774510851, "loss": 0.9808, "step": 360 }, { "epoch": 0.22489217498459643, "grad_norm": 0.20324410498142242, "learning_rate": 0.00018392437782835475, "loss": 0.9952, "step": 365 }, { "epoch": 0.22797288971041282, "grad_norm": 0.5303752422332764, "learning_rate": 0.00018336575815820766, "loss": 1.011, "step": 370 }, { "epoch": 0.23105360443622922, "grad_norm": 0.5991610884666443, "learning_rate": 0.00018279847648325478, "loss": 0.8487, "step": 375 }, { "epoch": 0.2341343191620456, "grad_norm": 0.3495015501976013, "learning_rate": 0.0001822225917461208, "loss": 0.9038, "step": 380 }, { "epoch": 0.23721503388786197, "grad_norm": 0.35118335485458374, "learning_rate": 0.0001816381637833198, "loss": 0.9601, "step": 385 }, { "epoch": 0.24029574861367836, "grad_norm": 0.2514159679412842, "learning_rate": 0.00018104525331903799, "loss": 1.0495, "step": 390 }, { "epoch": 0.24337646333949475, "grad_norm": 0.5237391591072083, "learning_rate": 0.00018044392195882427, "loss": 1.0792, "step": 395 }, { "epoch": 0.24645717806531114, "grad_norm": 0.5198184251785278, "learning_rate": 0.00017983423218318918, "loss": 0.8639, "step": 400 }, { "epoch": 0.24953789279112754, "grad_norm": 0.31769484281539917, "learning_rate": 0.00017921624734111292, "loss": 0.9426, "step": 405 }, { "epoch": 0.2526186075169439, "grad_norm": 0.27527257800102234, "learning_rate": 0.00017859003164346336, "loss": 0.9744, "step": 410 }, { "epoch": 0.2556993222427603, "grad_norm": 0.22283877432346344, "learning_rate": 0.0001779556501563239, "loss": 0.9612, "step": 415 }, { "epoch": 0.2587800369685767, "grad_norm": 0.4636116623878479, "learning_rate": 0.00017731316879423327, "loss": 1.034, "step": 420 }, { "epoch": 0.2618607516943931, "grad_norm": 0.47125929594039917, "learning_rate": 0.00017666265431333654, "loss": 0.8632, "step": 425 }, { "epoch": 0.2649414664202095, "grad_norm": 0.28012779355049133, "learning_rate": 0.000176004174304449, "loss": 0.9842, "step": 430 }, { "epoch": 0.2680221811460259, "grad_norm": 0.2671544551849365, "learning_rate": 0.00017533779718603313, "loss": 0.9874, "step": 435 }, { "epoch": 0.2711028958718423, "grad_norm": 0.22700628638267517, "learning_rate": 0.00017466359219708985, "loss": 0.9457, "step": 440 }, { "epoch": 0.27418361059765867, "grad_norm": 0.41735100746154785, "learning_rate": 0.00017398162938996422, "loss": 0.9501, "step": 445 }, { "epoch": 0.27726432532347506, "grad_norm": 0.37333425879478455, "learning_rate": 0.00017329197962306664, "loss": 0.8123, "step": 450 }, { "epoch": 0.28034504004929145, "grad_norm": 0.2996601164340973, "learning_rate": 0.00017259471455351072, "loss": 0.9048, "step": 455 }, { "epoch": 0.28342575477510784, "grad_norm": 0.25390616059303284, "learning_rate": 0.0001718899066296675, "loss": 0.9711, "step": 460 }, { "epoch": 0.28650646950092423, "grad_norm": 0.25309455394744873, "learning_rate": 0.000171177629083638, "loss": 0.9762, "step": 465 }, { "epoch": 0.2895871842267406, "grad_norm": 0.46593979001045227, "learning_rate": 0.0001704579559236441, "loss": 1.0148, "step": 470 }, { "epoch": 0.292667898952557, "grad_norm": 0.5357691645622253, "learning_rate": 0.00016973096192633884, "loss": 0.786, "step": 475 }, { "epoch": 0.2957486136783734, "grad_norm": 0.3320036828517914, "learning_rate": 0.00016899672262903677, "loss": 0.9034, "step": 480 }, { "epoch": 0.2988293284041898, "grad_norm": 0.2919875681400299, "learning_rate": 0.00016825531432186543, "loss": 0.9694, "step": 485 }, { "epoch": 0.30191004313000613, "grad_norm": 0.2110517919063568, "learning_rate": 0.00016750681403983846, "loss": 1.0684, "step": 490 }, { "epoch": 0.3049907578558225, "grad_norm": 0.8895492553710938, "learning_rate": 0.00016675129955485152, "loss": 0.9935, "step": 495 }, { "epoch": 0.3080714725816389, "grad_norm": 0.4201313555240631, "learning_rate": 0.00016598884936760131, "loss": 0.8232, "step": 500 }, { "epoch": 0.3111521873074553, "grad_norm": 0.2772030234336853, "learning_rate": 0.00016521954269942918, "loss": 0.989, "step": 505 }, { "epoch": 0.3142329020332717, "grad_norm": 0.24938176572322845, "learning_rate": 0.00016444345948408984, "loss": 0.9521, "step": 510 }, { "epoch": 0.3173136167590881, "grad_norm": 0.23586159944534302, "learning_rate": 0.0001636606803594457, "loss": 1.0013, "step": 515 }, { "epoch": 0.3203943314849045, "grad_norm": 0.6195285320281982, "learning_rate": 0.0001628712866590885, "loss": 0.9773, "step": 520 }, { "epoch": 0.3234750462107209, "grad_norm": 0.5146563053131104, "learning_rate": 0.00016207536040388845, "loss": 0.8414, "step": 525 }, { "epoch": 0.32655576093653726, "grad_norm": 0.30340778827667236, "learning_rate": 0.0001612729842934718, "loss": 0.9793, "step": 530 }, { "epoch": 0.32963647566235366, "grad_norm": 0.2670872211456299, "learning_rate": 0.00016046424169762827, "loss": 1.0042, "step": 535 }, { "epoch": 0.33271719038817005, "grad_norm": 0.2140674591064453, "learning_rate": 0.0001596492166476485, "loss": 1.0067, "step": 540 }, { "epoch": 0.33579790511398644, "grad_norm": 0.42335960268974304, "learning_rate": 0.0001588279938275929, "loss": 0.9971, "step": 545 }, { "epoch": 0.33887861983980283, "grad_norm": 0.5136492252349854, "learning_rate": 0.00015800065856549269, "loss": 0.7794, "step": 550 }, { "epoch": 0.3419593345656192, "grad_norm": 0.28528350591659546, "learning_rate": 0.00015716729682448393, "loss": 0.9553, "step": 555 }, { "epoch": 0.3450400492914356, "grad_norm": 0.23635320365428925, "learning_rate": 0.0001563279951938758, "loss": 0.9601, "step": 560 }, { "epoch": 0.348120764017252, "grad_norm": 0.21445675194263458, "learning_rate": 0.00015548284088015354, "loss": 1.0177, "step": 565 }, { "epoch": 0.3512014787430684, "grad_norm": 0.494815856218338, "learning_rate": 0.00015463192169791741, "loss": 0.9958, "step": 570 }, { "epoch": 0.3542821934688848, "grad_norm": 0.4995960295200348, "learning_rate": 0.0001537753260607584, "loss": 0.8352, "step": 575 }, { "epoch": 0.3573629081947012, "grad_norm": 0.2728192210197449, "learning_rate": 0.00015291314297207175, "loss": 0.9472, "step": 580 }, { "epoch": 0.36044362292051757, "grad_norm": 0.21085520088672638, "learning_rate": 0.0001520454620158093, "loss": 0.9853, "step": 585 }, { "epoch": 0.36352433764633396, "grad_norm": 0.21773026883602142, "learning_rate": 0.00015117237334717117, "loss": 0.9141, "step": 590 }, { "epoch": 0.36660505237215035, "grad_norm": 0.4270155131816864, "learning_rate": 0.00015029396768323846, "loss": 1.0516, "step": 595 }, { "epoch": 0.36968576709796674, "grad_norm": 0.444807767868042, "learning_rate": 0.00014941033629354734, "loss": 0.8681, "step": 600 }, { "epoch": 0.37276648182378314, "grad_norm": 0.29151108860969543, "learning_rate": 0.00014852157099060596, "loss": 0.9942, "step": 605 }, { "epoch": 0.3758471965495995, "grad_norm": 0.2614330053329468, "learning_rate": 0.00014762776412035456, "loss": 1.0202, "step": 610 }, { "epoch": 0.3789279112754159, "grad_norm": 0.22319279611110687, "learning_rate": 0.00014672900855257056, "loss": 0.9941, "step": 615 }, { "epoch": 0.3820086260012323, "grad_norm": 0.5060321688652039, "learning_rate": 0.00014582539767121904, "loss": 0.9866, "step": 620 }, { "epoch": 0.3850893407270487, "grad_norm": 0.46248483657836914, "learning_rate": 0.0001449170253647498, "loss": 0.741, "step": 625 }, { "epoch": 0.38817005545286504, "grad_norm": 0.31994864344596863, "learning_rate": 0.0001440039860163419, "loss": 0.9465, "step": 630 }, { "epoch": 0.39125077017868143, "grad_norm": 0.2810644209384918, "learning_rate": 0.00014308637449409706, "loss": 0.9403, "step": 635 }, { "epoch": 0.3943314849044978, "grad_norm": 0.22498297691345215, "learning_rate": 0.00014216428614118243, "loss": 1.0146, "step": 640 }, { "epoch": 0.3974121996303142, "grad_norm": 0.4635995924472809, "learning_rate": 0.00014123781676592418, "loss": 0.9778, "step": 645 }, { "epoch": 0.4004929143561306, "grad_norm": 0.4592895805835724, "learning_rate": 0.00014030706263185247, "loss": 0.8311, "step": 650 }, { "epoch": 0.403573629081947, "grad_norm": 0.30809083580970764, "learning_rate": 0.00013937212044769955, "loss": 0.9141, "step": 655 }, { "epoch": 0.4066543438077634, "grad_norm": 0.32495659589767456, "learning_rate": 0.0001384330873573513, "loss": 0.9867, "step": 660 }, { "epoch": 0.4097350585335798, "grad_norm": 0.22196783125400543, "learning_rate": 0.00013749006092975347, "loss": 1.0004, "step": 665 }, { "epoch": 0.41281577325939617, "grad_norm": 0.5153515934944153, "learning_rate": 0.00013654313914877414, "loss": 0.9771, "step": 670 }, { "epoch": 0.41589648798521256, "grad_norm": 0.43011271953582764, "learning_rate": 0.00013559242040302272, "loss": 0.7806, "step": 675 }, { "epoch": 0.41897720271102895, "grad_norm": 0.2914900779724121, "learning_rate": 0.00013463800347562706, "loss": 0.9531, "step": 680 }, { "epoch": 0.42205791743684534, "grad_norm": 0.27283361554145813, "learning_rate": 0.00013367998753396944, "loss": 0.8862, "step": 685 }, { "epoch": 0.42513863216266173, "grad_norm": 0.22260256111621857, "learning_rate": 0.00013271847211938285, "loss": 0.978, "step": 690 }, { "epoch": 0.4282193468884781, "grad_norm": 0.49871596693992615, "learning_rate": 0.0001317535571368082, "loss": 1.0035, "step": 695 }, { "epoch": 0.4313000616142945, "grad_norm": 0.4641573429107666, "learning_rate": 0.00013078534284441382, "loss": 0.8737, "step": 700 }, { "epoch": 0.4343807763401109, "grad_norm": 0.2758205235004425, "learning_rate": 0.00012981392984317834, "loss": 0.9117, "step": 705 }, { "epoch": 0.4374614910659273, "grad_norm": 0.266637921333313, "learning_rate": 0.00012883941906643786, "loss": 0.9657, "step": 710 }, { "epoch": 0.4405422057917437, "grad_norm": 0.23960596323013306, "learning_rate": 0.00012786191176939848, "loss": 0.9081, "step": 715 }, { "epoch": 0.4436229205175601, "grad_norm": 0.4470706880092621, "learning_rate": 0.00012688150951861582, "loss": 0.9299, "step": 720 }, { "epoch": 0.4467036352433765, "grad_norm": 0.5093996524810791, "learning_rate": 0.00012589831418144154, "loss": 0.8259, "step": 725 }, { "epoch": 0.44978434996919286, "grad_norm": 0.2514059841632843, "learning_rate": 0.00012491242791543922, "loss": 0.9407, "step": 730 }, { "epoch": 0.45286506469500926, "grad_norm": 0.25785940885543823, "learning_rate": 0.00012392395315776963, "loss": 0.9092, "step": 735 }, { "epoch": 0.45594577942082565, "grad_norm": 0.2544153034687042, "learning_rate": 0.00012293299261454725, "loss": 0.9285, "step": 740 }, { "epoch": 0.45902649414664204, "grad_norm": 0.4648970663547516, "learning_rate": 0.00012193964925016872, "loss": 0.9379, "step": 745 }, { "epoch": 0.46210720887245843, "grad_norm": 0.5296097993850708, "learning_rate": 0.00012094402627661447, "loss": 0.7754, "step": 750 }, { "epoch": 0.4651879235982748, "grad_norm": 0.2834993898868561, "learning_rate": 0.00011994622714272448, "loss": 0.9358, "step": 755 }, { "epoch": 0.4682686383240912, "grad_norm": 0.329428493976593, "learning_rate": 0.00011894635552344975, "loss": 0.9574, "step": 760 }, { "epoch": 0.4713493530499076, "grad_norm": 0.20489312708377838, "learning_rate": 0.00011794451530908011, "loss": 0.9345, "step": 765 }, { "epoch": 0.47443006777572394, "grad_norm": 0.5085513591766357, "learning_rate": 0.00011694081059444946, "loss": 0.9837, "step": 770 }, { "epoch": 0.47751078250154033, "grad_norm": 0.4909701645374298, "learning_rate": 0.0001159353456681201, "loss": 0.816, "step": 775 }, { "epoch": 0.4805914972273567, "grad_norm": 0.28249093890190125, "learning_rate": 0.00011492822500154667, "loss": 0.9001, "step": 780 }, { "epoch": 0.4836722119531731, "grad_norm": 0.24982449412345886, "learning_rate": 0.00011391955323822126, "loss": 0.8926, "step": 785 }, { "epoch": 0.4867529266789895, "grad_norm": 0.22139038145542145, "learning_rate": 0.00011290943518280057, "loss": 1.0207, "step": 790 }, { "epoch": 0.4898336414048059, "grad_norm": 0.4508483111858368, "learning_rate": 0.0001118979757902162, "loss": 0.9285, "step": 795 }, { "epoch": 0.4929143561306223, "grad_norm": 0.4797590672969818, "learning_rate": 0.00011088528015476964, "loss": 0.8541, "step": 800 }, { "epoch": 0.4959950708564387, "grad_norm": 0.25662368535995483, "learning_rate": 0.00010987145349921251, "loss": 0.9033, "step": 805 }, { "epoch": 0.49907578558225507, "grad_norm": 0.26000267267227173, "learning_rate": 0.0001088566011638134, "loss": 0.9413, "step": 810 }, { "epoch": 0.5021565003080715, "grad_norm": 0.21961303055286407, "learning_rate": 0.00010784082859541292, "loss": 0.9315, "step": 815 }, { "epoch": 0.5052372150338879, "grad_norm": 0.4173499047756195, "learning_rate": 0.0001068242413364671, "loss": 0.9527, "step": 820 }, { "epoch": 0.5083179297597042, "grad_norm": 0.4871540069580078, "learning_rate": 0.00010580694501408138, "loss": 0.8284, "step": 825 }, { "epoch": 0.5113986444855206, "grad_norm": 0.28846967220306396, "learning_rate": 0.00010478904532903535, "loss": 0.8648, "step": 830 }, { "epoch": 0.514479359211337, "grad_norm": 0.24096441268920898, "learning_rate": 0.00010377064804480025, "loss": 1.0178, "step": 835 }, { "epoch": 0.5175600739371534, "grad_norm": 0.2081213891506195, "learning_rate": 0.00010275185897654971, "loss": 0.8944, "step": 840 }, { "epoch": 0.5206407886629698, "grad_norm": 0.46252796053886414, "learning_rate": 0.00010173278398016501, "loss": 0.922, "step": 845 }, { "epoch": 0.5237215033887862, "grad_norm": 0.44197043776512146, "learning_rate": 0.00010071352894123654, "loss": 0.7921, "step": 850 }, { "epoch": 0.5268022181146026, "grad_norm": 0.3035351037979126, "learning_rate": 9.969419976406165e-05, "loss": 0.9301, "step": 855 }, { "epoch": 0.529882932840419, "grad_norm": 0.2718713879585266, "learning_rate": 9.867490236064108e-05, "loss": 0.9367, "step": 860 }, { "epoch": 0.5329636475662354, "grad_norm": 0.25972554087638855, "learning_rate": 9.765574263967396e-05, "loss": 1.0116, "step": 865 }, { "epoch": 0.5360443622920518, "grad_norm": 0.3816847503185272, "learning_rate": 9.66368264955539e-05, "loss": 0.915, "step": 870 }, { "epoch": 0.5391250770178682, "grad_norm": 0.6426383852958679, "learning_rate": 9.56182597973658e-05, "loss": 0.8123, "step": 875 }, { "epoch": 0.5422057917436846, "grad_norm": 0.24743957817554474, "learning_rate": 9.460014837788605e-05, "loss": 0.9215, "step": 880 }, { "epoch": 0.5452865064695009, "grad_norm": 0.24885661900043488, "learning_rate": 9.358259802258581e-05, "loss": 0.9195, "step": 885 }, { "epoch": 0.5483672211953173, "grad_norm": 0.21944816410541534, "learning_rate": 9.256571445863972e-05, "loss": 0.9105, "step": 890 }, { "epoch": 0.5514479359211337, "grad_norm": 0.4631386399269104, "learning_rate": 9.154960334394027e-05, "loss": 0.965, "step": 895 }, { "epoch": 0.5545286506469501, "grad_norm": 0.47829023003578186, "learning_rate": 9.053437025611973e-05, "loss": 0.7986, "step": 900 }, { "epoch": 0.5576093653727665, "grad_norm": 0.29296985268592834, "learning_rate": 8.952012068158027e-05, "loss": 0.9545, "step": 905 }, { "epoch": 0.5606900800985829, "grad_norm": 0.23259030282497406, "learning_rate": 8.850696000453326e-05, "loss": 0.9846, "step": 910 }, { "epoch": 0.5637707948243993, "grad_norm": 0.2143285572528839, "learning_rate": 8.749499349604993e-05, "loss": 0.9375, "step": 915 }, { "epoch": 0.5668515095502157, "grad_norm": 0.4443269670009613, "learning_rate": 8.64843263031228e-05, "loss": 0.8851, "step": 920 }, { "epoch": 0.5699322242760321, "grad_norm": 0.49172407388687134, "learning_rate": 8.547506343774097e-05, "loss": 0.7475, "step": 925 }, { "epoch": 0.5730129390018485, "grad_norm": 0.3034185469150543, "learning_rate": 8.446730976597878e-05, "loss": 1.0023, "step": 930 }, { "epoch": 0.5760936537276649, "grad_norm": 0.27486246824264526, "learning_rate": 8.346116999709975e-05, "loss": 0.9047, "step": 935 }, { "epoch": 0.5791743684534812, "grad_norm": 0.2196229249238968, "learning_rate": 8.245674867267724e-05, "loss": 0.9262, "step": 940 }, { "epoch": 0.5822550831792976, "grad_norm": 0.4424618184566498, "learning_rate": 8.145415015573183e-05, "loss": 0.9537, "step": 945 }, { "epoch": 0.585335797905114, "grad_norm": 0.4673041105270386, "learning_rate": 8.045347861988789e-05, "loss": 0.7926, "step": 950 }, { "epoch": 0.5884165126309304, "grad_norm": 0.320578396320343, "learning_rate": 7.945483803854936e-05, "loss": 0.9144, "step": 955 }, { "epoch": 0.5914972273567468, "grad_norm": 0.2610718905925751, "learning_rate": 7.845833217409675e-05, "loss": 1.0055, "step": 960 }, { "epoch": 0.5945779420825632, "grad_norm": 0.20770247280597687, "learning_rate": 7.746406456710564e-05, "loss": 0.9012, "step": 965 }, { "epoch": 0.5976586568083796, "grad_norm": 0.43519556522369385, "learning_rate": 7.64721385255886e-05, "loss": 0.9128, "step": 970 }, { "epoch": 0.600739371534196, "grad_norm": 0.495310515165329, "learning_rate": 7.548265711426104e-05, "loss": 0.7712, "step": 975 }, { "epoch": 0.6038200862600123, "grad_norm": 0.2972421646118164, "learning_rate": 7.449572314383237e-05, "loss": 0.9942, "step": 980 }, { "epoch": 0.6069008009858287, "grad_norm": 0.25806179642677307, "learning_rate": 7.351143916032374e-05, "loss": 0.9889, "step": 985 }, { "epoch": 0.609981515711645, "grad_norm": 0.22174964845180511, "learning_rate": 7.252990743441293e-05, "loss": 0.9398, "step": 990 }, { "epoch": 0.6130622304374614, "grad_norm": 0.5080896019935608, "learning_rate": 7.155122995080827e-05, "loss": 1.0196, "step": 995 }, { "epoch": 0.6161429451632778, "grad_norm": 0.5036611557006836, "learning_rate": 7.057550839765188e-05, "loss": 0.803, "step": 1000 }, { "epoch": 0.6192236598890942, "grad_norm": 0.3295843303203583, "learning_rate": 6.960284415595407e-05, "loss": 0.9066, "step": 1005 }, { "epoch": 0.6223043746149106, "grad_norm": 0.24868136644363403, "learning_rate": 6.863333828905929e-05, "loss": 1.0486, "step": 1010 }, { "epoch": 0.625385089340727, "grad_norm": 0.2273484766483307, "learning_rate": 6.766709153214542e-05, "loss": 0.9454, "step": 1015 }, { "epoch": 0.6284658040665434, "grad_norm": 0.4086507558822632, "learning_rate": 6.670420428175705e-05, "loss": 0.9561, "step": 1020 }, { "epoch": 0.6315465187923598, "grad_norm": 0.4194948077201843, "learning_rate": 6.574477658537375e-05, "loss": 0.7882, "step": 1025 }, { "epoch": 0.6346272335181762, "grad_norm": 0.3146796226501465, "learning_rate": 6.4788908131015e-05, "loss": 0.8443, "step": 1030 }, { "epoch": 0.6377079482439926, "grad_norm": 0.24021373689174652, "learning_rate": 6.38366982368819e-05, "loss": 0.8491, "step": 1035 }, { "epoch": 0.640788662969809, "grad_norm": 0.21864767372608185, "learning_rate": 6.288824584103816e-05, "loss": 0.9222, "step": 1040 }, { "epoch": 0.6438693776956254, "grad_norm": 0.6802520155906677, "learning_rate": 6.194364949112953e-05, "loss": 0.9085, "step": 1045 }, { "epoch": 0.6469500924214417, "grad_norm": 0.49471819400787354, "learning_rate": 6.100300733414474e-05, "loss": 0.8007, "step": 1050 }, { "epoch": 0.6500308071472581, "grad_norm": 0.2820141017436981, "learning_rate": 6.0066417106217455e-05, "loss": 0.8945, "step": 1055 }, { "epoch": 0.6531115218730745, "grad_norm": 0.24521881341934204, "learning_rate": 5.9133976122471214e-05, "loss": 0.9188, "step": 1060 }, { "epoch": 0.6561922365988909, "grad_norm": 0.21457761526107788, "learning_rate": 5.82057812669081e-05, "loss": 0.9509, "step": 1065 }, { "epoch": 0.6592729513247073, "grad_norm": 0.3352929651737213, "learning_rate": 5.728192898234195e-05, "loss": 0.851, "step": 1070 }, { "epoch": 0.6623536660505237, "grad_norm": 0.48959973454475403, "learning_rate": 5.6362515260377835e-05, "loss": 0.7561, "step": 1075 }, { "epoch": 0.6654343807763401, "grad_norm": 0.27886924147605896, "learning_rate": 5.544763563143793e-05, "loss": 0.9267, "step": 1080 }, { "epoch": 0.6685150955021565, "grad_norm": 0.2557944059371948, "learning_rate": 5.4537385154835864e-05, "loss": 0.9299, "step": 1085 }, { "epoch": 0.6715958102279729, "grad_norm": 0.2110741287469864, "learning_rate": 5.363185840889935e-05, "loss": 0.8646, "step": 1090 }, { "epoch": 0.6746765249537893, "grad_norm": 0.44468095898628235, "learning_rate": 5.273114948114346e-05, "loss": 0.9427, "step": 1095 }, { "epoch": 0.6777572396796057, "grad_norm": 0.45007064938545227, "learning_rate": 5.1835351958494515e-05, "loss": 0.7519, "step": 1100 }, { "epoch": 0.680837954405422, "grad_norm": 0.2795059084892273, "learning_rate": 5.094455891756587e-05, "loss": 0.9132, "step": 1105 }, { "epoch": 0.6839186691312384, "grad_norm": 0.24423202872276306, "learning_rate": 5.00588629149872e-05, "loss": 0.9795, "step": 1110 }, { "epoch": 0.6869993838570548, "grad_norm": 0.2284388393163681, "learning_rate": 4.91783559777873e-05, "loss": 0.905, "step": 1115 }, { "epoch": 0.6900800985828712, "grad_norm": 0.49596309661865234, "learning_rate": 4.830312959383238e-05, "loss": 0.909, "step": 1120 }, { "epoch": 0.6931608133086876, "grad_norm": 0.41242459416389465, "learning_rate": 4.7433274702319815e-05, "loss": 0.7293, "step": 1125 }, { "epoch": 0.696241528034504, "grad_norm": 0.2715208828449249, "learning_rate": 4.656888168432962e-05, "loss": 0.8847, "step": 1130 }, { "epoch": 0.6993222427603204, "grad_norm": 0.25266537070274353, "learning_rate": 4.571004035343315e-05, "loss": 0.9697, "step": 1135 }, { "epoch": 0.7024029574861368, "grad_norm": 0.2048375904560089, "learning_rate": 4.485683994636144e-05, "loss": 0.8963, "step": 1140 }, { "epoch": 0.7054836722119532, "grad_norm": 0.44298356771469116, "learning_rate": 4.400936911373308e-05, "loss": 0.9756, "step": 1145 }, { "epoch": 0.7085643869377696, "grad_norm": 0.4284767508506775, "learning_rate": 4.3167715910842966e-05, "loss": 0.7932, "step": 1150 }, { "epoch": 0.711645101663586, "grad_norm": 0.29664915800094604, "learning_rate": 4.2331967788513295e-05, "loss": 0.9168, "step": 1155 }, { "epoch": 0.7147258163894024, "grad_norm": 0.24990494549274445, "learning_rate": 4.1502211584006836e-05, "loss": 0.9272, "step": 1160 }, { "epoch": 0.7178065311152187, "grad_norm": 0.20492446422576904, "learning_rate": 4.067853351200446e-05, "loss": 0.9724, "step": 1165 }, { "epoch": 0.7208872458410351, "grad_norm": 0.38028204441070557, "learning_rate": 3.986101915564695e-05, "loss": 0.9153, "step": 1170 }, { "epoch": 0.7239679605668515, "grad_norm": 0.5438628196716309, "learning_rate": 3.904975345764262e-05, "loss": 0.7897, "step": 1175 }, { "epoch": 0.7270486752926679, "grad_norm": 0.28773704171180725, "learning_rate": 3.824482071144163e-05, "loss": 0.931, "step": 1180 }, { "epoch": 0.7301293900184843, "grad_norm": 0.27623042464256287, "learning_rate": 3.744630455247739e-05, "loss": 0.905, "step": 1185 }, { "epoch": 0.7332101047443007, "grad_norm": 0.20309558510780334, "learning_rate": 3.6654287949476626e-05, "loss": 0.927, "step": 1190 }, { "epoch": 0.7362908194701171, "grad_norm": 0.40813419222831726, "learning_rate": 3.586885319583858e-05, "loss": 0.9488, "step": 1195 }, { "epoch": 0.7393715341959335, "grad_norm": 0.5010459423065186, "learning_rate": 3.5090081901084525e-05, "loss": 0.8075, "step": 1200 }, { "epoch": 0.7424522489217499, "grad_norm": 0.30515843629837036, "learning_rate": 3.431805498237808e-05, "loss": 0.9658, "step": 1205 }, { "epoch": 0.7455329636475663, "grad_norm": 0.24745185673236847, "learning_rate": 3.355285265611784e-05, "loss": 0.953, "step": 1210 }, { "epoch": 0.7486136783733827, "grad_norm": 0.20221352577209473, "learning_rate": 3.279455442960238e-05, "loss": 0.9542, "step": 1215 }, { "epoch": 0.751694393099199, "grad_norm": 0.4061656594276428, "learning_rate": 3.204323909276924e-05, "loss": 0.9838, "step": 1220 }, { "epoch": 0.7547751078250154, "grad_norm": 0.39202672243118286, "learning_rate": 3.1298984710008484e-05, "loss": 0.7694, "step": 1225 }, { "epoch": 0.7578558225508318, "grad_norm": 0.34482622146606445, "learning_rate": 3.056186861205136e-05, "loss": 0.8751, "step": 1230 }, { "epoch": 0.7609365372766482, "grad_norm": 0.24178501963615417, "learning_rate": 2.9831967387935467e-05, "loss": 0.9526, "step": 1235 }, { "epoch": 0.7640172520024646, "grad_norm": 0.2215549796819687, "learning_rate": 2.9109356877046712e-05, "loss": 0.8726, "step": 1240 }, { "epoch": 0.767097966728281, "grad_norm": 0.45621606707572937, "learning_rate": 2.8394112161239605e-05, "loss": 0.943, "step": 1245 }, { "epoch": 0.7701786814540974, "grad_norm": 0.47603940963745117, "learning_rate": 2.7686307557035685e-05, "loss": 0.7294, "step": 1250 }, { "epoch": 0.7732593961799138, "grad_norm": 0.2534734308719635, "learning_rate": 2.6986016607901908e-05, "loss": 0.8862, "step": 1255 }, { "epoch": 0.7763401109057301, "grad_norm": 0.26066556572914124, "learning_rate": 2.629331207660931e-05, "loss": 0.9054, "step": 1260 }, { "epoch": 0.7794208256315465, "grad_norm": 0.2252478003501892, "learning_rate": 2.5608265937672436e-05, "loss": 0.8883, "step": 1265 }, { "epoch": 0.7825015403573629, "grad_norm": 0.4677968919277191, "learning_rate": 2.4930949369871203e-05, "loss": 0.9571, "step": 1270 }, { "epoch": 0.7855822550831792, "grad_norm": 0.48786357045173645, "learning_rate": 2.426143274885493e-05, "loss": 0.7375, "step": 1275 }, { "epoch": 0.7886629698089956, "grad_norm": 0.3174852430820465, "learning_rate": 2.359978563983022e-05, "loss": 0.8827, "step": 1280 }, { "epoch": 0.791743684534812, "grad_norm": 0.23994563519954681, "learning_rate": 2.2946076790332827e-05, "loss": 0.8892, "step": 1285 }, { "epoch": 0.7948243992606284, "grad_norm": 0.21942304074764252, "learning_rate": 2.2300374123084522e-05, "loss": 0.8561, "step": 1290 }, { "epoch": 0.7979051139864448, "grad_norm": 0.5058274865150452, "learning_rate": 2.166274472893567e-05, "loss": 0.9178, "step": 1295 }, { "epoch": 0.8009858287122612, "grad_norm": 0.43461018800735474, "learning_rate": 2.1033254859894226e-05, "loss": 0.7465, "step": 1300 }, { "epoch": 0.8040665434380776, "grad_norm": 0.26012492179870605, "learning_rate": 2.041196992224206e-05, "loss": 0.8865, "step": 1305 }, { "epoch": 0.807147258163894, "grad_norm": 0.2593576908111572, "learning_rate": 1.9798954469738762e-05, "loss": 0.8778, "step": 1310 }, { "epoch": 0.8102279728897104, "grad_norm": 0.21213385462760925, "learning_rate": 1.919427219691453e-05, "loss": 0.9287, "step": 1315 }, { "epoch": 0.8133086876155268, "grad_norm": 0.4115472435951233, "learning_rate": 1.8597985932451856e-05, "loss": 0.8981, "step": 1320 }, { "epoch": 0.8163894023413432, "grad_norm": 0.4511643648147583, "learning_rate": 1.8010157632657543e-05, "loss": 0.7387, "step": 1325 }, { "epoch": 0.8194701170671596, "grad_norm": 0.2990265488624573, "learning_rate": 1.7430848375025176e-05, "loss": 0.9106, "step": 1330 }, { "epoch": 0.822550831792976, "grad_norm": 0.28121325373649597, "learning_rate": 1.686011835188891e-05, "loss": 0.9232, "step": 1335 }, { "epoch": 0.8256315465187923, "grad_norm": 0.19917987287044525, "learning_rate": 1.6298026864169335e-05, "loss": 0.9458, "step": 1340 }, { "epoch": 0.8287122612446087, "grad_norm": 0.4129948914051056, "learning_rate": 1.5744632315211815e-05, "loss": 0.9359, "step": 1345 }, { "epoch": 0.8317929759704251, "grad_norm": 0.47105616331100464, "learning_rate": 1.5199992204718294e-05, "loss": 0.7866, "step": 1350 }, { "epoch": 0.8348736906962415, "grad_norm": 0.2840569317340851, "learning_rate": 1.4664163122772689e-05, "loss": 0.9127, "step": 1355 }, { "epoch": 0.8379544054220579, "grad_norm": 0.27385058999061584, "learning_rate": 1.4137200743961188e-05, "loss": 0.9092, "step": 1360 }, { "epoch": 0.8410351201478743, "grad_norm": 0.21615581214427948, "learning_rate": 1.3619159821587235e-05, "loss": 0.9071, "step": 1365 }, { "epoch": 0.8441158348736907, "grad_norm": 0.4315554201602936, "learning_rate": 1.3110094181982657e-05, "loss": 0.901, "step": 1370 }, { "epoch": 0.8471965495995071, "grad_norm": 0.4600566029548645, "learning_rate": 1.261005671891482e-05, "loss": 0.7692, "step": 1375 }, { "epoch": 0.8502772643253235, "grad_norm": 0.27289214730262756, "learning_rate": 1.2119099388090716e-05, "loss": 0.9479, "step": 1380 }, { "epoch": 0.8533579790511399, "grad_norm": 0.26042231917381287, "learning_rate": 1.1637273201758748e-05, "loss": 0.8972, "step": 1385 }, { "epoch": 0.8564386937769563, "grad_norm": 0.21819062530994415, "learning_rate": 1.1164628223408168e-05, "loss": 0.8494, "step": 1390 }, { "epoch": 0.8595194085027726, "grad_norm": 0.5444476008415222, "learning_rate": 1.0701213562567492e-05, "loss": 0.9043, "step": 1395 }, { "epoch": 0.862600123228589, "grad_norm": 0.5517734289169312, "learning_rate": 1.0247077369701653e-05, "loss": 0.7521, "step": 1400 }, { "epoch": 0.8656808379544054, "grad_norm": 0.27313733100891113, "learning_rate": 9.802266831209206e-06, "loss": 0.8408, "step": 1405 }, { "epoch": 0.8687615526802218, "grad_norm": 0.23924760520458221, "learning_rate": 9.366828164519258e-06, "loss": 0.8577, "step": 1410 }, { "epoch": 0.8718422674060382, "grad_norm": 0.2202882021665573, "learning_rate": 8.940806613289498e-06, "loss": 0.9402, "step": 1415 }, { "epoch": 0.8749229821318546, "grad_norm": 0.4714129865169525, "learning_rate": 8.524246442705153e-06, "loss": 0.8714, "step": 1420 }, { "epoch": 0.878003696857671, "grad_norm": 0.5381476283073425, "learning_rate": 8.117190934879593e-06, "loss": 0.7554, "step": 1425 }, { "epoch": 0.8810844115834874, "grad_norm": 0.30594927072525024, "learning_rate": 7.719682384357308e-06, "loss": 0.9058, "step": 1430 }, { "epoch": 0.8841651263093038, "grad_norm": 0.28632140159606934, "learning_rate": 7.33176209371923e-06, "loss": 0.9048, "step": 1435 }, { "epoch": 0.8872458410351202, "grad_norm": 0.23244526982307434, "learning_rate": 6.953470369291348e-06, "loss": 0.9097, "step": 1440 }, { "epoch": 0.8903265557609366, "grad_norm": 0.4457685351371765, "learning_rate": 6.5848465169566e-06, "loss": 0.9375, "step": 1445 }, { "epoch": 0.893407270486753, "grad_norm": 0.4593055546283722, "learning_rate": 6.225928838071016e-06, "loss": 0.7327, "step": 1450 }, { "epoch": 0.8964879852125693, "grad_norm": 0.3131862282752991, "learning_rate": 5.876754625483904e-06, "loss": 0.829, "step": 1455 }, { "epoch": 0.8995686999383857, "grad_norm": 0.23787960410118103, "learning_rate": 5.537360159663108e-06, "loss": 0.893, "step": 1460 }, { "epoch": 0.9026494146642021, "grad_norm": 0.22966954112052917, "learning_rate": 5.207780704925314e-06, "loss": 0.8752, "step": 1465 }, { "epoch": 0.9057301293900185, "grad_norm": 0.43406957387924194, "learning_rate": 4.888050505771868e-06, "loss": 0.9341, "step": 1470 }, { "epoch": 0.9088108441158349, "grad_norm": 0.451045960187912, "learning_rate": 4.578202783330799e-06, "loss": 0.7766, "step": 1475 }, { "epoch": 0.9118915588416513, "grad_norm": 0.28430673480033875, "learning_rate": 4.2782697319048605e-06, "loss": 0.8861, "step": 1480 }, { "epoch": 0.9149722735674677, "grad_norm": 0.24296101927757263, "learning_rate": 3.988282515626585e-06, "loss": 0.8434, "step": 1485 }, { "epoch": 0.9180529882932841, "grad_norm": 0.2274406999349594, "learning_rate": 3.7082712652200867e-06, "loss": 0.8912, "step": 1490 }, { "epoch": 0.9211337030191005, "grad_norm": 0.45456749200820923, "learning_rate": 3.438265074870417e-06, "loss": 0.9744, "step": 1495 }, { "epoch": 0.9242144177449169, "grad_norm": 0.5641310811042786, "learning_rate": 3.1782919992006333e-06, "loss": 0.7479, "step": 1500 }, { "epoch": 0.9272951324707333, "grad_norm": 0.2640092670917511, "learning_rate": 2.9283790503567222e-06, "loss": 0.9081, "step": 1505 }, { "epoch": 0.9303758471965496, "grad_norm": 0.24997375905513763, "learning_rate": 2.6885521952010105e-06, "loss": 0.9355, "step": 1510 }, { "epoch": 0.933456561922366, "grad_norm": 0.2296486645936966, "learning_rate": 2.458836352614069e-06, "loss": 0.8545, "step": 1515 }, { "epoch": 0.9365372766481824, "grad_norm": 0.4982737898826599, "learning_rate": 2.239255390905581e-06, "loss": 0.9361, "step": 1520 }, { "epoch": 0.9396179913739988, "grad_norm": 0.5252935290336609, "learning_rate": 2.029832125334319e-06, "loss": 0.7706, "step": 1525 }, { "epoch": 0.9426987060998152, "grad_norm": 0.271879106760025, "learning_rate": 1.8305883157375804e-06, "loss": 0.842, "step": 1530 }, { "epoch": 0.9457794208256316, "grad_norm": 0.24854250252246857, "learning_rate": 1.6415446642702337e-06, "loss": 0.9651, "step": 1535 }, { "epoch": 0.9488601355514479, "grad_norm": 0.21977169811725616, "learning_rate": 1.462720813253682e-06, "loss": 0.902, "step": 1540 }, { "epoch": 0.9519408502772643, "grad_norm": 0.5121393203735352, "learning_rate": 1.2941353431350056e-06, "loss": 0.9256, "step": 1545 }, { "epoch": 0.9550215650030807, "grad_norm": 0.5130953788757324, "learning_rate": 1.135805770556364e-06, "loss": 0.7639, "step": 1550 }, { "epoch": 0.958102279728897, "grad_norm": 0.27663251757621765, "learning_rate": 9.877485465349058e-07, "loss": 0.931, "step": 1555 }, { "epoch": 0.9611829944547134, "grad_norm": 0.24967016279697418, "learning_rate": 8.499790547535025e-07, "loss": 0.8409, "step": 1560 }, { "epoch": 0.9642637091805298, "grad_norm": 0.2005191147327423, "learning_rate": 7.225116099623286e-07, "loss": 0.867, "step": 1565 }, { "epoch": 0.9673444239063462, "grad_norm": 0.43106022477149963, "learning_rate": 6.053594564914611e-07, "loss": 0.9427, "step": 1570 }, { "epoch": 0.9704251386321626, "grad_norm": 0.4947551190853119, "learning_rate": 4.985347668747809e-07, "loss": 0.7485, "step": 1575 }, { "epoch": 0.973505853357979, "grad_norm": 0.26899272203445435, "learning_rate": 4.0204864058522864e-07, "loss": 0.9249, "step": 1580 }, { "epoch": 0.9765865680837954, "grad_norm": 0.20328934490680695, "learning_rate": 3.15911102881461e-07, "loss": 0.9969, "step": 1585 }, { "epoch": 0.9796672828096118, "grad_norm": 0.22392931580543518, "learning_rate": 2.40131103766239e-07, "loss": 0.8852, "step": 1590 }, { "epoch": 0.9827479975354282, "grad_norm": 0.4746832251548767, "learning_rate": 1.747165170564724e-07, "loss": 0.9672, "step": 1595 }, { "epoch": 0.9858287122612446, "grad_norm": 0.45710742473602295, "learning_rate": 1.1967413956510686e-07, "loss": 0.7987, "step": 1600 }, { "epoch": 0.988909426987061, "grad_norm": 0.3375983238220215, "learning_rate": 7.500969039491157e-08, "loss": 0.8614, "step": 1605 }, { "epoch": 0.9919901417128774, "grad_norm": 0.2787322998046875, "learning_rate": 4.0727810344254325e-08, "loss": 0.9483, "step": 1610 }, { "epoch": 0.9950708564386938, "grad_norm": 0.2125030755996704, "learning_rate": 1.6832061424865153e-08, "loss": 0.884, "step": 1615 }, { "epoch": 0.9981515711645101, "grad_norm": 0.39220768213272095, "learning_rate": 3.3249264917878387e-09, "loss": 0.8332, "step": 1620 }, { "epoch": 1.0, "step": 1623, "total_flos": 889480391426048.0, "train_loss": 0.9303844255645997, "train_runtime": 18614.4314, "train_samples_per_second": 2.79, "train_steps_per_second": 0.087 } ], "logging_steps": 5, "max_steps": 1623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 889480391426048.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }