{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030807147258163892, "grad_norm": 1.9506609439849854, "learning_rate": 9.756097560975611e-06, "loss": 1.0043, "step": 5 }, { "epoch": 0.0061614294516327784, "grad_norm": 1.3290749788284302, "learning_rate": 2.1951219512195124e-05, "loss": 1.0881, "step": 10 }, { "epoch": 0.009242144177449169, "grad_norm": 1.0360522270202637, "learning_rate": 3.414634146341464e-05, "loss": 1.1051, "step": 15 }, { "epoch": 0.012322858903265557, "grad_norm": 1.3245832920074463, "learning_rate": 4.634146341463415e-05, "loss": 0.9605, "step": 20 }, { "epoch": 0.015403573629081947, "grad_norm": 3.1614928245544434, "learning_rate": 5.853658536585366e-05, "loss": 0.8698, "step": 25 }, { "epoch": 0.018484288354898338, "grad_norm": 1.4028220176696777, "learning_rate": 7.073170731707317e-05, "loss": 0.8835, "step": 30 }, { "epoch": 0.021565003080714726, "grad_norm": 0.9983139634132385, "learning_rate": 8.292682926829268e-05, "loss": 0.9123, "step": 35 }, { "epoch": 0.024645717806531114, "grad_norm": 1.0538513660430908, "learning_rate": 9.51219512195122e-05, "loss": 0.885, "step": 40 }, { "epoch": 0.027726432532347505, "grad_norm": 1.131333827972412, "learning_rate": 0.00010731707317073172, "loss": 0.8446, "step": 45 }, { "epoch": 0.030807147258163893, "grad_norm": 2.132134199142456, "learning_rate": 0.00011951219512195122, "loss": 0.7956, "step": 50 }, { "epoch": 0.033887861983980284, "grad_norm": 1.0456063747406006, "learning_rate": 0.00013170731707317076, "loss": 0.8816, "step": 55 }, { "epoch": 0.036968576709796676, "grad_norm": 0.8690502643585205, "learning_rate": 0.00014390243902439025, "loss": 0.8685, "step": 60 }, { "epoch": 0.04004929143561306, "grad_norm": 1.0242969989776611, "learning_rate": 0.00015609756097560978, "loss": 0.8768, "step": 65 }, { "epoch": 0.04313000616142945, "grad_norm": 1.328539490699768, "learning_rate": 0.00016829268292682927, "loss": 0.9351, "step": 70 }, { "epoch": 0.04621072088724584, "grad_norm": 1.3852390050888062, "learning_rate": 0.0001804878048780488, "loss": 0.786, "step": 75 }, { "epoch": 0.04929143561306223, "grad_norm": 0.9410791397094727, "learning_rate": 0.0001926829268292683, "loss": 0.8309, "step": 80 }, { "epoch": 0.05237215033887862, "grad_norm": 0.8410763144493103, "learning_rate": 0.0001999991687649223, "loss": 0.9027, "step": 85 }, { "epoch": 0.05545286506469501, "grad_norm": 0.915637195110321, "learning_rate": 0.00019998981752900036, "loss": 0.9057, "step": 90 }, { "epoch": 0.0585335797905114, "grad_norm": 1.2468522787094116, "learning_rate": 0.00019997007698817557, "loss": 0.9095, "step": 95 }, { "epoch": 0.061614294516327786, "grad_norm": 1.3756437301635742, "learning_rate": 0.00019993994919356167, "loss": 0.8025, "step": 100 }, { "epoch": 0.06469500924214418, "grad_norm": 0.8679229021072388, "learning_rate": 0.00019989943727554598, "loss": 0.8418, "step": 105 }, { "epoch": 0.06777572396796057, "grad_norm": 0.8622983694076538, "learning_rate": 0.00019984854544346367, "loss": 0.8821, "step": 110 }, { "epoch": 0.07085643869377696, "grad_norm": 0.8575367331504822, "learning_rate": 0.00019978727898516086, "loss": 0.9431, "step": 115 }, { "epoch": 0.07393715341959335, "grad_norm": 1.2082788944244385, "learning_rate": 0.0001997156442664449, "loss": 0.8149, "step": 120 }, { "epoch": 0.07701786814540973, "grad_norm": 1.507377028465271, "learning_rate": 0.00019963364873042298, "loss": 0.7926, "step": 125 }, { "epoch": 0.08009858287122612, "grad_norm": 0.8779996037483215, "learning_rate": 0.0001995413008967289, "loss": 0.923, "step": 130 }, { "epoch": 0.08317929759704251, "grad_norm": 0.8468291163444519, "learning_rate": 0.00019943861036063768, "loss": 0.8893, "step": 135 }, { "epoch": 0.0862600123228589, "grad_norm": 0.7504271864891052, "learning_rate": 0.00019932558779206874, "loss": 0.8932, "step": 140 }, { "epoch": 0.0893407270486753, "grad_norm": 1.057389736175537, "learning_rate": 0.00019920224493447702, "loss": 0.8907, "step": 145 }, { "epoch": 0.09242144177449169, "grad_norm": 1.2131892442703247, "learning_rate": 0.00019906859460363307, "loss": 0.7649, "step": 150 }, { "epoch": 0.09550215650030808, "grad_norm": 0.7876495718955994, "learning_rate": 0.00019892465068629131, "loss": 0.8601, "step": 155 }, { "epoch": 0.09858287122612445, "grad_norm": 0.7372773885726929, "learning_rate": 0.0001987704281387471, "loss": 0.9682, "step": 160 }, { "epoch": 0.10166358595194085, "grad_norm": 0.9224637150764465, "learning_rate": 0.00019860594298528282, "loss": 0.8882, "step": 165 }, { "epoch": 0.10474430067775724, "grad_norm": 1.195654034614563, "learning_rate": 0.0001984312123165028, "loss": 0.9287, "step": 170 }, { "epoch": 0.10782501540357363, "grad_norm": 1.0784906148910522, "learning_rate": 0.0001982462542875576, "loss": 0.7491, "step": 175 }, { "epoch": 0.11090573012939002, "grad_norm": 0.6587386131286621, "learning_rate": 0.00019805108811625773, "loss": 0.8577, "step": 180 }, { "epoch": 0.11398644485520641, "grad_norm": 0.699715256690979, "learning_rate": 0.00019784573408107657, "loss": 0.8966, "step": 185 }, { "epoch": 0.1170671595810228, "grad_norm": 0.9360162615776062, "learning_rate": 0.00019763021351904358, "loss": 0.8773, "step": 190 }, { "epoch": 0.12014787430683918, "grad_norm": 1.123854160308838, "learning_rate": 0.00019740454882352732, "loss": 0.8704, "step": 195 }, { "epoch": 0.12322858903265557, "grad_norm": 1.2856158018112183, "learning_rate": 0.0001971687634419086, "loss": 0.7832, "step": 200 }, { "epoch": 0.12630930375847196, "grad_norm": 0.7385960221290588, "learning_rate": 0.0001969228818731442, "loss": 0.8582, "step": 205 }, { "epoch": 0.12939001848428835, "grad_norm": 0.6473488211631775, "learning_rate": 0.00019666692966522145, "loss": 0.8792, "step": 210 }, { "epoch": 0.13247073321010475, "grad_norm": 0.746126115322113, "learning_rate": 0.00019640093341250357, "loss": 0.8736, "step": 215 }, { "epoch": 0.13555144793592114, "grad_norm": 1.0111432075500488, "learning_rate": 0.0001961249207529665, "loss": 0.8853, "step": 220 }, { "epoch": 0.13863216266173753, "grad_norm": 1.2317752838134766, "learning_rate": 0.00019583892036532726, "loss": 0.7865, "step": 225 }, { "epoch": 0.14171287738755392, "grad_norm": 0.6940191388130188, "learning_rate": 0.00019554296196606395, "loss": 0.8703, "step": 230 }, { "epoch": 0.1447935921133703, "grad_norm": 0.7127991318702698, "learning_rate": 0.00019523707630632835, "loss": 0.8262, "step": 235 }, { "epoch": 0.1478743068391867, "grad_norm": 0.6165663003921509, "learning_rate": 0.00019492129516875055, "loss": 0.9039, "step": 240 }, { "epoch": 0.15095502156500307, "grad_norm": 1.2010482549667358, "learning_rate": 0.00019459565136413666, "loss": 0.9394, "step": 245 }, { "epoch": 0.15403573629081946, "grad_norm": 1.4223132133483887, "learning_rate": 0.0001942601787280598, "loss": 0.7718, "step": 250 }, { "epoch": 0.15711645101663585, "grad_norm": 0.8233282566070557, "learning_rate": 0.00019391491211734425, "loss": 0.8706, "step": 255 }, { "epoch": 0.16019716574245224, "grad_norm": 0.6466130018234253, "learning_rate": 0.0001935598874064438, "loss": 0.8579, "step": 260 }, { "epoch": 0.16327788046826863, "grad_norm": 0.7207822799682617, "learning_rate": 0.00019319514148371435, "loss": 0.7989, "step": 265 }, { "epoch": 0.16635859519408502, "grad_norm": 1.073905110359192, "learning_rate": 0.00019282071224758091, "loss": 0.8333, "step": 270 }, { "epoch": 0.16943930991990142, "grad_norm": 1.2006767988204956, "learning_rate": 0.00019243663860259993, "loss": 0.7993, "step": 275 }, { "epoch": 0.1725200246457178, "grad_norm": 0.6434801816940308, "learning_rate": 0.00019204296045541685, "loss": 0.8851, "step": 280 }, { "epoch": 0.1756007393715342, "grad_norm": 0.6209431290626526, "learning_rate": 0.0001916397187106199, "loss": 0.8257, "step": 285 }, { "epoch": 0.1786814540973506, "grad_norm": 0.5881760120391846, "learning_rate": 0.00019122695526648968, "loss": 0.8571, "step": 290 }, { "epoch": 0.18176216882316698, "grad_norm": 1.2488080263137817, "learning_rate": 0.00019080471301064598, "loss": 0.895, "step": 295 }, { "epoch": 0.18484288354898337, "grad_norm": 1.179758071899414, "learning_rate": 0.00019037303581559143, "loss": 0.7412, "step": 300 }, { "epoch": 0.18792359827479976, "grad_norm": 0.6266173720359802, "learning_rate": 0.00018993196853415317, "loss": 0.8424, "step": 305 }, { "epoch": 0.19100431300061615, "grad_norm": 0.781024694442749, "learning_rate": 0.00018948155699482244, "loss": 0.818, "step": 310 }, { "epoch": 0.19408502772643252, "grad_norm": 0.857995331287384, "learning_rate": 0.00018902184799699263, "loss": 0.9148, "step": 315 }, { "epoch": 0.1971657424522489, "grad_norm": 0.7764838933944702, "learning_rate": 0.00018855288930609692, "loss": 0.7969, "step": 320 }, { "epoch": 0.2002464571780653, "grad_norm": 0.9209476113319397, "learning_rate": 0.00018807472964864515, "loss": 0.7768, "step": 325 }, { "epoch": 0.2033271719038817, "grad_norm": 0.6861506104469299, "learning_rate": 0.00018758741870716092, "loss": 0.8849, "step": 330 }, { "epoch": 0.20640788662969808, "grad_norm": 0.5947197675704956, "learning_rate": 0.00018709100711501955, "loss": 0.8672, "step": 335 }, { "epoch": 0.20948860135551448, "grad_norm": 0.6780321002006531, "learning_rate": 0.0001865855464511869, "loss": 0.7919, "step": 340 }, { "epoch": 0.21256931608133087, "grad_norm": 1.1705702543258667, "learning_rate": 0.00018607108923486025, "loss": 0.7502, "step": 345 }, { "epoch": 0.21565003080714726, "grad_norm": 1.004110336303711, "learning_rate": 0.00018554768892001136, "loss": 0.7369, "step": 350 }, { "epoch": 0.21873074553296365, "grad_norm": 0.6980533003807068, "learning_rate": 0.00018501539988983234, "loss": 0.7703, "step": 355 }, { "epoch": 0.22181146025878004, "grad_norm": 0.6246406435966492, "learning_rate": 0.0001844742774510851, "loss": 0.8441, "step": 360 }, { "epoch": 0.22489217498459643, "grad_norm": 0.5528222322463989, "learning_rate": 0.00018392437782835475, "loss": 0.8385, "step": 365 }, { "epoch": 0.22797288971041282, "grad_norm": 0.9303919076919556, "learning_rate": 0.00018336575815820766, "loss": 0.8384, "step": 370 }, { "epoch": 0.23105360443622922, "grad_norm": 1.3066872358322144, "learning_rate": 0.00018279847648325478, "loss": 0.7767, "step": 375 }, { "epoch": 0.2341343191620456, "grad_norm": 0.7670502662658691, "learning_rate": 0.0001822225917461208, "loss": 0.8032, "step": 380 }, { "epoch": 0.23721503388786197, "grad_norm": 0.7942723631858826, "learning_rate": 0.0001816381637833198, "loss": 0.8288, "step": 385 }, { "epoch": 0.24029574861367836, "grad_norm": 0.6688534617424011, "learning_rate": 0.00018104525331903799, "loss": 0.8631, "step": 390 }, { "epoch": 0.24337646333949475, "grad_norm": 0.9976694583892822, "learning_rate": 0.00018044392195882427, "loss": 0.9414, "step": 395 }, { "epoch": 0.24645717806531114, "grad_norm": 1.142532229423523, "learning_rate": 0.00017983423218318918, "loss": 0.7797, "step": 400 }, { "epoch": 0.24953789279112754, "grad_norm": 0.6933532953262329, "learning_rate": 0.00017921624734111292, "loss": 0.8074, "step": 405 }, { "epoch": 0.2526186075169439, "grad_norm": 0.7068799138069153, "learning_rate": 0.00017859003164346336, "loss": 0.9038, "step": 410 }, { "epoch": 0.2556993222427603, "grad_norm": 0.7113360166549683, "learning_rate": 0.0001779556501563239, "loss": 0.8077, "step": 415 }, { "epoch": 0.2587800369685767, "grad_norm": 1.0284174680709839, "learning_rate": 0.00017731316879423327, "loss": 0.9117, "step": 420 }, { "epoch": 0.2618607516943931, "grad_norm": 1.0717129707336426, "learning_rate": 0.00017666265431333654, "loss": 0.8241, "step": 425 }, { "epoch": 0.2649414664202095, "grad_norm": 0.6098302006721497, "learning_rate": 0.000176004174304449, "loss": 0.8526, "step": 430 }, { "epoch": 0.2680221811460259, "grad_norm": 0.6730021834373474, "learning_rate": 0.00017533779718603313, "loss": 0.8473, "step": 435 }, { "epoch": 0.2711028958718423, "grad_norm": 0.627232551574707, "learning_rate": 0.00017466359219708985, "loss": 0.7787, "step": 440 }, { "epoch": 0.27418361059765867, "grad_norm": 0.7510082125663757, "learning_rate": 0.00017398162938996422, "loss": 0.7869, "step": 445 }, { "epoch": 0.27726432532347506, "grad_norm": 0.800914466381073, "learning_rate": 0.00017329197962306664, "loss": 0.7307, "step": 450 }, { "epoch": 0.28034504004929145, "grad_norm": 0.688615083694458, "learning_rate": 0.00017259471455351072, "loss": 0.8078, "step": 455 }, { "epoch": 0.28342575477510784, "grad_norm": 0.6369125247001648, "learning_rate": 0.0001718899066296675, "loss": 0.827, "step": 460 }, { "epoch": 0.28650646950092423, "grad_norm": 0.7632527351379395, "learning_rate": 0.000171177629083638, "loss": 0.8171, "step": 465 }, { "epoch": 0.2895871842267406, "grad_norm": 0.8901572227478027, "learning_rate": 0.0001704579559236441, "loss": 0.8534, "step": 470 }, { "epoch": 0.292667898952557, "grad_norm": 1.2739449739456177, "learning_rate": 0.00016973096192633884, "loss": 0.718, "step": 475 }, { "epoch": 0.2957486136783734, "grad_norm": 0.655415952205658, "learning_rate": 0.00016899672262903677, "loss": 0.7889, "step": 480 }, { "epoch": 0.2988293284041898, "grad_norm": 0.6401821970939636, "learning_rate": 0.00016825531432186543, "loss": 0.9173, "step": 485 }, { "epoch": 0.30191004313000613, "grad_norm": 0.5191354155540466, "learning_rate": 0.00016750681403983846, "loss": 0.931, "step": 490 }, { "epoch": 0.3049907578558225, "grad_norm": 0.8743457794189453, "learning_rate": 0.00016675129955485152, "loss": 0.8111, "step": 495 }, { "epoch": 0.3080714725816389, "grad_norm": 1.0666691064834595, "learning_rate": 0.00016598884936760131, "loss": 0.736, "step": 500 }, { "epoch": 0.3111521873074553, "grad_norm": 0.606925368309021, "learning_rate": 0.00016521954269942918, "loss": 0.8742, "step": 505 }, { "epoch": 0.3142329020332717, "grad_norm": 0.5690024495124817, "learning_rate": 0.00016444345948408984, "loss": 0.8288, "step": 510 }, { "epoch": 0.3173136167590881, "grad_norm": 0.7045788168907166, "learning_rate": 0.0001636606803594457, "loss": 0.8422, "step": 515 }, { "epoch": 0.3203943314849045, "grad_norm": 0.9720426201820374, "learning_rate": 0.0001628712866590885, "loss": 0.8012, "step": 520 }, { "epoch": 0.3234750462107209, "grad_norm": 1.1168466806411743, "learning_rate": 0.00016207536040388845, "loss": 0.7338, "step": 525 }, { "epoch": 0.32655576093653726, "grad_norm": 0.6451901197433472, "learning_rate": 0.0001612729842934718, "loss": 0.8471, "step": 530 }, { "epoch": 0.32963647566235366, "grad_norm": 0.7933263778686523, "learning_rate": 0.00016046424169762827, "loss": 0.8995, "step": 535 }, { "epoch": 0.33271719038817005, "grad_norm": 0.6123836636543274, "learning_rate": 0.0001596492166476485, "loss": 0.8341, "step": 540 }, { "epoch": 0.33579790511398644, "grad_norm": 0.8381984233856201, "learning_rate": 0.0001588279938275929, "loss": 0.8531, "step": 545 }, { "epoch": 0.33887861983980283, "grad_norm": 1.155434250831604, "learning_rate": 0.00015800065856549269, "loss": 0.6915, "step": 550 }, { "epoch": 0.3419593345656192, "grad_norm": 0.634437084197998, "learning_rate": 0.00015716729682448393, "loss": 0.8241, "step": 555 }, { "epoch": 0.3450400492914356, "grad_norm": 0.6219022274017334, "learning_rate": 0.0001563279951938758, "loss": 0.8461, "step": 560 }, { "epoch": 0.348120764017252, "grad_norm": 0.4845116138458252, "learning_rate": 0.00015548284088015354, "loss": 0.8311, "step": 565 }, { "epoch": 0.3512014787430684, "grad_norm": 0.9370896816253662, "learning_rate": 0.00015463192169791741, "loss": 0.8278, "step": 570 }, { "epoch": 0.3542821934688848, "grad_norm": 1.1633367538452148, "learning_rate": 0.0001537753260607584, "loss": 0.7536, "step": 575 }, { "epoch": 0.3573629081947012, "grad_norm": 0.5799803733825684, "learning_rate": 0.00015291314297207175, "loss": 0.7999, "step": 580 }, { "epoch": 0.36044362292051757, "grad_norm": 0.47321441769599915, "learning_rate": 0.0001520454620158093, "loss": 0.8864, "step": 585 }, { "epoch": 0.36352433764633396, "grad_norm": 0.5589901804924011, "learning_rate": 0.00015117237334717117, "loss": 0.7921, "step": 590 }, { "epoch": 0.36660505237215035, "grad_norm": 0.9199019074440002, "learning_rate": 0.00015029396768323846, "loss": 0.8999, "step": 595 }, { "epoch": 0.36968576709796674, "grad_norm": 1.062154769897461, "learning_rate": 0.00014941033629354734, "loss": 0.815, "step": 600 }, { "epoch": 0.37276648182378314, "grad_norm": 0.7027643918991089, "learning_rate": 0.00014852157099060596, "loss": 0.8644, "step": 605 }, { "epoch": 0.3758471965495995, "grad_norm": 0.5963114500045776, "learning_rate": 0.00014762776412035456, "loss": 0.87, "step": 610 }, { "epoch": 0.3789279112754159, "grad_norm": 0.5938383340835571, "learning_rate": 0.00014672900855257056, "loss": 0.8137, "step": 615 }, { "epoch": 0.3820086260012323, "grad_norm": 0.9398343563079834, "learning_rate": 0.00014582539767121904, "loss": 0.8821, "step": 620 }, { "epoch": 0.3850893407270487, "grad_norm": 0.99173504114151, "learning_rate": 0.0001449170253647498, "loss": 0.6784, "step": 625 }, { "epoch": 0.38817005545286504, "grad_norm": 0.7030093669891357, "learning_rate": 0.0001440039860163419, "loss": 0.832, "step": 630 }, { "epoch": 0.39125077017868143, "grad_norm": 0.6641373038291931, "learning_rate": 0.00014308637449409706, "loss": 0.826, "step": 635 }, { "epoch": 0.3943314849044978, "grad_norm": 0.6074934005737305, "learning_rate": 0.00014216428614118243, "loss": 0.8566, "step": 640 }, { "epoch": 0.3974121996303142, "grad_norm": 0.8580813407897949, "learning_rate": 0.00014123781676592418, "loss": 0.8423, "step": 645 }, { "epoch": 0.4004929143561306, "grad_norm": 1.0865176916122437, "learning_rate": 0.00014030706263185247, "loss": 0.769, "step": 650 }, { "epoch": 0.403573629081947, "grad_norm": 0.6008116602897644, "learning_rate": 0.00013937212044769955, "loss": 0.8003, "step": 655 }, { "epoch": 0.4066543438077634, "grad_norm": 0.7620061635971069, "learning_rate": 0.0001384330873573513, "loss": 0.863, "step": 660 }, { "epoch": 0.4097350585335798, "grad_norm": 0.5859779119491577, "learning_rate": 0.00013749006092975347, "loss": 0.8224, "step": 665 }, { "epoch": 0.41281577325939617, "grad_norm": 0.8230900764465332, "learning_rate": 0.00013654313914877414, "loss": 0.8245, "step": 670 }, { "epoch": 0.41589648798521256, "grad_norm": 0.9619393348693848, "learning_rate": 0.00013559242040302272, "loss": 0.7234, "step": 675 }, { "epoch": 0.41897720271102895, "grad_norm": 0.687427282333374, "learning_rate": 0.00013463800347562706, "loss": 0.8489, "step": 680 }, { "epoch": 0.42205791743684534, "grad_norm": 0.5357842445373535, "learning_rate": 0.00013367998753396944, "loss": 0.755, "step": 685 }, { "epoch": 0.42513863216266173, "grad_norm": 0.5615360736846924, "learning_rate": 0.00013271847211938285, "loss": 0.8116, "step": 690 }, { "epoch": 0.4282193468884781, "grad_norm": 1.0073713064193726, "learning_rate": 0.0001317535571368082, "loss": 0.8125, "step": 695 }, { "epoch": 0.4313000616142945, "grad_norm": 0.9638437032699585, "learning_rate": 0.00013078534284441382, "loss": 0.7871, "step": 700 }, { "epoch": 0.4343807763401109, "grad_norm": 0.6262618899345398, "learning_rate": 0.00012981392984317834, "loss": 0.7716, "step": 705 }, { "epoch": 0.4374614910659273, "grad_norm": 0.6202210187911987, "learning_rate": 0.00012883941906643786, "loss": 0.8464, "step": 710 }, { "epoch": 0.4405422057917437, "grad_norm": 0.636193037033081, "learning_rate": 0.00012786191176939848, "loss": 0.7936, "step": 715 }, { "epoch": 0.4436229205175601, "grad_norm": 0.7928021550178528, "learning_rate": 0.00012688150951861582, "loss": 0.7535, "step": 720 }, { "epoch": 0.4467036352433765, "grad_norm": 1.1193562746047974, "learning_rate": 0.00012589831418144154, "loss": 0.7378, "step": 725 }, { "epoch": 0.44978434996919286, "grad_norm": 0.5501682162284851, "learning_rate": 0.00012491242791543922, "loss": 0.8251, "step": 730 }, { "epoch": 0.45286506469500926, "grad_norm": 0.5638311505317688, "learning_rate": 0.00012392395315776963, "loss": 0.8488, "step": 735 }, { "epoch": 0.45594577942082565, "grad_norm": 0.5718980431556702, "learning_rate": 0.00012293299261454725, "loss": 0.8058, "step": 740 }, { "epoch": 0.45902649414664204, "grad_norm": 0.9007247090339661, "learning_rate": 0.00012193964925016872, "loss": 0.7745, "step": 745 }, { "epoch": 0.46210720887245843, "grad_norm": 1.2628989219665527, "learning_rate": 0.00012094402627661447, "loss": 0.7156, "step": 750 }, { "epoch": 0.4651879235982748, "grad_norm": 0.6456303596496582, "learning_rate": 0.00011994622714272448, "loss": 0.8305, "step": 755 }, { "epoch": 0.4682686383240912, "grad_norm": 0.7610649466514587, "learning_rate": 0.00011894635552344975, "loss": 0.8419, "step": 760 }, { "epoch": 0.4713493530499076, "grad_norm": 0.415209025144577, "learning_rate": 0.00011794451530908011, "loss": 0.7674, "step": 765 }, { "epoch": 0.47443006777572394, "grad_norm": 0.8516692519187927, "learning_rate": 0.00011694081059444946, "loss": 0.8148, "step": 770 }, { "epoch": 0.47751078250154033, "grad_norm": 1.0319640636444092, "learning_rate": 0.0001159353456681201, "loss": 0.7435, "step": 775 }, { "epoch": 0.4805914972273567, "grad_norm": 0.5645594000816345, "learning_rate": 0.00011492822500154667, "loss": 0.7572, "step": 780 }, { "epoch": 0.4836722119531731, "grad_norm": 0.5694653987884521, "learning_rate": 0.00011391955323822126, "loss": 0.7624, "step": 785 }, { "epoch": 0.4867529266789895, "grad_norm": 0.5023643374443054, "learning_rate": 0.00011290943518280057, "loss": 0.8524, "step": 790 }, { "epoch": 0.4898336414048059, "grad_norm": 0.9564359784126282, "learning_rate": 0.0001118979757902162, "loss": 0.7575, "step": 795 }, { "epoch": 0.4929143561306223, "grad_norm": 1.1121801137924194, "learning_rate": 0.00011088528015476964, "loss": 0.7799, "step": 800 }, { "epoch": 0.4959950708564387, "grad_norm": 0.5354447364807129, "learning_rate": 0.00010987145349921251, "loss": 0.7722, "step": 805 }, { "epoch": 0.49907578558225507, "grad_norm": 0.5652374625205994, "learning_rate": 0.0001088566011638134, "loss": 0.7795, "step": 810 }, { "epoch": 0.5021565003080715, "grad_norm": 0.5385531783103943, "learning_rate": 0.00010784082859541292, "loss": 0.7919, "step": 815 }, { "epoch": 0.5052372150338879, "grad_norm": 0.7922360897064209, "learning_rate": 0.0001068242413364671, "loss": 0.8262, "step": 820 }, { "epoch": 0.5083179297597042, "grad_norm": 1.1030114889144897, "learning_rate": 0.00010580694501408138, "loss": 0.7565, "step": 825 }, { "epoch": 0.5113986444855206, "grad_norm": 0.5422970056533813, "learning_rate": 0.00010478904532903535, "loss": 0.7479, "step": 830 }, { "epoch": 0.514479359211337, "grad_norm": 0.5856009125709534, "learning_rate": 0.00010377064804480025, "loss": 0.8519, "step": 835 }, { "epoch": 0.5175600739371534, "grad_norm": 0.5326692461967468, "learning_rate": 0.00010275185897654971, "loss": 0.7604, "step": 840 }, { "epoch": 0.5206407886629698, "grad_norm": 0.9409604072570801, "learning_rate": 0.00010173278398016501, "loss": 0.7729, "step": 845 }, { "epoch": 0.5237215033887862, "grad_norm": 0.8584169149398804, "learning_rate": 0.00010071352894123654, "loss": 0.7233, "step": 850 }, { "epoch": 0.5268022181146026, "grad_norm": 0.5935032963752747, "learning_rate": 9.969419976406165e-05, "loss": 0.7798, "step": 855 }, { "epoch": 0.529882932840419, "grad_norm": 0.6444724202156067, "learning_rate": 9.867490236064108e-05, "loss": 0.7783, "step": 860 }, { "epoch": 0.5329636475662354, "grad_norm": 0.5421064496040344, "learning_rate": 9.765574263967396e-05, "loss": 0.8457, "step": 865 }, { "epoch": 0.5360443622920518, "grad_norm": 0.7332625985145569, "learning_rate": 9.66368264955539e-05, "loss": 0.7861, "step": 870 }, { "epoch": 0.5391250770178682, "grad_norm": 0.9407272338867188, "learning_rate": 9.56182597973658e-05, "loss": 0.738, "step": 875 }, { "epoch": 0.5422057917436846, "grad_norm": 0.5053693652153015, "learning_rate": 9.460014837788605e-05, "loss": 0.7868, "step": 880 }, { "epoch": 0.5452865064695009, "grad_norm": 0.5454622507095337, "learning_rate": 9.358259802258581e-05, "loss": 0.8042, "step": 885 }, { "epoch": 0.5483672211953173, "grad_norm": 0.5724400281906128, "learning_rate": 9.256571445863972e-05, "loss": 0.7704, "step": 890 }, { "epoch": 0.5514479359211337, "grad_norm": 0.843951404094696, "learning_rate": 9.154960334394027e-05, "loss": 0.8044, "step": 895 }, { "epoch": 0.5545286506469501, "grad_norm": 1.0994093418121338, "learning_rate": 9.053437025611973e-05, "loss": 0.7098, "step": 900 }, { "epoch": 0.5576093653727665, "grad_norm": 0.6525376439094543, "learning_rate": 8.952012068158027e-05, "loss": 0.8139, "step": 905 }, { "epoch": 0.5606900800985829, "grad_norm": 0.49181467294692993, "learning_rate": 8.850696000453326e-05, "loss": 0.8226, "step": 910 }, { "epoch": 0.5637707948243993, "grad_norm": 0.5223445296287537, "learning_rate": 8.749499349604993e-05, "loss": 0.7899, "step": 915 }, { "epoch": 0.5668515095502157, "grad_norm": 0.8057864308357239, "learning_rate": 8.64843263031228e-05, "loss": 0.7831, "step": 920 }, { "epoch": 0.5699322242760321, "grad_norm": 1.0692858695983887, "learning_rate": 8.547506343774097e-05, "loss": 0.6825, "step": 925 }, { "epoch": 0.5730129390018485, "grad_norm": 0.6106774806976318, "learning_rate": 8.446730976597878e-05, "loss": 0.8087, "step": 930 }, { "epoch": 0.5760936537276649, "grad_norm": 0.657316267490387, "learning_rate": 8.346116999709975e-05, "loss": 0.7957, "step": 935 }, { "epoch": 0.5791743684534812, "grad_norm": 0.4877719581127167, "learning_rate": 8.245674867267724e-05, "loss": 0.7564, "step": 940 }, { "epoch": 0.5822550831792976, "grad_norm": 0.8313435912132263, "learning_rate": 8.145415015573183e-05, "loss": 0.8325, "step": 945 }, { "epoch": 0.585335797905114, "grad_norm": 0.9311845302581787, "learning_rate": 8.045347861988789e-05, "loss": 0.6653, "step": 950 }, { "epoch": 0.5884165126309304, "grad_norm": 0.6919590830802917, "learning_rate": 7.945483803854936e-05, "loss": 0.7861, "step": 955 }, { "epoch": 0.5914972273567468, "grad_norm": 0.5540125370025635, "learning_rate": 7.845833217409675e-05, "loss": 0.8685, "step": 960 }, { "epoch": 0.5945779420825632, "grad_norm": 0.5159959197044373, "learning_rate": 7.746406456710564e-05, "loss": 0.7727, "step": 965 }, { "epoch": 0.5976586568083796, "grad_norm": 0.8416092395782471, "learning_rate": 7.64721385255886e-05, "loss": 0.7798, "step": 970 }, { "epoch": 0.600739371534196, "grad_norm": 1.0320320129394531, "learning_rate": 7.548265711426104e-05, "loss": 0.6814, "step": 975 }, { "epoch": 0.6038200862600123, "grad_norm": 0.626734733581543, "learning_rate": 7.449572314383237e-05, "loss": 0.8495, "step": 980 }, { "epoch": 0.6069008009858287, "grad_norm": 0.6575467586517334, "learning_rate": 7.351143916032374e-05, "loss": 0.8553, "step": 985 }, { "epoch": 0.609981515711645, "grad_norm": 0.5149129629135132, "learning_rate": 7.252990743441293e-05, "loss": 0.7646, "step": 990 }, { "epoch": 0.6130622304374614, "grad_norm": 0.8680890798568726, "learning_rate": 7.155122995080827e-05, "loss": 0.8524, "step": 995 }, { "epoch": 0.6161429451632778, "grad_norm": 1.129459023475647, "learning_rate": 7.057550839765188e-05, "loss": 0.7465, "step": 1000 }, { "epoch": 0.6192236598890942, "grad_norm": 0.7005685567855835, "learning_rate": 6.960284415595407e-05, "loss": 0.7945, "step": 1005 }, { "epoch": 0.6223043746149106, "grad_norm": 0.5934234261512756, "learning_rate": 6.863333828905929e-05, "loss": 0.9171, "step": 1010 }, { "epoch": 0.625385089340727, "grad_norm": 0.5575773119926453, "learning_rate": 6.766709153214542e-05, "loss": 0.8228, "step": 1015 }, { "epoch": 0.6284658040665434, "grad_norm": 0.7771855592727661, "learning_rate": 6.670420428175705e-05, "loss": 0.8146, "step": 1020 }, { "epoch": 0.6315465187923598, "grad_norm": 0.8821945190429688, "learning_rate": 6.574477658537375e-05, "loss": 0.7083, "step": 1025 }, { "epoch": 0.6346272335181762, "grad_norm": 0.6677148342132568, "learning_rate": 6.4788908131015e-05, "loss": 0.7567, "step": 1030 }, { "epoch": 0.6377079482439926, "grad_norm": 0.5348629951477051, "learning_rate": 6.38366982368819e-05, "loss": 0.7256, "step": 1035 }, { "epoch": 0.640788662969809, "grad_norm": 0.539318323135376, "learning_rate": 6.288824584103816e-05, "loss": 0.7838, "step": 1040 }, { "epoch": 0.6438693776956254, "grad_norm": 0.8003283739089966, "learning_rate": 6.194364949112953e-05, "loss": 0.7608, "step": 1045 }, { "epoch": 0.6469500924214417, "grad_norm": 0.9320285320281982, "learning_rate": 6.100300733414474e-05, "loss": 0.6823, "step": 1050 }, { "epoch": 0.6500308071472581, "grad_norm": 0.6213887333869934, "learning_rate": 6.0066417106217455e-05, "loss": 0.7781, "step": 1055 }, { "epoch": 0.6531115218730745, "grad_norm": 0.5485357046127319, "learning_rate": 5.9133976122471214e-05, "loss": 0.8137, "step": 1060 }, { "epoch": 0.6561922365988909, "grad_norm": 0.5012507438659668, "learning_rate": 5.82057812669081e-05, "loss": 0.7741, "step": 1065 }, { "epoch": 0.6592729513247073, "grad_norm": 0.6728748083114624, "learning_rate": 5.728192898234195e-05, "loss": 0.7326, "step": 1070 }, { "epoch": 0.6623536660505237, "grad_norm": 1.107875943183899, "learning_rate": 5.6362515260377835e-05, "loss": 0.676, "step": 1075 }, { "epoch": 0.6654343807763401, "grad_norm": 0.6420050263404846, "learning_rate": 5.544763563143793e-05, "loss": 0.8215, "step": 1080 }, { "epoch": 0.6685150955021565, "grad_norm": 0.5939176082611084, "learning_rate": 5.4537385154835864e-05, "loss": 0.7669, "step": 1085 }, { "epoch": 0.6715958102279729, "grad_norm": 0.4939870536327362, "learning_rate": 5.363185840889935e-05, "loss": 0.7439, "step": 1090 }, { "epoch": 0.6746765249537893, "grad_norm": 0.7583256363868713, "learning_rate": 5.273114948114346e-05, "loss": 0.81, "step": 1095 }, { "epoch": 0.6777572396796057, "grad_norm": 1.1161051988601685, "learning_rate": 5.1835351958494515e-05, "loss": 0.683, "step": 1100 }, { "epoch": 0.680837954405422, "grad_norm": 0.5778560042381287, "learning_rate": 5.094455891756587e-05, "loss": 0.7839, "step": 1105 }, { "epoch": 0.6839186691312384, "grad_norm": 0.5644716024398804, "learning_rate": 5.00588629149872e-05, "loss": 0.8279, "step": 1110 }, { "epoch": 0.6869993838570548, "grad_norm": 0.525404691696167, "learning_rate": 4.91783559777873e-05, "loss": 0.7721, "step": 1115 }, { "epoch": 0.6900800985828712, "grad_norm": 0.9260256290435791, "learning_rate": 4.830312959383238e-05, "loss": 0.7555, "step": 1120 }, { "epoch": 0.6931608133086876, "grad_norm": 0.8119556903839111, "learning_rate": 4.7433274702319815e-05, "loss": 0.6236, "step": 1125 }, { "epoch": 0.696241528034504, "grad_norm": 0.6038579344749451, "learning_rate": 4.656888168432962e-05, "loss": 0.7604, "step": 1130 }, { "epoch": 0.6993222427603204, "grad_norm": 0.6174635887145996, "learning_rate": 4.571004035343315e-05, "loss": 0.8142, "step": 1135 }, { "epoch": 0.7024029574861368, "grad_norm": 0.470345139503479, "learning_rate": 4.485683994636144e-05, "loss": 0.7306, "step": 1140 }, { "epoch": 0.7054836722119532, "grad_norm": 0.8356139659881592, "learning_rate": 4.400936911373308e-05, "loss": 0.8464, "step": 1145 }, { "epoch": 0.7085643869377696, "grad_norm": 0.8929637670516968, "learning_rate": 4.3167715910842966e-05, "loss": 0.6958, "step": 1150 }, { "epoch": 0.711645101663586, "grad_norm": 0.5765171051025391, "learning_rate": 4.2331967788513295e-05, "loss": 0.7714, "step": 1155 }, { "epoch": 0.7147258163894024, "grad_norm": 0.5743905901908875, "learning_rate": 4.1502211584006836e-05, "loss": 0.7777, "step": 1160 }, { "epoch": 0.7178065311152187, "grad_norm": 0.47613048553466797, "learning_rate": 4.067853351200446e-05, "loss": 0.8229, "step": 1165 }, { "epoch": 0.7208872458410351, "grad_norm": 0.6945850849151611, "learning_rate": 3.986101915564695e-05, "loss": 0.7934, "step": 1170 }, { "epoch": 0.7239679605668515, "grad_norm": 1.2221349477767944, "learning_rate": 3.904975345764262e-05, "loss": 0.7063, "step": 1175 }, { "epoch": 0.7270486752926679, "grad_norm": 0.6141952872276306, "learning_rate": 3.824482071144163e-05, "loss": 0.7582, "step": 1180 }, { "epoch": 0.7301293900184843, "grad_norm": 0.6697849035263062, "learning_rate": 3.744630455247739e-05, "loss": 0.7688, "step": 1185 }, { "epoch": 0.7332101047443007, "grad_norm": 0.45467403531074524, "learning_rate": 3.6654287949476626e-05, "loss": 0.7579, "step": 1190 }, { "epoch": 0.7362908194701171, "grad_norm": 0.7754799127578735, "learning_rate": 3.586885319583858e-05, "loss": 0.7618, "step": 1195 }, { "epoch": 0.7393715341959335, "grad_norm": 1.0018911361694336, "learning_rate": 3.5090081901084525e-05, "loss": 0.6976, "step": 1200 }, { "epoch": 0.7424522489217499, "grad_norm": 0.6909040808677673, "learning_rate": 3.431805498237808e-05, "loss": 0.8187, "step": 1205 }, { "epoch": 0.7455329636475663, "grad_norm": 0.5961218476295471, "learning_rate": 3.355285265611784e-05, "loss": 0.7922, "step": 1210 }, { "epoch": 0.7486136783733827, "grad_norm": 0.4505811333656311, "learning_rate": 3.279455442960238e-05, "loss": 0.7868, "step": 1215 }, { "epoch": 0.751694393099199, "grad_norm": 0.7794224619865417, "learning_rate": 3.204323909276924e-05, "loss": 0.813, "step": 1220 }, { "epoch": 0.7547751078250154, "grad_norm": 0.8165008425712585, "learning_rate": 3.1298984710008484e-05, "loss": 0.666, "step": 1225 }, { "epoch": 0.7578558225508318, "grad_norm": 0.72403484582901, "learning_rate": 3.056186861205136e-05, "loss": 0.7433, "step": 1230 }, { "epoch": 0.7609365372766482, "grad_norm": 0.5372538566589355, "learning_rate": 2.9831967387935467e-05, "loss": 0.8582, "step": 1235 }, { "epoch": 0.7640172520024646, "grad_norm": 0.5056626796722412, "learning_rate": 2.9109356877046712e-05, "loss": 0.7281, "step": 1240 }, { "epoch": 0.767097966728281, "grad_norm": 0.8713842630386353, "learning_rate": 2.8394112161239605e-05, "loss": 0.794, "step": 1245 }, { "epoch": 0.7701786814540974, "grad_norm": 1.0365170240402222, "learning_rate": 2.7686307557035685e-05, "loss": 0.6647, "step": 1250 }, { "epoch": 0.7732593961799138, "grad_norm": 0.534180760383606, "learning_rate": 2.6986016607901908e-05, "loss": 0.7198, "step": 1255 }, { "epoch": 0.7763401109057301, "grad_norm": 0.5429548025131226, "learning_rate": 2.629331207660931e-05, "loss": 0.7513, "step": 1260 }, { "epoch": 0.7794208256315465, "grad_norm": 0.49868354201316833, "learning_rate": 2.5608265937672436e-05, "loss": 0.7611, "step": 1265 }, { "epoch": 0.7825015403573629, "grad_norm": 0.7774792909622192, "learning_rate": 2.4930949369871203e-05, "loss": 0.7762, "step": 1270 }, { "epoch": 0.7855822550831792, "grad_norm": 1.040366291999817, "learning_rate": 2.426143274885493e-05, "loss": 0.635, "step": 1275 }, { "epoch": 0.7886629698089956, "grad_norm": 0.6801935434341431, "learning_rate": 2.359978563983022e-05, "loss": 0.7553, "step": 1280 }, { "epoch": 0.791743684534812, "grad_norm": 0.4836885929107666, "learning_rate": 2.2946076790332827e-05, "loss": 0.7228, "step": 1285 }, { "epoch": 0.7948243992606284, "grad_norm": 0.5009395480155945, "learning_rate": 2.2300374123084522e-05, "loss": 0.706, "step": 1290 }, { "epoch": 0.7979051139864448, "grad_norm": 0.8395462036132812, "learning_rate": 2.166274472893567e-05, "loss": 0.7395, "step": 1295 }, { "epoch": 0.8009858287122612, "grad_norm": 0.9128504395484924, "learning_rate": 2.1033254859894226e-05, "loss": 0.6511, "step": 1300 }, { "epoch": 0.8040665434380776, "grad_norm": 0.4820072650909424, "learning_rate": 2.041196992224206e-05, "loss": 0.7469, "step": 1305 }, { "epoch": 0.807147258163894, "grad_norm": 0.5351672768592834, "learning_rate": 1.9798954469738762e-05, "loss": 0.7357, "step": 1310 }, { "epoch": 0.8102279728897104, "grad_norm": 0.46705517172813416, "learning_rate": 1.919427219691453e-05, "loss": 0.7738, "step": 1315 }, { "epoch": 0.8133086876155268, "grad_norm": 0.767042875289917, "learning_rate": 1.8597985932451856e-05, "loss": 0.7527, "step": 1320 }, { "epoch": 0.8163894023413432, "grad_norm": 1.0822641849517822, "learning_rate": 1.8010157632657543e-05, "loss": 0.6507, "step": 1325 }, { "epoch": 0.8194701170671596, "grad_norm": 0.6491265892982483, "learning_rate": 1.7430848375025176e-05, "loss": 0.7882, "step": 1330 }, { "epoch": 0.822550831792976, "grad_norm": 0.6176543831825256, "learning_rate": 1.686011835188891e-05, "loss": 0.7986, "step": 1335 }, { "epoch": 0.8256315465187923, "grad_norm": 0.41295069456100464, "learning_rate": 1.6298026864169335e-05, "loss": 0.7427, "step": 1340 }, { "epoch": 0.8287122612446087, "grad_norm": 0.6847618818283081, "learning_rate": 1.5744632315211815e-05, "loss": 0.7551, "step": 1345 }, { "epoch": 0.8317929759704251, "grad_norm": 0.9406694769859314, "learning_rate": 1.5199992204718294e-05, "loss": 0.6678, "step": 1350 }, { "epoch": 0.8348736906962415, "grad_norm": 0.609660804271698, "learning_rate": 1.4664163122772689e-05, "loss": 0.7761, "step": 1355 }, { "epoch": 0.8379544054220579, "grad_norm": 0.6481778621673584, "learning_rate": 1.4137200743961188e-05, "loss": 0.7838, "step": 1360 }, { "epoch": 0.8410351201478743, "grad_norm": 0.4781896770000458, "learning_rate": 1.3619159821587235e-05, "loss": 0.7483, "step": 1365 }, { "epoch": 0.8441158348736907, "grad_norm": 0.6182886958122253, "learning_rate": 1.3110094181982657e-05, "loss": 0.7589, "step": 1370 }, { "epoch": 0.8471965495995071, "grad_norm": 1.1230385303497314, "learning_rate": 1.261005671891482e-05, "loss": 0.6927, "step": 1375 }, { "epoch": 0.8502772643253235, "grad_norm": 0.6041083335876465, "learning_rate": 1.2119099388090716e-05, "loss": 0.7507, "step": 1380 }, { "epoch": 0.8533579790511399, "grad_norm": 0.5692098140716553, "learning_rate": 1.1637273201758748e-05, "loss": 0.7854, "step": 1385 }, { "epoch": 0.8564386937769563, "grad_norm": 0.4730149209499359, "learning_rate": 1.1164628223408168e-05, "loss": 0.7449, "step": 1390 }, { "epoch": 0.8595194085027726, "grad_norm": 0.868976354598999, "learning_rate": 1.0701213562567492e-05, "loss": 0.7069, "step": 1395 }, { "epoch": 0.862600123228589, "grad_norm": 1.2989338636398315, "learning_rate": 1.0247077369701653e-05, "loss": 0.6703, "step": 1400 }, { "epoch": 0.8656808379544054, "grad_norm": 0.5826016068458557, "learning_rate": 9.802266831209206e-06, "loss": 0.7252, "step": 1405 }, { "epoch": 0.8687615526802218, "grad_norm": 0.5001908540725708, "learning_rate": 9.366828164519258e-06, "loss": 0.7289, "step": 1410 }, { "epoch": 0.8718422674060382, "grad_norm": 0.49539172649383545, "learning_rate": 8.940806613289498e-06, "loss": 0.7756, "step": 1415 }, { "epoch": 0.8749229821318546, "grad_norm": 0.8087944984436035, "learning_rate": 8.524246442705153e-06, "loss": 0.7602, "step": 1420 }, { "epoch": 0.878003696857671, "grad_norm": 1.1548421382904053, "learning_rate": 8.117190934879593e-06, "loss": 0.659, "step": 1425 }, { "epoch": 0.8810844115834874, "grad_norm": 0.6721958518028259, "learning_rate": 7.719682384357308e-06, "loss": 0.7557, "step": 1430 }, { "epoch": 0.8841651263093038, "grad_norm": 0.5357010364532471, "learning_rate": 7.33176209371923e-06, "loss": 0.7661, "step": 1435 }, { "epoch": 0.8872458410351202, "grad_norm": 0.5159677863121033, "learning_rate": 6.953470369291348e-06, "loss": 0.743, "step": 1440 }, { "epoch": 0.8903265557609366, "grad_norm": 0.8002091646194458, "learning_rate": 6.5848465169566e-06, "loss": 0.7694, "step": 1445 }, { "epoch": 0.893407270486753, "grad_norm": 0.9943484663963318, "learning_rate": 6.225928838071016e-06, "loss": 0.6581, "step": 1450 }, { "epoch": 0.8964879852125693, "grad_norm": 0.7004392743110657, "learning_rate": 5.876754625483904e-06, "loss": 0.6769, "step": 1455 }, { "epoch": 0.8995686999383857, "grad_norm": 0.5294577479362488, "learning_rate": 5.537360159663108e-06, "loss": 0.7258, "step": 1460 }, { "epoch": 0.9026494146642021, "grad_norm": 0.5086947679519653, "learning_rate": 5.207780704925314e-06, "loss": 0.7234, "step": 1465 }, { "epoch": 0.9057301293900185, "grad_norm": 0.6993657350540161, "learning_rate": 4.888050505771868e-06, "loss": 0.7496, "step": 1470 }, { "epoch": 0.9088108441158349, "grad_norm": 1.0011433362960815, "learning_rate": 4.578202783330799e-06, "loss": 0.6805, "step": 1475 }, { "epoch": 0.9118915588416513, "grad_norm": 0.5664235353469849, "learning_rate": 4.2782697319048605e-06, "loss": 0.7404, "step": 1480 }, { "epoch": 0.9149722735674677, "grad_norm": 0.5276147723197937, "learning_rate": 3.988282515626585e-06, "loss": 0.732, "step": 1485 }, { "epoch": 0.9180529882932841, "grad_norm": 0.512634813785553, "learning_rate": 3.7082712652200867e-06, "loss": 0.7366, "step": 1490 }, { "epoch": 0.9211337030191005, "grad_norm": 0.7594392895698547, "learning_rate": 3.438265074870417e-06, "loss": 0.7826, "step": 1495 }, { "epoch": 0.9242144177449169, "grad_norm": 1.0028218030929565, "learning_rate": 3.1782919992006333e-06, "loss": 0.665, "step": 1500 }, { "epoch": 0.9272951324707333, "grad_norm": 0.5730507969856262, "learning_rate": 2.9283790503567222e-06, "loss": 0.7497, "step": 1505 }, { "epoch": 0.9303758471965496, "grad_norm": 0.5541625618934631, "learning_rate": 2.6885521952010105e-06, "loss": 0.7605, "step": 1510 }, { "epoch": 0.933456561922366, "grad_norm": 0.5235902070999146, "learning_rate": 2.458836352614069e-06, "loss": 0.7322, "step": 1515 }, { "epoch": 0.9365372766481824, "grad_norm": 0.787257730960846, "learning_rate": 2.239255390905581e-06, "loss": 0.7623, "step": 1520 }, { "epoch": 0.9396179913739988, "grad_norm": 1.2801662683486938, "learning_rate": 2.029832125334319e-06, "loss": 0.6707, "step": 1525 }, { "epoch": 0.9426987060998152, "grad_norm": 0.604186475276947, "learning_rate": 1.8305883157375804e-06, "loss": 0.733, "step": 1530 }, { "epoch": 0.9457794208256316, "grad_norm": 0.5822204351425171, "learning_rate": 1.6415446642702337e-06, "loss": 0.7948, "step": 1535 }, { "epoch": 0.9488601355514479, "grad_norm": 0.49571382999420166, "learning_rate": 1.462720813253682e-06, "loss": 0.743, "step": 1540 }, { "epoch": 0.9519408502772643, "grad_norm": 0.8096848726272583, "learning_rate": 1.2941353431350056e-06, "loss": 0.7614, "step": 1545 }, { "epoch": 0.9550215650030807, "grad_norm": 1.2853180170059204, "learning_rate": 1.135805770556364e-06, "loss": 0.7004, "step": 1550 }, { "epoch": 0.958102279728897, "grad_norm": 0.5894697308540344, "learning_rate": 9.877485465349058e-07, "loss": 0.8042, "step": 1555 }, { "epoch": 0.9611829944547134, "grad_norm": 0.5564848780632019, "learning_rate": 8.499790547535025e-07, "loss": 0.742, "step": 1560 }, { "epoch": 0.9642637091805298, "grad_norm": 0.4396416246891022, "learning_rate": 7.225116099623286e-07, "loss": 0.6936, "step": 1565 }, { "epoch": 0.9673444239063462, "grad_norm": 0.8373792171478271, "learning_rate": 6.053594564914611e-07, "loss": 0.7837, "step": 1570 }, { "epoch": 0.9704251386321626, "grad_norm": 1.2192115783691406, "learning_rate": 4.985347668747809e-07, "loss": 0.6541, "step": 1575 }, { "epoch": 0.973505853357979, "grad_norm": 0.6488357782363892, "learning_rate": 4.0204864058522864e-07, "loss": 0.8073, "step": 1580 }, { "epoch": 0.9765865680837954, "grad_norm": 0.456813782453537, "learning_rate": 3.15911102881461e-07, "loss": 0.837, "step": 1585 }, { "epoch": 0.9796672828096118, "grad_norm": 0.514924168586731, "learning_rate": 2.40131103766239e-07, "loss": 0.7338, "step": 1590 }, { "epoch": 0.9827479975354282, "grad_norm": 0.7612470984458923, "learning_rate": 1.747165170564724e-07, "loss": 0.7998, "step": 1595 }, { "epoch": 0.9858287122612446, "grad_norm": 0.8824509978294373, "learning_rate": 1.1967413956510686e-07, "loss": 0.6986, "step": 1600 }, { "epoch": 0.988909426987061, "grad_norm": 0.6480757594108582, "learning_rate": 7.500969039491157e-08, "loss": 0.7331, "step": 1605 }, { "epoch": 0.9919901417128774, "grad_norm": 0.6705480217933655, "learning_rate": 4.0727810344254325e-08, "loss": 0.8027, "step": 1610 }, { "epoch": 0.9950708564386938, "grad_norm": 0.4745963215827942, "learning_rate": 1.6832061424865153e-08, "loss": 0.7123, "step": 1615 }, { "epoch": 0.9981515711645101, "grad_norm": 0.5832483172416687, "learning_rate": 3.3249264917878387e-09, "loss": 0.6986, "step": 1620 }, { "epoch": 1.0, "step": 1623, "total_flos": 123288491655168.0, "train_loss": 0.7956359339436115, "train_runtime": 17588.8577, "train_samples_per_second": 2.952, "train_steps_per_second": 0.092 } ], "logging_steps": 5, "max_steps": 1623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 123288491655168.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }