{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1384, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003614022406938923, "grad_norm": 65.45185089111328, "learning_rate": 5.755395683453238e-07, "loss": 1.1416, "step": 5 }, { "epoch": 0.007228044813877846, "grad_norm": 13.528879165649414, "learning_rate": 1.2949640287769785e-06, "loss": 0.7177, "step": 10 }, { "epoch": 0.01084206722081677, "grad_norm": 6.224380016326904, "learning_rate": 2.0143884892086333e-06, "loss": 0.322, "step": 15 }, { "epoch": 0.014456089627755691, "grad_norm": 0.6761742234230042, "learning_rate": 2.733812949640288e-06, "loss": 0.0856, "step": 20 }, { "epoch": 0.018070112034694615, "grad_norm": 1.0489720106124878, "learning_rate": 3.453237410071943e-06, "loss": 0.0707, "step": 25 }, { "epoch": 0.02168413444163354, "grad_norm": 0.18744826316833496, "learning_rate": 4.172661870503597e-06, "loss": 0.0709, "step": 30 }, { "epoch": 0.02529815684857246, "grad_norm": 0.28395524621009827, "learning_rate": 4.892086330935253e-06, "loss": 0.072, "step": 35 }, { "epoch": 0.028912179255511383, "grad_norm": 0.3135656416416168, "learning_rate": 5.611510791366906e-06, "loss": 0.0709, "step": 40 }, { "epoch": 0.03252620166245031, "grad_norm": 0.2624616324901581, "learning_rate": 6.330935251798561e-06, "loss": 0.0728, "step": 45 }, { "epoch": 0.03614022406938923, "grad_norm": 0.06319738179445267, "learning_rate": 7.050359712230216e-06, "loss": 0.0724, "step": 50 }, { "epoch": 0.03975424647632815, "grad_norm": 1.5401397943496704, "learning_rate": 7.769784172661872e-06, "loss": 0.0725, "step": 55 }, { "epoch": 0.04336826888326708, "grad_norm": 0.6666511297225952, "learning_rate": 8.489208633093526e-06, "loss": 0.0681, "step": 60 }, { "epoch": 0.046982291290206, "grad_norm": 0.36439529061317444, "learning_rate": 9.20863309352518e-06, "loss": 0.0656, "step": 65 }, { "epoch": 0.05059631369714492, "grad_norm": 1.2289910316467285, "learning_rate": 9.928057553956835e-06, "loss": 0.0658, "step": 70 }, { "epoch": 0.054210336104083844, "grad_norm": 0.6084970235824585, "learning_rate": 1.0647482014388491e-05, "loss": 0.0629, "step": 75 }, { "epoch": 0.057824358511022765, "grad_norm": 0.249479740858078, "learning_rate": 1.1366906474820146e-05, "loss": 0.0608, "step": 80 }, { "epoch": 0.061438380917961694, "grad_norm": 0.4229901134967804, "learning_rate": 1.20863309352518e-05, "loss": 0.0609, "step": 85 }, { "epoch": 0.06505240332490062, "grad_norm": 0.31077179312705994, "learning_rate": 1.2805755395683454e-05, "loss": 0.0587, "step": 90 }, { "epoch": 0.06866642573183954, "grad_norm": 0.6159167885780334, "learning_rate": 1.3525179856115109e-05, "loss": 0.0562, "step": 95 }, { "epoch": 0.07228044813877846, "grad_norm": 0.6450925469398499, "learning_rate": 1.4244604316546765e-05, "loss": 0.0553, "step": 100 }, { "epoch": 0.07589447054571738, "grad_norm": 0.4479381740093231, "learning_rate": 1.496402877697842e-05, "loss": 0.0627, "step": 105 }, { "epoch": 0.0795084929526563, "grad_norm": 0.13854101300239563, "learning_rate": 1.5683453237410072e-05, "loss": 0.0608, "step": 110 }, { "epoch": 0.08312251535959522, "grad_norm": 0.5709484219551086, "learning_rate": 1.640287769784173e-05, "loss": 0.0645, "step": 115 }, { "epoch": 0.08673653776653416, "grad_norm": 0.13701078295707703, "learning_rate": 1.7122302158273384e-05, "loss": 0.0595, "step": 120 }, { "epoch": 0.09035056017347308, "grad_norm": 0.4921853244304657, "learning_rate": 1.784172661870504e-05, "loss": 0.0595, "step": 125 }, { "epoch": 0.093964582580412, "grad_norm": 1.0717633962631226, "learning_rate": 1.8561151079136693e-05, "loss": 0.0604, "step": 130 }, { "epoch": 0.09757860498735092, "grad_norm": 0.6324800848960876, "learning_rate": 1.9280575539568347e-05, "loss": 0.0587, "step": 135 }, { "epoch": 0.10119262739428984, "grad_norm": 0.2942172586917877, "learning_rate": 2e-05, "loss": 0.0572, "step": 140 }, { "epoch": 0.10480664980122877, "grad_norm": 0.44763851165771484, "learning_rate": 1.999920408755684e-05, "loss": 0.0532, "step": 145 }, { "epoch": 0.10842067220816769, "grad_norm": 0.4999435842037201, "learning_rate": 1.999681647692268e-05, "loss": 0.0548, "step": 150 }, { "epoch": 0.11203469461510661, "grad_norm": 0.9629915952682495, "learning_rate": 1.9992837548163315e-05, "loss": 0.0611, "step": 155 }, { "epoch": 0.11564871702204553, "grad_norm": 0.36340564489364624, "learning_rate": 1.998726793465454e-05, "loss": 0.0534, "step": 160 }, { "epoch": 0.11926273942898447, "grad_norm": 0.4698989987373352, "learning_rate": 1.9980108522981287e-05, "loss": 0.0557, "step": 165 }, { "epoch": 0.12287676183592339, "grad_norm": 0.29845741391181946, "learning_rate": 1.9971360452796523e-05, "loss": 0.0587, "step": 170 }, { "epoch": 0.1264907842428623, "grad_norm": 0.11843477934598923, "learning_rate": 1.996102511663983e-05, "loss": 0.057, "step": 175 }, { "epoch": 0.13010480664980123, "grad_norm": 0.10305560380220413, "learning_rate": 1.9949104159715746e-05, "loss": 0.0532, "step": 180 }, { "epoch": 0.13371882905674015, "grad_norm": 0.25948402285575867, "learning_rate": 1.993559947963185e-05, "loss": 0.0497, "step": 185 }, { "epoch": 0.13733285146367907, "grad_norm": 0.12778767943382263, "learning_rate": 1.9920513226096735e-05, "loss": 0.053, "step": 190 }, { "epoch": 0.140946873870618, "grad_norm": 0.29483047127723694, "learning_rate": 1.9903847800577777e-05, "loss": 0.0558, "step": 195 }, { "epoch": 0.14456089627755692, "grad_norm": 0.20306497812271118, "learning_rate": 1.9885605855918887e-05, "loss": 0.0578, "step": 200 }, { "epoch": 0.14817491868449584, "grad_norm": 0.41377827525138855, "learning_rate": 1.9865790295918212e-05, "loss": 0.0573, "step": 205 }, { "epoch": 0.15178894109143476, "grad_norm": 0.37599676847457886, "learning_rate": 1.984440427486591e-05, "loss": 0.0508, "step": 210 }, { "epoch": 0.15540296349837368, "grad_norm": 0.18772058188915253, "learning_rate": 1.9821451197042028e-05, "loss": 0.0579, "step": 215 }, { "epoch": 0.1590169859053126, "grad_norm": 0.1844455748796463, "learning_rate": 1.979693471617462e-05, "loss": 0.0548, "step": 220 }, { "epoch": 0.16263100831225152, "grad_norm": 0.3249725103378296, "learning_rate": 1.9770858734858123e-05, "loss": 0.059, "step": 225 }, { "epoch": 0.16624503071919045, "grad_norm": 0.21680741012096405, "learning_rate": 1.9743227403932135e-05, "loss": 0.0508, "step": 230 }, { "epoch": 0.1698590531261294, "grad_norm": 0.17999523878097534, "learning_rate": 1.9714045121820676e-05, "loss": 0.0504, "step": 235 }, { "epoch": 0.17347307553306832, "grad_norm": 0.19252051413059235, "learning_rate": 1.968331653383204e-05, "loss": 0.0524, "step": 240 }, { "epoch": 0.17708709794000724, "grad_norm": 0.39402344822883606, "learning_rate": 1.9651046531419335e-05, "loss": 0.0536, "step": 245 }, { "epoch": 0.18070112034694616, "grad_norm": 0.38865312933921814, "learning_rate": 1.961724025140185e-05, "loss": 0.0517, "step": 250 }, { "epoch": 0.18431514275388508, "grad_norm": 0.7893776297569275, "learning_rate": 1.9581903075147372e-05, "loss": 0.0577, "step": 255 }, { "epoch": 0.187929165160824, "grad_norm": 0.8502449989318848, "learning_rate": 1.9545040627715554e-05, "loss": 0.052, "step": 260 }, { "epoch": 0.19154318756776292, "grad_norm": 0.33574968576431274, "learning_rate": 1.9506658776962522e-05, "loss": 0.0548, "step": 265 }, { "epoch": 0.19515720997470185, "grad_norm": 0.39989784359931946, "learning_rate": 1.946676363260679e-05, "loss": 0.0513, "step": 270 }, { "epoch": 0.19877123238164077, "grad_norm": 0.26637327671051025, "learning_rate": 1.942536154525673e-05, "loss": 0.0515, "step": 275 }, { "epoch": 0.2023852547885797, "grad_norm": 0.20739665627479553, "learning_rate": 1.9382459105399634e-05, "loss": 0.0468, "step": 280 }, { "epoch": 0.2059992771955186, "grad_norm": 0.12362375110387802, "learning_rate": 1.9338063142352644e-05, "loss": 0.0455, "step": 285 }, { "epoch": 0.20961329960245753, "grad_norm": 0.20232897996902466, "learning_rate": 1.9292180723175656e-05, "loss": 0.0497, "step": 290 }, { "epoch": 0.21322732200939645, "grad_norm": 0.1344902068376541, "learning_rate": 1.9244819151546325e-05, "loss": 0.0526, "step": 295 }, { "epoch": 0.21684134441633537, "grad_norm": 0.33682653307914734, "learning_rate": 1.9195985966597495e-05, "loss": 0.0529, "step": 300 }, { "epoch": 0.2204553668232743, "grad_norm": 0.2771964371204376, "learning_rate": 1.9145688941717074e-05, "loss": 0.0435, "step": 305 }, { "epoch": 0.22406938923021322, "grad_norm": 0.6373592615127563, "learning_rate": 1.9093936083310653e-05, "loss": 0.0531, "step": 310 }, { "epoch": 0.22768341163715214, "grad_norm": 0.23717528581619263, "learning_rate": 1.9040735629527027e-05, "loss": 0.0525, "step": 315 }, { "epoch": 0.23129743404409106, "grad_norm": 0.23559607565402985, "learning_rate": 1.8986096048946826e-05, "loss": 0.0476, "step": 320 }, { "epoch": 0.23491145645103, "grad_norm": 0.19278410077095032, "learning_rate": 1.893002603923446e-05, "loss": 0.0515, "step": 325 }, { "epoch": 0.23852547885796893, "grad_norm": 0.10223805904388428, "learning_rate": 1.8872534525753617e-05, "loss": 0.0516, "step": 330 }, { "epoch": 0.24213950126490785, "grad_norm": 0.13931454718112946, "learning_rate": 1.881363066014649e-05, "loss": 0.0518, "step": 335 }, { "epoch": 0.24575352367184677, "grad_norm": 0.26712024211883545, "learning_rate": 1.875332381887699e-05, "loss": 0.0475, "step": 340 }, { "epoch": 0.2493675460787857, "grad_norm": 0.1877209097146988, "learning_rate": 1.86916236017382e-05, "loss": 0.0491, "step": 345 }, { "epoch": 0.2529815684857246, "grad_norm": 0.13105887174606323, "learning_rate": 1.862853983032423e-05, "loss": 0.0473, "step": 350 }, { "epoch": 0.25659559089266354, "grad_norm": 0.15709713101387024, "learning_rate": 1.8564082546466804e-05, "loss": 0.0496, "step": 355 }, { "epoch": 0.26020961329960246, "grad_norm": 0.14228186011314392, "learning_rate": 1.8498262010636777e-05, "loss": 0.0486, "step": 360 }, { "epoch": 0.2638236357065414, "grad_norm": 0.45368531346321106, "learning_rate": 1.8431088700310846e-05, "loss": 0.0513, "step": 365 }, { "epoch": 0.2674376581134803, "grad_norm": 0.20928220450878143, "learning_rate": 1.836257330830372e-05, "loss": 0.0542, "step": 370 }, { "epoch": 0.2710516805204192, "grad_norm": 0.18949194252490997, "learning_rate": 1.8292726741066008e-05, "loss": 0.0466, "step": 375 }, { "epoch": 0.27466570292735815, "grad_norm": 0.26113438606262207, "learning_rate": 1.8221560116948103e-05, "loss": 0.0426, "step": 380 }, { "epoch": 0.27827972533429707, "grad_norm": 0.13533490896224976, "learning_rate": 1.814908476443034e-05, "loss": 0.0454, "step": 385 }, { "epoch": 0.281893747741236, "grad_norm": 0.15402187407016754, "learning_rate": 1.80753122203197e-05, "loss": 0.0486, "step": 390 }, { "epoch": 0.2855077701481749, "grad_norm": 0.4744708240032196, "learning_rate": 1.8000254227913346e-05, "loss": 0.0509, "step": 395 }, { "epoch": 0.28912179255511383, "grad_norm": 0.9314901232719421, "learning_rate": 1.7923922735129303e-05, "loss": 0.0542, "step": 400 }, { "epoch": 0.29273581496205275, "grad_norm": 0.31646886467933655, "learning_rate": 1.7846329892604548e-05, "loss": 0.0422, "step": 405 }, { "epoch": 0.2963498373689917, "grad_norm": 0.36957696080207825, "learning_rate": 1.7767488051760858e-05, "loss": 0.0479, "step": 410 }, { "epoch": 0.2999638597759306, "grad_norm": 0.13091270625591278, "learning_rate": 1.7687409762838666e-05, "loss": 0.0454, "step": 415 }, { "epoch": 0.3035778821828695, "grad_norm": 0.11943885684013367, "learning_rate": 1.760610777289929e-05, "loss": 0.0483, "step": 420 }, { "epoch": 0.30719190458980844, "grad_norm": 0.1768166720867157, "learning_rate": 1.7523595023795814e-05, "loss": 0.045, "step": 425 }, { "epoch": 0.31080592699674736, "grad_norm": 0.3109482228755951, "learning_rate": 1.743988465011299e-05, "loss": 0.0449, "step": 430 }, { "epoch": 0.3144199494036863, "grad_norm": 0.33029767870903015, "learning_rate": 1.735498997707642e-05, "loss": 0.0469, "step": 435 }, { "epoch": 0.3180339718106252, "grad_norm": 0.09848926961421967, "learning_rate": 1.7268924518431437e-05, "loss": 0.0436, "step": 440 }, { "epoch": 0.3216479942175641, "grad_norm": 0.25710049271583557, "learning_rate": 1.7181701974291927e-05, "loss": 0.0457, "step": 445 }, { "epoch": 0.32526201662450305, "grad_norm": 0.2981012165546417, "learning_rate": 1.7093336228959538e-05, "loss": 0.0437, "step": 450 }, { "epoch": 0.32887603903144197, "grad_norm": 0.2785588204860687, "learning_rate": 1.700384134871351e-05, "loss": 0.0499, "step": 455 }, { "epoch": 0.3324900614383809, "grad_norm": 0.34511667490005493, "learning_rate": 1.691323157957161e-05, "loss": 0.047, "step": 460 }, { "epoch": 0.33610408384531987, "grad_norm": 0.4485796391963959, "learning_rate": 1.6821521345022377e-05, "loss": 0.0453, "step": 465 }, { "epoch": 0.3397181062522588, "grad_norm": 0.22290338575839996, "learning_rate": 1.672872524372919e-05, "loss": 0.0455, "step": 470 }, { "epoch": 0.3433321286591977, "grad_norm": 0.11010613292455673, "learning_rate": 1.663485804720638e-05, "loss": 0.041, "step": 475 }, { "epoch": 0.34694615106613663, "grad_norm": 0.17054611444473267, "learning_rate": 1.6539934697467895e-05, "loss": 0.0423, "step": 480 }, { "epoch": 0.35056017347307555, "grad_norm": 0.31248804926872253, "learning_rate": 1.644397030464877e-05, "loss": 0.0458, "step": 485 }, { "epoch": 0.3541741958800145, "grad_norm": 0.2744062542915344, "learning_rate": 1.634698014459988e-05, "loss": 0.0475, "step": 490 }, { "epoch": 0.3577882182869534, "grad_norm": 0.23750263452529907, "learning_rate": 1.6248979656456273e-05, "loss": 0.0442, "step": 495 }, { "epoch": 0.3614022406938923, "grad_norm": 0.46631160378456116, "learning_rate": 1.614998444017954e-05, "loss": 0.0499, "step": 500 }, { "epoch": 0.3614022406938923, "eval_loss": 0.06283282488584518, "eval_runtime": 1684.1491, "eval_samples_per_second": 38.124, "eval_steps_per_second": 1.192, "step": 500 }, { "epoch": 0.36501626310083124, "grad_norm": 0.3171086013317108, "learning_rate": 1.6050010254074564e-05, "loss": 0.0427, "step": 505 }, { "epoch": 0.36863028550777016, "grad_norm": 0.14167694747447968, "learning_rate": 1.5949073012281092e-05, "loss": 0.0434, "step": 510 }, { "epoch": 0.3722443079147091, "grad_norm": 0.21627700328826904, "learning_rate": 1.5847188782240473e-05, "loss": 0.0399, "step": 515 }, { "epoch": 0.375858330321648, "grad_norm": 0.17993256449699402, "learning_rate": 1.5744373782137993e-05, "loss": 0.0397, "step": 520 }, { "epoch": 0.3794723527285869, "grad_norm": 0.1952645629644394, "learning_rate": 1.5640644378321236e-05, "loss": 0.0439, "step": 525 }, { "epoch": 0.38308637513552585, "grad_norm": 0.36679714918136597, "learning_rate": 1.5536017082694846e-05, "loss": 0.0445, "step": 530 }, { "epoch": 0.38670039754246477, "grad_norm": 0.23930436372756958, "learning_rate": 1.5430508550092123e-05, "loss": 0.0459, "step": 535 }, { "epoch": 0.3903144199494037, "grad_norm": 0.2509578466415405, "learning_rate": 1.532413557562386e-05, "loss": 0.0436, "step": 540 }, { "epoch": 0.3939284423563426, "grad_norm": 0.34942692518234253, "learning_rate": 1.5216915092004847e-05, "loss": 0.0417, "step": 545 }, { "epoch": 0.39754246476328153, "grad_norm": 0.1713494062423706, "learning_rate": 1.5108864166858506e-05, "loss": 0.0399, "step": 550 }, { "epoch": 0.40115648717022045, "grad_norm": 0.14295130968093872, "learning_rate": 1.5000000000000002e-05, "loss": 0.0402, "step": 555 }, { "epoch": 0.4047705095771594, "grad_norm": 0.13941536843776703, "learning_rate": 1.4890339920698334e-05, "loss": 0.0433, "step": 560 }, { "epoch": 0.4083845319840983, "grad_norm": 0.46250632405281067, "learning_rate": 1.4779901384917833e-05, "loss": 0.0403, "step": 565 }, { "epoch": 0.4119985543910372, "grad_norm": 0.43441110849380493, "learning_rate": 1.4668701972539459e-05, "loss": 0.0429, "step": 570 }, { "epoch": 0.41561257679797614, "grad_norm": 0.26317739486694336, "learning_rate": 1.4556759384562418e-05, "loss": 0.0441, "step": 575 }, { "epoch": 0.41922659920491506, "grad_norm": 0.42390304803848267, "learning_rate": 1.444409144028644e-05, "loss": 0.0436, "step": 580 }, { "epoch": 0.422840621611854, "grad_norm": 0.145647332072258, "learning_rate": 1.4330716074475287e-05, "loss": 0.0401, "step": 585 }, { "epoch": 0.4264546440187929, "grad_norm": 0.2961156964302063, "learning_rate": 1.421665133450184e-05, "loss": 0.0431, "step": 590 }, { "epoch": 0.4300686664257318, "grad_norm": 0.16565440595149994, "learning_rate": 1.4101915377475275e-05, "loss": 0.0372, "step": 595 }, { "epoch": 0.43368268883267075, "grad_norm": 0.16353151202201843, "learning_rate": 1.398652646735076e-05, "loss": 0.0368, "step": 600 }, { "epoch": 0.43729671123960967, "grad_norm": 0.22869880497455597, "learning_rate": 1.3870502972022175e-05, "loss": 0.0402, "step": 605 }, { "epoch": 0.4409107336465486, "grad_norm": 0.17049099504947662, "learning_rate": 1.3753863360398243e-05, "loss": 0.0413, "step": 610 }, { "epoch": 0.4445247560534875, "grad_norm": 0.4078167676925659, "learning_rate": 1.3636626199462615e-05, "loss": 0.0357, "step": 615 }, { "epoch": 0.44813877846042643, "grad_norm": 0.2695305347442627, "learning_rate": 1.351881015131833e-05, "loss": 0.0416, "step": 620 }, { "epoch": 0.45175280086736536, "grad_norm": 0.2164461761713028, "learning_rate": 1.3400433970217137e-05, "loss": 0.0361, "step": 625 }, { "epoch": 0.4553668232743043, "grad_norm": 0.2737310528755188, "learning_rate": 1.3281516499574134e-05, "loss": 0.0426, "step": 630 }, { "epoch": 0.4589808456812432, "grad_norm": 0.27422964572906494, "learning_rate": 1.316207666896824e-05, "loss": 0.0381, "step": 635 }, { "epoch": 0.4625948680881821, "grad_norm": 0.2095690369606018, "learning_rate": 1.3042133491128934e-05, "loss": 0.0433, "step": 640 }, { "epoch": 0.46620889049512104, "grad_norm": 0.19285966455936432, "learning_rate": 1.2921706058909757e-05, "loss": 0.0376, "step": 645 }, { "epoch": 0.46982291290206, "grad_norm": 0.2235649824142456, "learning_rate": 1.2800813542249073e-05, "loss": 0.034, "step": 650 }, { "epoch": 0.47343693530899894, "grad_norm": 0.21383994817733765, "learning_rate": 1.2679475185118535e-05, "loss": 0.034, "step": 655 }, { "epoch": 0.47705095771593786, "grad_norm": 0.12778101861476898, "learning_rate": 1.2557710302459803e-05, "loss": 0.0422, "step": 660 }, { "epoch": 0.4806649801228768, "grad_norm": 0.13164684176445007, "learning_rate": 1.2435538277109919e-05, "loss": 0.0458, "step": 665 }, { "epoch": 0.4842790025298157, "grad_norm": 0.10109174251556396, "learning_rate": 1.2312978556715934e-05, "loss": 0.0444, "step": 670 }, { "epoch": 0.4878930249367546, "grad_norm": 0.12084756791591644, "learning_rate": 1.2190050650639131e-05, "loss": 0.0382, "step": 675 }, { "epoch": 0.49150704734369355, "grad_norm": 0.27953436970710754, "learning_rate": 1.206677412684953e-05, "loss": 0.0398, "step": 680 }, { "epoch": 0.49512106975063247, "grad_norm": 0.24097155034542084, "learning_rate": 1.1943168608810977e-05, "loss": 0.0396, "step": 685 }, { "epoch": 0.4987350921575714, "grad_norm": 0.17015314102172852, "learning_rate": 1.1819253772357442e-05, "loss": 0.0374, "step": 690 }, { "epoch": 0.5023491145645103, "grad_norm": 0.2601104974746704, "learning_rate": 1.1695049342560969e-05, "loss": 0.0461, "step": 695 }, { "epoch": 0.5059631369714492, "grad_norm": 0.09341022372245789, "learning_rate": 1.157057509059179e-05, "loss": 0.0355, "step": 700 }, { "epoch": 0.5095771593783881, "grad_norm": 0.2533698081970215, "learning_rate": 1.144585083057111e-05, "loss": 0.0389, "step": 705 }, { "epoch": 0.5131911817853271, "grad_norm": 0.3184564411640167, "learning_rate": 1.1320896416417026e-05, "loss": 0.0442, "step": 710 }, { "epoch": 0.5168052041922659, "grad_norm": 0.3521571755409241, "learning_rate": 1.119573173868415e-05, "loss": 0.0473, "step": 715 }, { "epoch": 0.5204192265992049, "grad_norm": 0.1924971640110016, "learning_rate": 1.1070376721397374e-05, "loss": 0.0411, "step": 720 }, { "epoch": 0.5240332490061438, "grad_norm": 0.13801982998847961, "learning_rate": 1.0944851318880314e-05, "loss": 0.0416, "step": 725 }, { "epoch": 0.5276472714130828, "grad_norm": 0.4150485694408417, "learning_rate": 1.0819175512578925e-05, "loss": 0.0388, "step": 730 }, { "epoch": 0.5312612938200216, "grad_norm": 0.17930372059345245, "learning_rate": 1.0693369307880817e-05, "loss": 0.04, "step": 735 }, { "epoch": 0.5348753162269606, "grad_norm": 0.281807005405426, "learning_rate": 1.0567452730930743e-05, "loss": 0.0349, "step": 740 }, { "epoch": 0.5384893386338996, "grad_norm": 0.18618857860565186, "learning_rate": 1.0441445825442773e-05, "loss": 0.0387, "step": 745 }, { "epoch": 0.5421033610408384, "grad_norm": 0.18845702707767487, "learning_rate": 1.0315368649509716e-05, "loss": 0.0379, "step": 750 }, { "epoch": 0.5457173834477774, "grad_norm": 0.3343959152698517, "learning_rate": 1.0189241272410191e-05, "loss": 0.0385, "step": 755 }, { "epoch": 0.5493314058547163, "grad_norm": 0.22052714228630066, "learning_rate": 1.0063083771413975e-05, "loss": 0.0561, "step": 760 }, { "epoch": 0.5529454282616553, "grad_norm": 0.11884549260139465, "learning_rate": 9.936916228586027e-06, "loss": 0.0383, "step": 765 }, { "epoch": 0.5565594506685941, "grad_norm": 0.1994011104106903, "learning_rate": 9.810758727589814e-06, "loss": 0.0391, "step": 770 }, { "epoch": 0.5601734730755331, "grad_norm": 0.264037162065506, "learning_rate": 9.684631350490287e-06, "loss": 0.0324, "step": 775 }, { "epoch": 0.563787495482472, "grad_norm": 0.22638504207134247, "learning_rate": 9.55855417455723e-06, "loss": 0.0352, "step": 780 }, { "epoch": 0.567401517889411, "grad_norm": 0.1375981718301773, "learning_rate": 9.43254726906926e-06, "loss": 0.0358, "step": 785 }, { "epoch": 0.5710155402963498, "grad_norm": 0.1931852251291275, "learning_rate": 9.306630692119183e-06, "loss": 0.0375, "step": 790 }, { "epoch": 0.5746295627032888, "grad_norm": 0.3065533936023712, "learning_rate": 9.180824487421077e-06, "loss": 0.0386, "step": 795 }, { "epoch": 0.5782435851102277, "grad_norm": 0.13863135874271393, "learning_rate": 9.055148681119688e-06, "loss": 0.034, "step": 800 }, { "epoch": 0.5818576075171666, "grad_norm": 0.1802493929862976, "learning_rate": 8.929623278602627e-06, "loss": 0.0368, "step": 805 }, { "epoch": 0.5854716299241055, "grad_norm": 0.12986677885055542, "learning_rate": 8.80426826131585e-06, "loss": 0.0308, "step": 810 }, { "epoch": 0.5890856523310445, "grad_norm": 0.2099331021308899, "learning_rate": 8.67910358358298e-06, "loss": 0.0319, "step": 815 }, { "epoch": 0.5926996747379834, "grad_norm": 0.15507160127162933, "learning_rate": 8.554149169428894e-06, "loss": 0.0296, "step": 820 }, { "epoch": 0.5963136971449223, "grad_norm": 0.2265109270811081, "learning_rate": 8.429424909408215e-06, "loss": 0.0359, "step": 825 }, { "epoch": 0.5999277195518612, "grad_norm": 0.16594430804252625, "learning_rate": 8.304950657439034e-06, "loss": 0.0394, "step": 830 }, { "epoch": 0.6035417419588002, "grad_norm": 0.175150528550148, "learning_rate": 8.180746227642561e-06, "loss": 0.0348, "step": 835 }, { "epoch": 0.607155764365739, "grad_norm": 0.15162833034992218, "learning_rate": 8.056831391189024e-06, "loss": 0.0349, "step": 840 }, { "epoch": 0.610769786772678, "grad_norm": 0.22084273397922516, "learning_rate": 7.93322587315047e-06, "loss": 0.0363, "step": 845 }, { "epoch": 0.6143838091796169, "grad_norm": 0.2815225124359131, "learning_rate": 7.809949349360872e-06, "loss": 0.0369, "step": 850 }, { "epoch": 0.6179978315865559, "grad_norm": 0.38362768292427063, "learning_rate": 7.687021443284071e-06, "loss": 0.0315, "step": 855 }, { "epoch": 0.6216118539934947, "grad_norm": 0.44302356243133545, "learning_rate": 7.564461722890082e-06, "loss": 0.037, "step": 860 }, { "epoch": 0.6252258764004337, "grad_norm": 0.16820432245731354, "learning_rate": 7.4422896975402004e-06, "loss": 0.0364, "step": 865 }, { "epoch": 0.6288398988073726, "grad_norm": 0.17497654259204865, "learning_rate": 7.320524814881471e-06, "loss": 0.0335, "step": 870 }, { "epoch": 0.6324539212143115, "grad_norm": 0.23959729075431824, "learning_rate": 7.199186457750931e-06, "loss": 0.0331, "step": 875 }, { "epoch": 0.6360679436212504, "grad_norm": 0.216622993350029, "learning_rate": 7.078293941090248e-06, "loss": 0.0359, "step": 880 }, { "epoch": 0.6396819660281894, "grad_norm": 0.21011534333229065, "learning_rate": 6.957866508871068e-06, "loss": 0.041, "step": 885 }, { "epoch": 0.6432959884351283, "grad_norm": 0.13897277414798737, "learning_rate": 6.837923331031761e-06, "loss": 0.0326, "step": 890 }, { "epoch": 0.6469100108420672, "grad_norm": 0.15225118398666382, "learning_rate": 6.718483500425868e-06, "loss": 0.0356, "step": 895 }, { "epoch": 0.6505240332490061, "grad_norm": 0.16399000585079193, "learning_rate": 6.599566029782863e-06, "loss": 0.0394, "step": 900 }, { "epoch": 0.6541380556559451, "grad_norm": 0.15435738861560822, "learning_rate": 6.48118984868167e-06, "loss": 0.0361, "step": 905 }, { "epoch": 0.6577520780628839, "grad_norm": 0.23756234347820282, "learning_rate": 6.363373800537388e-06, "loss": 0.0349, "step": 910 }, { "epoch": 0.6613661004698229, "grad_norm": 0.17288844287395477, "learning_rate": 6.246136639601763e-06, "loss": 0.0351, "step": 915 }, { "epoch": 0.6649801228767618, "grad_norm": 0.1758948117494583, "learning_rate": 6.129497027977829e-06, "loss": 0.034, "step": 920 }, { "epoch": 0.6685941452837008, "grad_norm": 0.1595619171857834, "learning_rate": 6.013473532649246e-06, "loss": 0.0346, "step": 925 }, { "epoch": 0.6722081676906397, "grad_norm": 0.24954086542129517, "learning_rate": 5.898084622524729e-06, "loss": 0.0323, "step": 930 }, { "epoch": 0.6758221900975786, "grad_norm": 0.19846731424331665, "learning_rate": 5.78334866549816e-06, "loss": 0.0337, "step": 935 }, { "epoch": 0.6794362125045176, "grad_norm": 0.42544224858283997, "learning_rate": 5.669283925524716e-06, "loss": 0.0327, "step": 940 }, { "epoch": 0.6830502349114564, "grad_norm": 0.17253951728343964, "learning_rate": 5.555908559713561e-06, "loss": 0.04, "step": 945 }, { "epoch": 0.6866642573183954, "grad_norm": 0.18640030920505524, "learning_rate": 5.443240615437586e-06, "loss": 0.0328, "step": 950 }, { "epoch": 0.6902782797253343, "grad_norm": 0.1302676647901535, "learning_rate": 5.33129802746054e-06, "loss": 0.0361, "step": 955 }, { "epoch": 0.6938923021322733, "grad_norm": 0.1659248024225235, "learning_rate": 5.22009861508217e-06, "loss": 0.0332, "step": 960 }, { "epoch": 0.6975063245392121, "grad_norm": 0.19474443793296814, "learning_rate": 5.109660079301668e-06, "loss": 0.0327, "step": 965 }, { "epoch": 0.7011203469461511, "grad_norm": 0.2394312471151352, "learning_rate": 5.000000000000003e-06, "loss": 0.0305, "step": 970 }, { "epoch": 0.70473436935309, "grad_norm": 0.24105896055698395, "learning_rate": 4.891135833141495e-06, "loss": 0.0316, "step": 975 }, { "epoch": 0.708348391760029, "grad_norm": 0.15345264971256256, "learning_rate": 4.783084907995156e-06, "loss": 0.0341, "step": 980 }, { "epoch": 0.7119624141669678, "grad_norm": 0.23472526669502258, "learning_rate": 4.675864424376146e-06, "loss": 0.0352, "step": 985 }, { "epoch": 0.7155764365739068, "grad_norm": 0.12148404121398926, "learning_rate": 4.569491449907878e-06, "loss": 0.0339, "step": 990 }, { "epoch": 0.7191904589808457, "grad_norm": 0.2683773338794708, "learning_rate": 4.463982917305155e-06, "loss": 0.0434, "step": 995 }, { "epoch": 0.7228044813877846, "grad_norm": 0.1990807205438614, "learning_rate": 4.359355621678765e-06, "loss": 0.0333, "step": 1000 }, { "epoch": 0.7228044813877846, "eval_loss": 0.06226345896720886, "eval_runtime": 1684.5006, "eval_samples_per_second": 38.116, "eval_steps_per_second": 1.191, "step": 1000 }, { "epoch": 0.7264185037947235, "grad_norm": 0.16073299944400787, "learning_rate": 4.255626217862014e-06, "loss": 0.0281, "step": 1005 }, { "epoch": 0.7300325262016625, "grad_norm": 0.17389674484729767, "learning_rate": 4.152811217759529e-06, "loss": 0.0333, "step": 1010 }, { "epoch": 0.7336465486086013, "grad_norm": 0.23013567924499512, "learning_rate": 4.050926987718911e-06, "loss": 0.0359, "step": 1015 }, { "epoch": 0.7372605710155403, "grad_norm": 0.2871190905570984, "learning_rate": 3.9499897459254375e-06, "loss": 0.033, "step": 1020 }, { "epoch": 0.7408745934224792, "grad_norm": 0.1907360553741455, "learning_rate": 3.850015559820465e-06, "loss": 0.0324, "step": 1025 }, { "epoch": 0.7444886158294182, "grad_norm": 0.3101818561553955, "learning_rate": 3.75102034354373e-06, "loss": 0.037, "step": 1030 }, { "epoch": 0.748102638236357, "grad_norm": 0.20055055618286133, "learning_rate": 3.653019855400123e-06, "loss": 0.0296, "step": 1035 }, { "epoch": 0.751716660643296, "grad_norm": 0.23536404967308044, "learning_rate": 3.5560296953512296e-06, "loss": 0.0355, "step": 1040 }, { "epoch": 0.7553306830502349, "grad_norm": 0.28307974338531494, "learning_rate": 3.4600653025321085e-06, "loss": 0.0281, "step": 1045 }, { "epoch": 0.7589447054571739, "grad_norm": 0.32817333936691284, "learning_rate": 3.3651419527936223e-06, "loss": 0.029, "step": 1050 }, { "epoch": 0.7625587278641127, "grad_norm": 0.2622580826282501, "learning_rate": 3.2712747562708115e-06, "loss": 0.0346, "step": 1055 }, { "epoch": 0.7661727502710517, "grad_norm": 0.16355015337467194, "learning_rate": 3.178478654977624e-06, "loss": 0.0308, "step": 1060 }, { "epoch": 0.7697867726779906, "grad_norm": 0.13598884642124176, "learning_rate": 3.086768420428392e-06, "loss": 0.0342, "step": 1065 }, { "epoch": 0.7734007950849295, "grad_norm": 0.2859932482242584, "learning_rate": 2.9961586512864947e-06, "loss": 0.0321, "step": 1070 }, { "epoch": 0.7770148174918684, "grad_norm": 0.20649324357509613, "learning_rate": 2.906663771040468e-06, "loss": 0.0321, "step": 1075 }, { "epoch": 0.7806288398988074, "grad_norm": 0.17001411318778992, "learning_rate": 2.8182980257080748e-06, "loss": 0.0318, "step": 1080 }, { "epoch": 0.7842428623057462, "grad_norm": 0.1272343248128891, "learning_rate": 2.7310754815685627e-06, "loss": 0.0319, "step": 1085 }, { "epoch": 0.7878568847126852, "grad_norm": 0.18668726086616516, "learning_rate": 2.64501002292358e-06, "loss": 0.0337, "step": 1090 }, { "epoch": 0.7914709071196241, "grad_norm": 0.22532936930656433, "learning_rate": 2.5601153498870137e-06, "loss": 0.0292, "step": 1095 }, { "epoch": 0.7950849295265631, "grad_norm": 0.2742968797683716, "learning_rate": 2.4764049762041874e-06, "loss": 0.0345, "step": 1100 }, { "epoch": 0.7986989519335019, "grad_norm": 0.262674480676651, "learning_rate": 2.3938922271007147e-06, "loss": 0.0282, "step": 1105 }, { "epoch": 0.8023129743404409, "grad_norm": 0.1856016367673874, "learning_rate": 2.312590237161335e-06, "loss": 0.026, "step": 1110 }, { "epoch": 0.8059269967473799, "grad_norm": 0.1422245353460312, "learning_rate": 2.2325119482391466e-06, "loss": 0.0323, "step": 1115 }, { "epoch": 0.8095410191543188, "grad_norm": 0.17140090465545654, "learning_rate": 2.153670107395456e-06, "loss": 0.0266, "step": 1120 }, { "epoch": 0.8131550415612577, "grad_norm": 0.3133852481842041, "learning_rate": 2.0760772648707016e-06, "loss": 0.0314, "step": 1125 }, { "epoch": 0.8167690639681966, "grad_norm": 0.2690429091453552, "learning_rate": 1.9997457720866554e-06, "loss": 0.0259, "step": 1130 }, { "epoch": 0.8203830863751356, "grad_norm": 0.2718668282032013, "learning_rate": 1.924687779680302e-06, "loss": 0.0269, "step": 1135 }, { "epoch": 0.8239971087820744, "grad_norm": 0.25396299362182617, "learning_rate": 1.8509152355696625e-06, "loss": 0.0317, "step": 1140 }, { "epoch": 0.8276111311890134, "grad_norm": 0.2634499669075012, "learning_rate": 1.7784398830519002e-06, "loss": 0.0283, "step": 1145 }, { "epoch": 0.8312251535959523, "grad_norm": 0.28554320335388184, "learning_rate": 1.7072732589339958e-06, "loss": 0.0294, "step": 1150 }, { "epoch": 0.8348391760028913, "grad_norm": 0.16729383170604706, "learning_rate": 1.6374266916962832e-06, "loss": 0.0276, "step": 1155 }, { "epoch": 0.8384531984098301, "grad_norm": 0.3412957787513733, "learning_rate": 1.5689112996891576e-06, "loss": 0.0307, "step": 1160 }, { "epoch": 0.8420672208167691, "grad_norm": 0.19754613935947418, "learning_rate": 1.5017379893632255e-06, "loss": 0.03, "step": 1165 }, { "epoch": 0.845681243223708, "grad_norm": 0.24068793654441833, "learning_rate": 1.4359174535331998e-06, "loss": 0.0306, "step": 1170 }, { "epoch": 0.849295265630647, "grad_norm": 0.14576299488544464, "learning_rate": 1.3714601696757713e-06, "loss": 0.0302, "step": 1175 }, { "epoch": 0.8529092880375858, "grad_norm": 0.15564730763435364, "learning_rate": 1.3083763982618026e-06, "loss": 0.0284, "step": 1180 }, { "epoch": 0.8565233104445248, "grad_norm": 0.25240421295166016, "learning_rate": 1.2466761811230099e-06, "loss": 0.0279, "step": 1185 }, { "epoch": 0.8601373328514637, "grad_norm": 0.2509186267852783, "learning_rate": 1.1863693398535115e-06, "loss": 0.0313, "step": 1190 }, { "epoch": 0.8637513552584026, "grad_norm": 0.22277849912643433, "learning_rate": 1.1274654742463842e-06, "loss": 0.0277, "step": 1195 }, { "epoch": 0.8673653776653415, "grad_norm": 0.15197935700416565, "learning_rate": 1.0699739607655434e-06, "loss": 0.0348, "step": 1200 }, { "epoch": 0.8709794000722805, "grad_norm": 0.27582135796546936, "learning_rate": 1.01390395105318e-06, "loss": 0.0317, "step": 1205 }, { "epoch": 0.8745934224792193, "grad_norm": 0.2296678125858307, "learning_rate": 9.592643704729754e-07, "loss": 0.028, "step": 1210 }, { "epoch": 0.8782074448861583, "grad_norm": 0.3100360631942749, "learning_rate": 9.060639166893493e-07, "loss": 0.0296, "step": 1215 }, { "epoch": 0.8818214672930972, "grad_norm": 0.1880117952823639, "learning_rate": 8.543110582829272e-07, "loss": 0.0298, "step": 1220 }, { "epoch": 0.8854354897000362, "grad_norm": 0.1505495011806488, "learning_rate": 8.040140334025082e-07, "loss": 0.0251, "step": 1225 }, { "epoch": 0.889049512106975, "grad_norm": 0.1834176778793335, "learning_rate": 7.551808484536782e-07, "loss": 0.0289, "step": 1230 }, { "epoch": 0.892663534513914, "grad_norm": 0.2518479824066162, "learning_rate": 7.078192768243486e-07, "loss": 0.034, "step": 1235 }, { "epoch": 0.8962775569208529, "grad_norm": 0.2181636542081833, "learning_rate": 6.61936857647355e-07, "loss": 0.0308, "step": 1240 }, { "epoch": 0.8998915793277918, "grad_norm": 0.28248122334480286, "learning_rate": 6.175408946003703e-07, "loss": 0.0338, "step": 1245 }, { "epoch": 0.9035056017347307, "grad_norm": 0.18422535061836243, "learning_rate": 5.746384547432738e-07, "loss": 0.0267, "step": 1250 }, { "epoch": 0.9071196241416697, "grad_norm": 0.17286597192287445, "learning_rate": 5.332363673932106e-07, "loss": 0.0318, "step": 1255 }, { "epoch": 0.9107336465486086, "grad_norm": 0.23957087099552155, "learning_rate": 4.933412230374812e-07, "loss": 0.035, "step": 1260 }, { "epoch": 0.9143476689555475, "grad_norm": 0.1773596704006195, "learning_rate": 4.549593722844492e-07, "loss": 0.0243, "step": 1265 }, { "epoch": 0.9179616913624864, "grad_norm": 0.25387516617774963, "learning_rate": 4.180969248526334e-07, "loss": 0.0272, "step": 1270 }, { "epoch": 0.9215757137694254, "grad_norm": 0.23314779996871948, "learning_rate": 3.827597485981527e-07, "loss": 0.0288, "step": 1275 }, { "epoch": 0.9251897361763642, "grad_norm": 0.16649064421653748, "learning_rate": 3.4895346858066723e-07, "loss": 0.0295, "step": 1280 }, { "epoch": 0.9288037585833032, "grad_norm": 0.1977885514497757, "learning_rate": 3.166834661679596e-07, "loss": 0.029, "step": 1285 }, { "epoch": 0.9324177809902421, "grad_norm": 0.1530725359916687, "learning_rate": 2.8595487817932424e-07, "loss": 0.0268, "step": 1290 }, { "epoch": 0.9360318033971811, "grad_norm": 0.2034778892993927, "learning_rate": 2.5677259606786686e-07, "loss": 0.0364, "step": 1295 }, { "epoch": 0.93964582580412, "grad_norm": 0.2887728214263916, "learning_rate": 2.2914126514187784e-07, "loss": 0.0372, "step": 1300 }, { "epoch": 0.9432598482110589, "grad_norm": 0.19726163148880005, "learning_rate": 2.0306528382538103e-07, "loss": 0.0283, "step": 1305 }, { "epoch": 0.9468738706179979, "grad_norm": 0.19561438262462616, "learning_rate": 1.7854880295797406e-07, "loss": 0.0311, "step": 1310 }, { "epoch": 0.9504878930249367, "grad_norm": 0.31874212622642517, "learning_rate": 1.5559572513409338e-07, "loss": 0.031, "step": 1315 }, { "epoch": 0.9541019154318757, "grad_norm": 0.19174811244010925, "learning_rate": 1.3420970408178912e-07, "loss": 0.0285, "step": 1320 }, { "epoch": 0.9577159378388146, "grad_norm": 0.17756476998329163, "learning_rate": 1.1439414408111471e-07, "loss": 0.0237, "step": 1325 }, { "epoch": 0.9613299602457536, "grad_norm": 0.2853069007396698, "learning_rate": 9.615219942222476e-08, "loss": 0.0324, "step": 1330 }, { "epoch": 0.9649439826526924, "grad_norm": 0.20107683539390564, "learning_rate": 7.948677390326786e-08, "loss": 0.029, "step": 1335 }, { "epoch": 0.9685580050596314, "grad_norm": 0.32500964403152466, "learning_rate": 6.440052036815081e-08, "loss": 0.0307, "step": 1340 }, { "epoch": 0.9721720274665703, "grad_norm": 0.37760159373283386, "learning_rate": 5.0895840284257424e-08, "loss": 0.0349, "step": 1345 }, { "epoch": 0.9757860498735093, "grad_norm": 0.1795365810394287, "learning_rate": 3.8974883360169966e-08, "loss": 0.0269, "step": 1350 }, { "epoch": 0.9794000722804481, "grad_norm": 0.1918817162513733, "learning_rate": 2.86395472034795e-08, "loss": 0.0322, "step": 1355 }, { "epoch": 0.9830140946873871, "grad_norm": 0.16373227536678314, "learning_rate": 1.989147701871641e-08, "loss": 0.0317, "step": 1360 }, { "epoch": 0.986628117094326, "grad_norm": 0.12552815675735474, "learning_rate": 1.2732065345462118e-08, "loss": 0.0223, "step": 1365 }, { "epoch": 0.9902421395012649, "grad_norm": 0.16605937480926514, "learning_rate": 7.162451836685291e-09, "loss": 0.0283, "step": 1370 }, { "epoch": 0.9938561619082038, "grad_norm": 0.18890981376171112, "learning_rate": 3.183523077324724e-09, "loss": 0.0281, "step": 1375 }, { "epoch": 0.9974701843151428, "grad_norm": 0.14776916801929474, "learning_rate": 7.959124431622389e-10, "loss": 0.0287, "step": 1380 }, { "epoch": 1.0, "step": 1384, "total_flos": 2.850613244928721e+18, "train_loss": 0.048995194472157194, "train_runtime": 23358.4539, "train_samples_per_second": 3.789, "train_steps_per_second": 0.059 } ], "logging_steps": 5, "max_steps": 1384, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.850613244928721e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }