{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 500.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 9.25, "learning_rate": 1.8e-05, "loss": 2.605172348022461, "loss_d0": 2.6139824271202086, "step": 10 }, { "epoch": 2.0, "grad_norm": 4.8125, "learning_rate": 3.8e-05, "loss": 1.1845547676086425, "loss_d0": 1.1885395765304565, "step": 20 }, { "epoch": 3.0, "grad_norm": 3.65625, "learning_rate": 5.8e-05, "loss": 0.8508452415466309, "loss_d0": 0.8536352932453155, "step": 30 }, { "epoch": 4.0, "grad_norm": 3.46875, "learning_rate": 7.800000000000001e-05, "loss": 0.6966594219207763, "loss_d0": 0.6979476511478424, "step": 40 }, { "epoch": 5.0, "grad_norm": 3.296875, "learning_rate": 9.8e-05, "loss": 0.5743978500366211, "loss_d0": 0.5776701003313065, "step": 50 }, { "epoch": 6.0, "grad_norm": 3.71875, "learning_rate": 0.000118, "loss": 0.4979101657867432, "loss_d0": 0.4996922880411148, "step": 60 }, { "epoch": 7.0, "grad_norm": 3.25, "learning_rate": 0.000138, "loss": 0.4397528648376465, "loss_d0": 0.4406041353940964, "step": 70 }, { "epoch": 8.0, "grad_norm": 2.5625, "learning_rate": 0.00015800000000000002, "loss": 0.3697507381439209, "loss_d0": 0.3696742236614227, "step": 80 }, { "epoch": 9.0, "grad_norm": 2.9375, "learning_rate": 0.00017800000000000002, "loss": 0.31113204956054685, "loss_d0": 0.31142298579216005, "step": 90 }, { "epoch": 10.0, "grad_norm": 2.25, "learning_rate": 0.00019800000000000002, "loss": 0.2800392389297485, "loss_d0": 0.27933542132377626, "step": 100 }, { "epoch": 11.0, "grad_norm": 1.8203125, "learning_rate": 0.0002, "loss": 0.2221465826034546, "loss_d0": 0.22203450053930282, "step": 110 }, { "epoch": 12.0, "grad_norm": 2.0625, "learning_rate": 0.0002, "loss": 0.1953430414199829, "loss_d0": 0.19439931064844132, "step": 120 }, { "epoch": 13.0, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.14305418729782104, "loss_d0": 0.14241147190332412, "step": 130 }, { "epoch": 14.0, "grad_norm": 1.609375, "learning_rate": 0.0002, "loss": 0.1502935767173767, "loss_d0": 0.14951273798942566, "step": 140 }, { "epoch": 15.0, "grad_norm": 2.15625, "learning_rate": 0.0002, "loss": 0.14393000602722167, "loss_d0": 0.14211773499846458, "step": 150 }, { "epoch": 16.0, "grad_norm": 1.5390625, "learning_rate": 0.0002, "loss": 0.1161999225616455, "loss_d0": 0.11462540775537491, "step": 160 }, { "epoch": 17.0, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1050883412361145, "loss_d0": 0.10514769107103347, "step": 170 }, { "epoch": 18.0, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.10532078742980958, "loss_d0": 0.10545785427093506, "step": 180 }, { "epoch": 19.0, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.11194120645523072, "loss_d0": 0.11231792494654655, "step": 190 }, { "epoch": 20.0, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.10956100225448609, "loss_d0": 0.11055244281888008, "step": 200 }, { "epoch": 21.0, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.09398337006568909, "loss_d0": 0.09433126747608185, "step": 210 }, { "epoch": 22.0, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.08510669469833373, "loss_d0": 0.08516838252544404, "step": 220 }, { "epoch": 23.0, "grad_norm": 1.796875, "learning_rate": 0.0002, "loss": 0.07973664999008179, "loss_d0": 0.0800891250371933, "step": 230 }, { "epoch": 24.0, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.07982662916183472, "loss_d0": 0.08119344227015972, "step": 240 }, { "epoch": 25.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.08493419289588929, "loss_d0": 0.08543153777718544, "step": 250 }, { "epoch": 26.0, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.06563451290130615, "loss_d0": 0.06554836891591549, "step": 260 }, { "epoch": 27.0, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.08964254260063172, "loss_d0": 0.08905367143452167, "step": 270 }, { "epoch": 28.0, "grad_norm": 1.4375, "learning_rate": 0.0002, "loss": 0.08700705170631409, "loss_d0": 0.08580705337226391, "step": 280 }, { "epoch": 29.0, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.08369559049606323, "loss_d0": 0.08155160546302795, "step": 290 }, { "epoch": 30.0, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.07587954998016358, "loss_d0": 0.0753675114363432, "step": 300 }, { "epoch": 31.0, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.06574047803878784, "loss_d0": 0.06600831300020218, "step": 310 }, { "epoch": 32.0, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.06255401968955994, "loss_d0": 0.06289612613618374, "step": 320 }, { "epoch": 33.0, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.06944599151611328, "loss_d0": 0.07013467662036418, "step": 330 }, { "epoch": 34.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.0734113335609436, "loss_d0": 0.07388503737747669, "step": 340 }, { "epoch": 35.0, "grad_norm": 1.625, "learning_rate": 0.0002, "loss": 0.06372126340866088, "loss_d0": 0.06445319131016732, "step": 350 }, { "epoch": 36.0, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.06621803045272827, "loss_d0": 0.0674049399793148, "step": 360 }, { "epoch": 37.0, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.07585157752037049, "loss_d0": 0.07674749866127968, "step": 370 }, { "epoch": 38.0, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.07490594983100891, "loss_d0": 0.07461650408804417, "step": 380 }, { "epoch": 39.0, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.06459608674049377, "loss_d0": 0.06494694538414478, "step": 390 }, { "epoch": 40.0, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.0653274655342102, "loss_d0": 0.06574108265340328, "step": 400 }, { "epoch": 41.0, "grad_norm": 1.8046875, "learning_rate": 0.0002, "loss": 0.08319691419601441, "loss_d0": 0.0816740058362484, "step": 410 }, { "epoch": 42.0, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.058042091131210324, "loss_d0": 0.058284175023436545, "step": 420 }, { "epoch": 43.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.05881038308143616, "loss_d0": 0.05877893678843975, "step": 430 }, { "epoch": 44.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.05556913018226624, "loss_d0": 0.05579867213964462, "step": 440 }, { "epoch": 45.0, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0559271514415741, "loss_d0": 0.0562079343944788, "step": 450 }, { "epoch": 46.0, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0503437340259552, "loss_d0": 0.050457949936389926, "step": 460 }, { "epoch": 47.0, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.04916974902153015, "loss_d0": 0.0493311133235693, "step": 470 }, { "epoch": 48.0, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.060026037693023684, "loss_d0": 0.059200653806328773, "step": 480 }, { "epoch": 49.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.06555094122886658, "loss_d0": 0.06555219888687133, "step": 490 }, { "epoch": 50.0, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.07094892263412475, "loss_d0": 0.07080870307981968, "step": 500 }, { "epoch": 50.0, "eval_loss": 9.149801254272461, "eval_runtime": 0.6889, "eval_samples_per_second": 725.825, "eval_steps_per_second": 72.582, "step": 500 }, { "epoch": 51.0, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.07003722190856934, "loss_d0": 0.07078699246048928, "step": 510 }, { "epoch": 52.0, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0648545503616333, "loss_d0": 0.06463338956236839, "step": 520 }, { "epoch": 53.0, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.05929445028305054, "loss_d0": 0.0596495222300291, "step": 530 }, { "epoch": 54.0, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.05196449756622314, "loss_d0": 0.05247226879000664, "step": 540 }, { "epoch": 55.0, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.05879771709442139, "loss_d0": 0.05922210738062859, "step": 550 }, { "epoch": 56.0, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.06885148882865906, "loss_d0": 0.07016028575599194, "step": 560 }, { "epoch": 57.0, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.057416903972625735, "loss_d0": 0.05888371020555496, "step": 570 }, { "epoch": 58.0, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.057390010356903075, "loss_d0": 0.05848095864057541, "step": 580 }, { "epoch": 59.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.049796289205551146, "loss_d0": 0.05020042285323143, "step": 590 }, { "epoch": 60.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.05940237045288086, "loss_d0": 0.06075261794030666, "step": 600 }, { "epoch": 61.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.057738131284713744, "loss_d0": 0.05881649628281593, "step": 610 }, { "epoch": 62.0, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.062183260917663574, "loss_d0": 0.06322281733155251, "step": 620 }, { "epoch": 63.0, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.05927368402481079, "loss_d0": 0.0597139336168766, "step": 630 }, { "epoch": 64.0, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.058104443550109866, "loss_d0": 0.05876607708632946, "step": 640 }, { "epoch": 65.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.05966512560844421, "loss_d0": 0.060669278353452684, "step": 650 }, { "epoch": 66.0, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.05417026281356811, "loss_d0": 0.054636499658226964, "step": 660 }, { "epoch": 67.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.05017418265342712, "loss_d0": 0.050396521016955374, "step": 670 }, { "epoch": 68.0, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 0.04814895987510681, "loss_d0": 0.04857309609651565, "step": 680 }, { "epoch": 69.0, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.05414179563522339, "loss_d0": 0.05384636260569096, "step": 690 }, { "epoch": 70.0, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.05307164788246155, "loss_d0": 0.05355789102613926, "step": 700 }, { "epoch": 71.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.05436263084411621, "loss_d0": 0.05512550659477711, "step": 710 }, { "epoch": 72.0, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.05032788515090943, "loss_d0": 0.050449307262897494, "step": 720 }, { "epoch": 73.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.051280814409255984, "loss_d0": 0.05181795097887516, "step": 730 }, { "epoch": 74.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.05536478161811829, "loss_d0": 0.05605713278055191, "step": 740 }, { "epoch": 75.0, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.05402403473854065, "loss_d0": 0.054415644705295564, "step": 750 }, { "epoch": 76.0, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.057246971130371097, "loss_d0": 0.057024940848350525, "step": 760 }, { "epoch": 77.0, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.053191614151000974, "loss_d0": 0.053360605239868165, "step": 770 }, { "epoch": 78.0, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.05366742014884949, "loss_d0": 0.05328587256371975, "step": 780 }, { "epoch": 79.0, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.06317275166511535, "loss_d0": 0.06282185427844525, "step": 790 }, { "epoch": 80.0, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.07268043756484985, "loss_d0": 0.07160068228840828, "step": 800 }, { "epoch": 81.0, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.05127843022346497, "loss_d0": 0.051354449987411496, "step": 810 }, { "epoch": 82.0, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.045409074425697325, "loss_d0": 0.045082954317331315, "step": 820 }, { "epoch": 83.0, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.04334630072116852, "loss_d0": 0.04350667372345925, "step": 830 }, { "epoch": 84.0, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 0.042599648237228394, "loss_d0": 0.042730527743697164, "step": 840 }, { "epoch": 85.0, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.04336960911750794, "loss_d0": 0.043284989148378375, "step": 850 }, { "epoch": 86.0, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.04642752707004547, "loss_d0": 0.04698342382907868, "step": 860 }, { "epoch": 87.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.04236462116241455, "loss_d0": 0.04291092492640018, "step": 870 }, { "epoch": 88.0, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0467838853597641, "loss_d0": 0.04719291441142559, "step": 880 }, { "epoch": 89.0, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.056033474206924436, "loss_d0": 0.0564144778996706, "step": 890 }, { "epoch": 90.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.05397940874099731, "loss_d0": 0.05476293601095676, "step": 900 }, { "epoch": 91.0, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.04703973531723023, "loss_d0": 0.04744415730237961, "step": 910 }, { "epoch": 92.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.05714722275733948, "loss_d0": 0.05776938088238239, "step": 920 }, { "epoch": 93.0, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.049974143505096436, "loss_d0": 0.05034521222114563, "step": 930 }, { "epoch": 94.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.05092711448669433, "loss_d0": 0.0512014877051115, "step": 940 }, { "epoch": 95.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.04760122001171112, "loss_d0": 0.048098673298954966, "step": 950 }, { "epoch": 96.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.0453918844461441, "loss_d0": 0.04554104544222355, "step": 960 }, { "epoch": 97.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.045323750376701354, "loss_d0": 0.04565862752497196, "step": 970 }, { "epoch": 98.0, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.04769502282142639, "loss_d0": 0.047993503510951996, "step": 980 }, { "epoch": 99.0, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.04582420587539673, "loss_d0": 0.045967242866754535, "step": 990 }, { "epoch": 100.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.04648930430412292, "loss_d0": 0.04681434221565724, "step": 1000 }, { "epoch": 100.0, "eval_loss": 8.996453285217285, "eval_runtime": 0.6897, "eval_samples_per_second": 724.945, "eval_steps_per_second": 72.494, "step": 1000 }, { "epoch": 101.0, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.047931820154190063, "loss_d0": 0.048335249349474904, "step": 1010 }, { "epoch": 102.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.04393635094165802, "loss_d0": 0.04436287619173527, "step": 1020 }, { "epoch": 103.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.052803754806518555, "loss_d0": 0.05395218767225742, "step": 1030 }, { "epoch": 104.0, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.046474286913871767, "loss_d0": 0.046956886723637584, "step": 1040 }, { "epoch": 105.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.04853596985340118, "loss_d0": 0.0489469937980175, "step": 1050 }, { "epoch": 106.0, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.050303131341934204, "loss_d0": 0.050602763146162036, "step": 1060 }, { "epoch": 107.0, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.062167507410049436, "loss_d0": 0.06302939765155316, "step": 1070 }, { "epoch": 108.0, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.05572482943534851, "loss_d0": 0.0563100803643465, "step": 1080 }, { "epoch": 109.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.04392791986465454, "loss_d0": 0.04417993500828743, "step": 1090 }, { "epoch": 110.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.048283118009567264, "loss_d0": 0.048720812797546385, "step": 1100 }, { "epoch": 111.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.042201068997383115, "loss_d0": 0.04266498349606991, "step": 1110 }, { "epoch": 112.0, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 0.04001366794109344, "loss_d0": 0.040179040282964706, "step": 1120 }, { "epoch": 113.0, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 0.04639661908149719, "loss_d0": 0.04656643345952034, "step": 1130 }, { "epoch": 114.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.04307742714881897, "loss_d0": 0.04315165765583515, "step": 1140 }, { "epoch": 115.0, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.047644132375717164, "loss_d0": 0.04783033281564712, "step": 1150 }, { "epoch": 116.0, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.05251736044883728, "loss_d0": 0.05270914658904076, "step": 1160 }, { "epoch": 117.0, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0490668922662735, "loss_d0": 0.049511789530515674, "step": 1170 }, { "epoch": 118.0, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0441941112279892, "loss_d0": 0.04463861547410488, "step": 1180 }, { "epoch": 119.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.04082232713699341, "loss_d0": 0.0409642331302166, "step": 1190 }, { "epoch": 120.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.04445215463638306, "loss_d0": 0.044935400038957594, "step": 1200 }, { "epoch": 121.0, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.04362497329711914, "loss_d0": 0.04389031082391739, "step": 1210 }, { "epoch": 122.0, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.05719525814056396, "loss_d0": 0.05802029110491276, "step": 1220 }, { "epoch": 123.0, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.058700555562973024, "loss_d0": 0.059645514190196994, "step": 1230 }, { "epoch": 124.0, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.049953502416610715, "loss_d0": 0.05056100562214851, "step": 1240 }, { "epoch": 125.0, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.04485623240470886, "loss_d0": 0.045016249269247056, "step": 1250 }, { "epoch": 126.0, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.04192852973937988, "loss_d0": 0.0420475821942091, "step": 1260 }, { "epoch": 127.0, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0426062673330307, "loss_d0": 0.042881960049271584, "step": 1270 }, { "epoch": 128.0, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 0.04536706209182739, "loss_d0": 0.04527593217790127, "step": 1280 }, { "epoch": 129.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.05174095630645752, "loss_d0": 0.05235871635377407, "step": 1290 }, { "epoch": 130.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.042698603868484494, "loss_d0": 0.04338001646101475, "step": 1300 }, { "epoch": 131.0, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 0.04018429815769196, "loss_d0": 0.04045051150023937, "step": 1310 }, { "epoch": 132.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.039610669016838074, "loss_d0": 0.03991545438766479, "step": 1320 }, { "epoch": 133.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.04048936069011688, "loss_d0": 0.04079539142549038, "step": 1330 }, { "epoch": 134.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.042930704355239865, "loss_d0": 0.043228012323379514, "step": 1340 }, { "epoch": 135.0, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.047755110263824466, "loss_d0": 0.047895029187202454, "step": 1350 }, { "epoch": 136.0, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.055011457204818724, "loss_d0": 0.05567521676421165, "step": 1360 }, { "epoch": 137.0, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.05911533832550049, "loss_d0": 0.0588922031223774, "step": 1370 }, { "epoch": 138.0, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.07207316756248475, "loss_d0": 0.07146050035953522, "step": 1380 }, { "epoch": 139.0, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.051120662689208986, "loss_d0": 0.05109778419137001, "step": 1390 }, { "epoch": 140.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.05012030005455017, "loss_d0": 0.0504388976842165, "step": 1400 }, { "epoch": 141.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.042303305864334104, "loss_d0": 0.042646681889891624, "step": 1410 }, { "epoch": 142.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.04310626089572907, "loss_d0": 0.04322606287896633, "step": 1420 }, { "epoch": 143.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.041630563139915464, "loss_d0": 0.04180505834519863, "step": 1430 }, { "epoch": 144.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.04187849760055542, "loss_d0": 0.042144588008522985, "step": 1440 }, { "epoch": 145.0, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.04202188551425934, "loss_d0": 0.04231737479567528, "step": 1450 }, { "epoch": 146.0, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0439466267824173, "loss_d0": 0.044225719198584555, "step": 1460 }, { "epoch": 147.0, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.04756100177764892, "loss_d0": 0.04815598018467426, "step": 1470 }, { "epoch": 148.0, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.04747256338596344, "loss_d0": 0.04799098074436188, "step": 1480 }, { "epoch": 149.0, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.049269622564315795, "loss_d0": 0.04934872798621655, "step": 1490 }, { "epoch": 150.0, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.04529989957809448, "loss_d0": 0.045717564225196836, "step": 1500 }, { "epoch": 150.0, "eval_loss": 9.852365493774414, "eval_runtime": 0.6886, "eval_samples_per_second": 726.092, "eval_steps_per_second": 72.609, "step": 1500 }, { "epoch": 151.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.043574199080467224, "loss_d0": 0.04385456591844559, "step": 1510 }, { "epoch": 152.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.04235072135925293, "loss_d0": 0.04293657392263413, "step": 1520 }, { "epoch": 153.0, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.04195387065410614, "loss_d0": 0.042300010845065114, "step": 1530 }, { "epoch": 154.0, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.04051635265350342, "loss_d0": 0.040704548731446265, "step": 1540 }, { "epoch": 155.0, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 0.04044683873653412, "loss_d0": 0.04077310748398304, "step": 1550 }, { "epoch": 156.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.04552633166313171, "loss_d0": 0.0455584455281496, "step": 1560 }, { "epoch": 157.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.0467070460319519, "loss_d0": 0.04717182517051697, "step": 1570 }, { "epoch": 158.0, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.04355217814445496, "loss_d0": 0.04367000050842762, "step": 1580 }, { "epoch": 159.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.04726326167583465, "loss_d0": 0.047322430461645124, "step": 1590 }, { "epoch": 160.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.05183404088020325, "loss_d0": 0.051499960198998454, "step": 1600 }, { "epoch": 161.0, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.052065759897232056, "loss_d0": 0.05186050981283188, "step": 1610 }, { "epoch": 162.0, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.04411465525627136, "loss_d0": 0.04437471702694893, "step": 1620 }, { "epoch": 163.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.04183531403541565, "loss_d0": 0.04211582764983177, "step": 1630 }, { "epoch": 164.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.04270436465740204, "loss_d0": 0.04318705834448337, "step": 1640 }, { "epoch": 165.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.04538227915763855, "loss_d0": 0.04579868800938129, "step": 1650 }, { "epoch": 166.0, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.04416545033454895, "loss_d0": 0.04440648853778839, "step": 1660 }, { "epoch": 167.0, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0445246160030365, "loss_d0": 0.04493884444236755, "step": 1670 }, { "epoch": 168.0, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 0.04588344693183899, "loss_d0": 0.046438657119870184, "step": 1680 }, { "epoch": 169.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.04363830387592316, "loss_d0": 0.04401036873459816, "step": 1690 }, { "epoch": 170.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.041232901811599734, "loss_d0": 0.04151614680886269, "step": 1700 }, { "epoch": 171.0, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 0.03885977864265442, "loss_d0": 0.039185041561722755, "step": 1710 }, { "epoch": 172.0, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.0402399480342865, "loss_d0": 0.040488839522004126, "step": 1720 }, { "epoch": 173.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.04318315982818603, "loss_d0": 0.04342842325568199, "step": 1730 }, { "epoch": 174.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.040678870677947995, "loss_d0": 0.04102524146437645, "step": 1740 }, { "epoch": 175.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.040768089890480044, "loss_d0": 0.04122583419084549, "step": 1750 }, { "epoch": 176.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.042663860321044925, "loss_d0": 0.04297072477638721, "step": 1760 }, { "epoch": 177.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.04289932250976562, "loss_d0": 0.04324173927307129, "step": 1770 }, { "epoch": 178.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.04065240621566772, "loss_d0": 0.04108826108276844, "step": 1780 }, { "epoch": 179.0, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.04117431342601776, "loss_d0": 0.04153142869472504, "step": 1790 }, { "epoch": 180.0, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 0.03892770111560821, "loss_d0": 0.03927576504647732, "step": 1800 }, { "epoch": 181.0, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.04309303760528564, "loss_d0": 0.04333780445158482, "step": 1810 }, { "epoch": 182.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.04459149837493896, "loss_d0": 0.04491332247853279, "step": 1820 }, { "epoch": 183.0, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.046898290514945984, "loss_d0": 0.047189544141292575, "step": 1830 }, { "epoch": 184.0, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.04611060917377472, "loss_d0": 0.04624026641249657, "step": 1840 }, { "epoch": 185.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.04319146275520325, "loss_d0": 0.043569745123386384, "step": 1850 }, { "epoch": 186.0, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.04202109277248382, "loss_d0": 0.04252335019409657, "step": 1860 }, { "epoch": 187.0, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 0.04427561163902283, "loss_d0": 0.04493751563131809, "step": 1870 }, { "epoch": 188.0, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.04414681792259216, "loss_d0": 0.044181046262383464, "step": 1880 }, { "epoch": 189.0, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.05471844673156738, "loss_d0": 0.05483967214822769, "step": 1890 }, { "epoch": 190.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.04385135769844055, "loss_d0": 0.04406722001731396, "step": 1900 }, { "epoch": 191.0, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.04954864680767059, "loss_d0": 0.04996456205844879, "step": 1910 }, { "epoch": 192.0, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.04157604873180389, "loss_d0": 0.042001275718212126, "step": 1920 }, { "epoch": 193.0, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.03958460390567779, "loss_d0": 0.040064525604248044, "step": 1930 }, { "epoch": 194.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.03781185150146484, "loss_d0": 0.03814610652625561, "step": 1940 }, { "epoch": 195.0, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.038428235054016116, "loss_d0": 0.038876712694764136, "step": 1950 }, { "epoch": 196.0, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.039642858505249026, "loss_d0": 0.040048804879188535, "step": 1960 }, { "epoch": 197.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.040806761384010314, "loss_d0": 0.04136274456977844, "step": 1970 }, { "epoch": 198.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.040471208095550534, "loss_d0": 0.04059889316558838, "step": 1980 }, { "epoch": 199.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.039885866641998294, "loss_d0": 0.040052902325987814, "step": 1990 }, { "epoch": 200.0, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0434266984462738, "loss_d0": 0.043684659898281096, "step": 2000 }, { "epoch": 200.0, "eval_loss": 10.189282417297363, "eval_runtime": 0.6861, "eval_samples_per_second": 728.743, "eval_steps_per_second": 72.874, "step": 2000 }, { "epoch": 201.0, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.04809910655021667, "loss_d0": 0.04849519394338131, "step": 2010 }, { "epoch": 202.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.041577893495559695, "loss_d0": 0.041943120583891866, "step": 2020 }, { "epoch": 203.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.04084473252296448, "loss_d0": 0.04086658768355846, "step": 2030 }, { "epoch": 204.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.03976602852344513, "loss_d0": 0.03992565609514713, "step": 2040 }, { "epoch": 205.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.04404784142971039, "loss_d0": 0.043900683894753455, "step": 2050 }, { "epoch": 206.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.04731652736663818, "loss_d0": 0.04724227301776409, "step": 2060 }, { "epoch": 207.0, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 0.04152216017246246, "loss_d0": 0.04180925637483597, "step": 2070 }, { "epoch": 208.0, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.04118179380893707, "loss_d0": 0.04148880951106548, "step": 2080 }, { "epoch": 209.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.04137121140956879, "loss_d0": 0.04159573912620544, "step": 2090 }, { "epoch": 210.0, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.044093775749206546, "loss_d0": 0.0443379782140255, "step": 2100 }, { "epoch": 211.0, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.04798535108566284, "loss_d0": 0.04776673950254917, "step": 2110 }, { "epoch": 212.0, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.05737084746360779, "loss_d0": 0.05612550266087055, "step": 2120 }, { "epoch": 213.0, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.046094492077827454, "loss_d0": 0.04582822136580944, "step": 2130 }, { "epoch": 214.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.04114535450935364, "loss_d0": 0.04123819507658481, "step": 2140 }, { "epoch": 215.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.039699801802635194, "loss_d0": 0.040003697574138644, "step": 2150 }, { "epoch": 216.0, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 0.03862698972225189, "loss_d0": 0.03893596157431602, "step": 2160 }, { "epoch": 217.0, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 0.043020579218864444, "loss_d0": 0.0432659212499857, "step": 2170 }, { "epoch": 218.0, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 0.04398588240146637, "loss_d0": 0.04368347264826298, "step": 2180 }, { "epoch": 219.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.040211325883865355, "loss_d0": 0.040444132313132285, "step": 2190 }, { "epoch": 220.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.04232992231845856, "loss_d0": 0.04273700416088104, "step": 2200 }, { "epoch": 221.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.03988331258296966, "loss_d0": 0.040199489891529085, "step": 2210 }, { "epoch": 222.0, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.03946065902709961, "loss_d0": 0.03989113420248032, "step": 2220 }, { "epoch": 223.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.038988009095191956, "loss_d0": 0.03922968059778213, "step": 2230 }, { "epoch": 224.0, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.039998382329940796, "loss_d0": 0.040379610285162924, "step": 2240 }, { "epoch": 225.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.041840368509292604, "loss_d0": 0.042378640919923785, "step": 2250 }, { "epoch": 226.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.04227463901042938, "loss_d0": 0.04284324869513512, "step": 2260 }, { "epoch": 227.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.04585360586643219, "loss_d0": 0.045701490342617036, "step": 2270 }, { "epoch": 228.0, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.0420985072851181, "loss_d0": 0.042375285923480985, "step": 2280 }, { "epoch": 229.0, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.04348675310611725, "loss_d0": 0.043607931956648825, "step": 2290 }, { "epoch": 230.0, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.043425142765045166, "loss_d0": 0.04386321604251862, "step": 2300 }, { "epoch": 231.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.05011132955551147, "loss_d0": 0.05058671832084656, "step": 2310 }, { "epoch": 232.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.04243658483028412, "loss_d0": 0.04289327785372734, "step": 2320 }, { "epoch": 233.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.042480701208114625, "loss_d0": 0.04278766848146916, "step": 2330 }, { "epoch": 234.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.03948282897472381, "loss_d0": 0.03976398035883903, "step": 2340 }, { "epoch": 235.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.03911280632019043, "loss_d0": 0.03939221054315567, "step": 2350 }, { "epoch": 236.0, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 0.04534493386745453, "loss_d0": 0.0454285766929388, "step": 2360 }, { "epoch": 237.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.04498372673988342, "loss_d0": 0.04524303488433361, "step": 2370 }, { "epoch": 238.0, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 0.04036388099193573, "loss_d0": 0.04051109738647938, "step": 2380 }, { "epoch": 239.0, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 0.03902736306190491, "loss_d0": 0.039412683621048925, "step": 2390 }, { "epoch": 240.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.03764301538467407, "loss_d0": 0.037954670190811154, "step": 2400 }, { "epoch": 241.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.03973473310470581, "loss_d0": 0.03991707712411881, "step": 2410 }, { "epoch": 242.0, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 0.03933502435684204, "loss_d0": 0.03948218524456024, "step": 2420 }, { "epoch": 243.0, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.04040732085704803, "loss_d0": 0.040525125712156294, "step": 2430 }, { "epoch": 244.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.04274722635746002, "loss_d0": 0.04287994578480721, "step": 2440 }, { "epoch": 245.0, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 0.04024344384670257, "loss_d0": 0.04048874229192734, "step": 2450 }, { "epoch": 246.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.040749162435531616, "loss_d0": 0.04102331958711147, "step": 2460 }, { "epoch": 247.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.0426899254322052, "loss_d0": 0.04291442297399044, "step": 2470 }, { "epoch": 248.0, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.039026832580566405, "loss_d0": 0.03924530446529388, "step": 2480 }, { "epoch": 249.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.0383389413356781, "loss_d0": 0.03859546259045601, "step": 2490 }, { "epoch": 250.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.038316363096237184, "loss_d0": 0.03865836299955845, "step": 2500 }, { "epoch": 250.0, "eval_loss": 10.885120391845703, "eval_runtime": 0.6922, "eval_samples_per_second": 722.309, "eval_steps_per_second": 72.231, "step": 2500 }, { "epoch": 251.0, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 0.03913344144821167, "loss_d0": 0.039387579634785654, "step": 2510 }, { "epoch": 252.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.03892936110496521, "loss_d0": 0.03930997662246227, "step": 2520 }, { "epoch": 253.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.037770673632621765, "loss_d0": 0.038118017837405205, "step": 2530 }, { "epoch": 254.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.037772169709205626, "loss_d0": 0.03807541318237782, "step": 2540 }, { "epoch": 255.0, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.03999180793762207, "loss_d0": 0.04036150127649307, "step": 2550 }, { "epoch": 256.0, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.052530336380004886, "loss_d0": 0.05343040004372597, "step": 2560 }, { "epoch": 257.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.04317347705364227, "loss_d0": 0.04373372159898281, "step": 2570 }, { "epoch": 258.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03994796574115753, "loss_d0": 0.04031400717794895, "step": 2580 }, { "epoch": 259.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.038785922527313235, "loss_d0": 0.039084702357649805, "step": 2590 }, { "epoch": 260.0, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.039844360947608945, "loss_d0": 0.040255676582455636, "step": 2600 }, { "epoch": 261.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.04429091215133667, "loss_d0": 0.04446314424276352, "step": 2610 }, { "epoch": 262.0, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.04119807183742523, "loss_d0": 0.0414251770824194, "step": 2620 }, { "epoch": 263.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.03950552344322204, "loss_d0": 0.039706287905573845, "step": 2630 }, { "epoch": 264.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.0390357106924057, "loss_d0": 0.039455119892954825, "step": 2640 }, { "epoch": 265.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.03879677653312683, "loss_d0": 0.0390281654894352, "step": 2650 }, { "epoch": 266.0, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.039793723821640016, "loss_d0": 0.04023738354444504, "step": 2660 }, { "epoch": 267.0, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 0.037570255994796756, "loss_d0": 0.037908059731125834, "step": 2670 }, { "epoch": 268.0, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 0.03747315108776093, "loss_d0": 0.03773516528308392, "step": 2680 }, { "epoch": 269.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.03748213052749634, "loss_d0": 0.03784133456647396, "step": 2690 }, { "epoch": 270.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.0373074471950531, "loss_d0": 0.03761020861566067, "step": 2700 }, { "epoch": 271.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.03723294138908386, "loss_d0": 0.03757789246737957, "step": 2710 }, { "epoch": 272.0, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 0.0372806191444397, "loss_d0": 0.03759502917528153, "step": 2720 }, { "epoch": 273.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.037115806341171266, "loss_d0": 0.037448635697364806, "step": 2730 }, { "epoch": 274.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.0369686633348465, "loss_d0": 0.037282370403409, "step": 2740 }, { "epoch": 275.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.03698193728923797, "loss_d0": 0.03725597597658634, "step": 2750 }, { "epoch": 276.0, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 0.037520098686218264, "loss_d0": 0.03782733231782913, "step": 2760 }, { "epoch": 277.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.03804133534431457, "loss_d0": 0.038362907245755196, "step": 2770 }, { "epoch": 278.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.037969928979873654, "loss_d0": 0.03825241588056087, "step": 2780 }, { "epoch": 279.0, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 0.037792128324508664, "loss_d0": 0.03807285577058792, "step": 2790 }, { "epoch": 280.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.03775832355022431, "loss_d0": 0.03801813460886479, "step": 2800 }, { "epoch": 281.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.03686901330947876, "loss_d0": 0.037187918275594714, "step": 2810 }, { "epoch": 282.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.03696819245815277, "loss_d0": 0.037317240983247756, "step": 2820 }, { "epoch": 283.0, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 0.037478744983673096, "loss_d0": 0.03774147853255272, "step": 2830 }, { "epoch": 284.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.040481334924697875, "loss_d0": 0.04085813723504543, "step": 2840 }, { "epoch": 285.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.04263193607330322, "loss_d0": 0.04322305843234062, "step": 2850 }, { "epoch": 286.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.04221307337284088, "loss_d0": 0.04266056790947914, "step": 2860 }, { "epoch": 287.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.04012386798858643, "loss_d0": 0.04027114436030388, "step": 2870 }, { "epoch": 288.0, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.04546632468700409, "loss_d0": 0.045037579536437986, "step": 2880 }, { "epoch": 289.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.04220171272754669, "loss_d0": 0.04206137731671333, "step": 2890 }, { "epoch": 290.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.03849797248840332, "loss_d0": 0.03877202942967415, "step": 2900 }, { "epoch": 291.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.03816980719566345, "loss_d0": 0.038458903506398204, "step": 2910 }, { "epoch": 292.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.039384329319000246, "loss_d0": 0.03953510671854019, "step": 2920 }, { "epoch": 293.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.03914114236831665, "loss_d0": 0.03927744776010513, "step": 2930 }, { "epoch": 294.0, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.037627822160720824, "loss_d0": 0.03790898621082306, "step": 2940 }, { "epoch": 295.0, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 0.037601858377456665, "loss_d0": 0.03786470964550972, "step": 2950 }, { "epoch": 296.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.03714042901992798, "loss_d0": 0.03739793673157692, "step": 2960 }, { "epoch": 297.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.037202891707420346, "loss_d0": 0.03755674138665199, "step": 2970 }, { "epoch": 298.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.03793781399726868, "loss_d0": 0.03822383023798466, "step": 2980 }, { "epoch": 299.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.038024306297302246, "loss_d0": 0.03834304548799992, "step": 2990 }, { "epoch": 300.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.03749783039093017, "loss_d0": 0.03785845525562763, "step": 3000 }, { "epoch": 300.0, "eval_loss": 11.146424293518066, "eval_runtime": 0.6864, "eval_samples_per_second": 728.387, "eval_steps_per_second": 72.839, "step": 3000 }, { "epoch": 301.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.038469833135604856, "loss_d0": 0.03886338211596012, "step": 3010 }, { "epoch": 302.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.04090344309806824, "loss_d0": 0.04116071537137032, "step": 3020 }, { "epoch": 303.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.040218299627304076, "loss_d0": 0.04055294916033745, "step": 3030 }, { "epoch": 304.0, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.04446632564067841, "loss_d0": 0.04473265036940575, "step": 3040 }, { "epoch": 305.0, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.04086683392524719, "loss_d0": 0.04126431494951248, "step": 3050 }, { "epoch": 306.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.048713570833206175, "loss_d0": 0.04855058118700981, "step": 3060 }, { "epoch": 307.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.046805566549301146, "loss_d0": 0.04710230566561222, "step": 3070 }, { "epoch": 308.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.03934524655342102, "loss_d0": 0.039762004464864734, "step": 3080 }, { "epoch": 309.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.03915072977542877, "loss_d0": 0.03944177031517029, "step": 3090 }, { "epoch": 310.0, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.04065064489841461, "loss_d0": 0.04100923091173172, "step": 3100 }, { "epoch": 311.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.037496811151504515, "loss_d0": 0.03782621137797833, "step": 3110 }, { "epoch": 312.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.037671661376953124, "loss_d0": 0.038110511004924776, "step": 3120 }, { "epoch": 313.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.0369190514087677, "loss_d0": 0.03722741194069386, "step": 3130 }, { "epoch": 314.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.036951732635498044, "loss_d0": 0.03724584951996803, "step": 3140 }, { "epoch": 315.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.03681913614273071, "loss_d0": 0.03710218816995621, "step": 3150 }, { "epoch": 316.0, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 0.036740392446517944, "loss_d0": 0.03705124892294407, "step": 3160 }, { "epoch": 317.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.03693808317184448, "loss_d0": 0.03728438019752502, "step": 3170 }, { "epoch": 318.0, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.03715154826641083, "loss_d0": 0.03744188435375691, "step": 3180 }, { "epoch": 319.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.03702226579189301, "loss_d0": 0.037342607975006104, "step": 3190 }, { "epoch": 320.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.03733651638031006, "loss_d0": 0.037650084123015404, "step": 3200 }, { "epoch": 321.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.03700153231620788, "loss_d0": 0.03734440542757511, "step": 3210 }, { "epoch": 322.0, "grad_norm": 0.2392578125, "learning_rate": 0.0002, "loss": 0.03706228733062744, "loss_d0": 0.037381384521722794, "step": 3220 }, { "epoch": 323.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.036909821629524234, "loss_d0": 0.03724093846976757, "step": 3230 }, { "epoch": 324.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.03717797100543976, "loss_d0": 0.03748043179512024, "step": 3240 }, { "epoch": 325.0, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.03671996295452118, "loss_d0": 0.03704087920486927, "step": 3250 }, { "epoch": 326.0, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 0.03725916743278503, "loss_d0": 0.03758593760430813, "step": 3260 }, { "epoch": 327.0, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 0.037097156047821045, "loss_d0": 0.03740981854498386, "step": 3270 }, { "epoch": 328.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.037044870853424075, "loss_d0": 0.03732852153480053, "step": 3280 }, { "epoch": 329.0, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 0.037512749433517456, "loss_d0": 0.03780471496284008, "step": 3290 }, { "epoch": 330.0, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.03871379792690277, "loss_d0": 0.03908683769404888, "step": 3300 }, { "epoch": 331.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.038561710715293886, "loss_d0": 0.038924089074134825, "step": 3310 }, { "epoch": 332.0, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.04439712464809418, "loss_d0": 0.04472551830112934, "step": 3320 }, { "epoch": 333.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.04164994060993195, "loss_d0": 0.041940994933247565, "step": 3330 }, { "epoch": 334.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.040698114037513736, "loss_d0": 0.04085970595479012, "step": 3340 }, { "epoch": 335.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.03932968080043793, "loss_d0": 0.03956272974610329, "step": 3350 }, { "epoch": 336.0, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 0.0391847550868988, "loss_d0": 0.03948534913361072, "step": 3360 }, { "epoch": 337.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.047348752617836, "loss_d0": 0.046868476271629336, "step": 3370 }, { "epoch": 338.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.0379530131816864, "loss_d0": 0.0382020853459835, "step": 3380 }, { "epoch": 339.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.04256679117679596, "loss_d0": 0.04314272291958332, "step": 3390 }, { "epoch": 340.0, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.056311219930648804, "loss_d0": 0.05685732625424862, "step": 3400 }, { "epoch": 341.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.04154669046401978, "loss_d0": 0.04171246141195297, "step": 3410 }, { "epoch": 342.0, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 0.04307686388492584, "loss_d0": 0.04326325096189976, "step": 3420 }, { "epoch": 343.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.04260125458240509, "loss_d0": 0.04288202822208405, "step": 3430 }, { "epoch": 344.0, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 0.04342672824859619, "loss_d0": 0.043964647501707074, "step": 3440 }, { "epoch": 345.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.041104856133461, "loss_d0": 0.04152129665017128, "step": 3450 }, { "epoch": 346.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.0384725421667099, "loss_d0": 0.03876579888164997, "step": 3460 }, { "epoch": 347.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.04155246317386627, "loss_d0": 0.04204757548868656, "step": 3470 }, { "epoch": 348.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.04175013601779938, "loss_d0": 0.04206421263515949, "step": 3480 }, { "epoch": 349.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.03953442573547363, "loss_d0": 0.03970748074352741, "step": 3490 }, { "epoch": 350.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.03868723213672638, "loss_d0": 0.03908204138278961, "step": 3500 }, { "epoch": 350.0, "eval_loss": 10.020953178405762, "eval_runtime": 0.6866, "eval_samples_per_second": 728.252, "eval_steps_per_second": 72.825, "step": 3500 }, { "epoch": 351.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.04133128821849823, "loss_d0": 0.041662900149822234, "step": 3510 }, { "epoch": 352.0, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.04269187152385712, "loss_d0": 0.04334259107708931, "step": 3520 }, { "epoch": 353.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.03852761685848236, "loss_d0": 0.03878095783293247, "step": 3530 }, { "epoch": 354.0, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.04052022397518158, "loss_d0": 0.04095298685133457, "step": 3540 }, { "epoch": 355.0, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 0.04115490317344665, "loss_d0": 0.041552980244159696, "step": 3550 }, { "epoch": 356.0, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.042296862602233885, "loss_d0": 0.04288714602589607, "step": 3560 }, { "epoch": 357.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.041248321533203125, "loss_d0": 0.04149158634245396, "step": 3570 }, { "epoch": 358.0, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 0.04039554595947266, "loss_d0": 0.04083836451172829, "step": 3580 }, { "epoch": 359.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.039502471685409546, "loss_d0": 0.04000399447977543, "step": 3590 }, { "epoch": 360.0, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.043854904174804685, "loss_d0": 0.04427521526813507, "step": 3600 }, { "epoch": 361.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.040008130669593814, "loss_d0": 0.04016649015247822, "step": 3610 }, { "epoch": 362.0, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.03769044280052185, "loss_d0": 0.03798259571194649, "step": 3620 }, { "epoch": 363.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.039430353045463565, "loss_d0": 0.039879053831100464, "step": 3630 }, { "epoch": 364.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.03827457427978516, "loss_d0": 0.03869166634976864, "step": 3640 }, { "epoch": 365.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.037254220247268675, "loss_d0": 0.03753783367574215, "step": 3650 }, { "epoch": 366.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.036964389681816104, "loss_d0": 0.037302806973457336, "step": 3660 }, { "epoch": 367.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.036746549606323245, "loss_d0": 0.03708359859883785, "step": 3670 }, { "epoch": 368.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.0370238333940506, "loss_d0": 0.037373238056898114, "step": 3680 }, { "epoch": 369.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.03776443004608154, "loss_d0": 0.038052943721413615, "step": 3690 }, { "epoch": 370.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.037547925114631654, "loss_d0": 0.037854228913784024, "step": 3700 }, { "epoch": 371.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.03720632791519165, "loss_d0": 0.03753346242010593, "step": 3710 }, { "epoch": 372.0, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 0.0377034991979599, "loss_d0": 0.037998438253998755, "step": 3720 }, { "epoch": 373.0, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.03689497411251068, "loss_d0": 0.03722812980413437, "step": 3730 }, { "epoch": 374.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.036770951747894284, "loss_d0": 0.03706214055418968, "step": 3740 }, { "epoch": 375.0, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.03705790340900421, "loss_d0": 0.03742879740893841, "step": 3750 }, { "epoch": 376.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.0367428719997406, "loss_d0": 0.037059960514307023, "step": 3760 }, { "epoch": 377.0, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.036790531873703, "loss_d0": 0.03712297640740871, "step": 3770 }, { "epoch": 378.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.03700532913208008, "loss_d0": 0.03728572316467762, "step": 3780 }, { "epoch": 379.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.03674401640892029, "loss_d0": 0.03708376474678517, "step": 3790 }, { "epoch": 380.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.037165766954421996, "loss_d0": 0.037469035014510155, "step": 3800 }, { "epoch": 381.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.03820574879646301, "loss_d0": 0.038526909053325654, "step": 3810 }, { "epoch": 382.0, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 0.03871320784091949, "loss_d0": 0.03914758861064911, "step": 3820 }, { "epoch": 383.0, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 0.03733502924442291, "loss_d0": 0.03770691566169262, "step": 3830 }, { "epoch": 384.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.03720555305480957, "loss_d0": 0.03752491697669029, "step": 3840 }, { "epoch": 385.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.039854270219802854, "loss_d0": 0.04018958024680615, "step": 3850 }, { "epoch": 386.0, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 0.03766619563102722, "loss_d0": 0.037941229343414304, "step": 3860 }, { "epoch": 387.0, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 0.03740113973617554, "loss_d0": 0.0377108845859766, "step": 3870 }, { "epoch": 388.0, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 0.0372713029384613, "loss_d0": 0.0375568337738514, "step": 3880 }, { "epoch": 389.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.03675893843173981, "loss_d0": 0.037108558043837545, "step": 3890 }, { "epoch": 390.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.037296104431152347, "loss_d0": 0.0375242929905653, "step": 3900 }, { "epoch": 391.0, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 0.03685269951820373, "loss_d0": 0.0371894758194685, "step": 3910 }, { "epoch": 392.0, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 0.036673200130462644, "loss_d0": 0.036988198012113574, "step": 3920 }, { "epoch": 393.0, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 0.036616355180740356, "loss_d0": 0.03695385381579399, "step": 3930 }, { "epoch": 394.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.036866238713264464, "loss_d0": 0.03716698214411736, "step": 3940 }, { "epoch": 395.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03705916702747345, "loss_d0": 0.03739931918680668, "step": 3950 }, { "epoch": 396.0, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 0.0367851734161377, "loss_d0": 0.037085448205471036, "step": 3960 }, { "epoch": 397.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.03684147596359253, "loss_d0": 0.03716961406171322, "step": 3970 }, { "epoch": 398.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.03675464391708374, "loss_d0": 0.03707803189754486, "step": 3980 }, { "epoch": 399.0, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.03699171245098114, "loss_d0": 0.03729988299310207, "step": 3990 }, { "epoch": 400.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.03692775666713714, "loss_d0": 0.037251610308885574, "step": 4000 }, { "epoch": 400.0, "eval_loss": 11.901162147521973, "eval_runtime": 0.6874, "eval_samples_per_second": 727.37, "eval_steps_per_second": 72.737, "step": 4000 }, { "epoch": 401.0, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 0.03675454556941986, "loss_d0": 0.03709004819393158, "step": 4010 }, { "epoch": 402.0, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.036843341588973996, "loss_d0": 0.03714859746396541, "step": 4020 }, { "epoch": 403.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.03684849143028259, "loss_d0": 0.03714573718607426, "step": 4030 }, { "epoch": 404.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.03681612014770508, "loss_d0": 0.037144556641578674, "step": 4040 }, { "epoch": 405.0, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 0.03668281435966492, "loss_d0": 0.03700208105146885, "step": 4050 }, { "epoch": 406.0, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.03672113716602325, "loss_d0": 0.03706220649182797, "step": 4060 }, { "epoch": 407.0, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 0.03671231269836426, "loss_d0": 0.03705513551831245, "step": 4070 }, { "epoch": 408.0, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 0.0371286153793335, "loss_d0": 0.037407181411981585, "step": 4080 }, { "epoch": 409.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.036925724148750304, "loss_d0": 0.037227736040949824, "step": 4090 }, { "epoch": 410.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.03719911873340607, "loss_d0": 0.03746572397649288, "step": 4100 }, { "epoch": 411.0, "grad_norm": 0.31640625, "learning_rate": 0.00019995559043291586, "loss": 0.03746194541454315, "loss_d0": 0.03776235654950142, "step": 4110 }, { "epoch": 412.0, "grad_norm": 0.248046875, "learning_rate": 0.0001998021321462845, "loss": 0.03715557157993317, "loss_d0": 0.03750314898788929, "step": 4120 }, { "epoch": 413.0, "grad_norm": 0.2578125, "learning_rate": 0.00019953926379459095, "loss": 0.037144222855567934, "loss_d0": 0.03752333410084248, "step": 4130 }, { "epoch": 414.0, "grad_norm": 0.337890625, "learning_rate": 0.00019916730564242994, "loss": 0.037147408723831175, "loss_d0": 0.03745650127530098, "step": 4140 }, { "epoch": 415.0, "grad_norm": 0.62109375, "learning_rate": 0.00019868671086351413, "loss": 0.03663991689682007, "loss_d0": 0.03694714643061161, "step": 4150 }, { "epoch": 416.0, "grad_norm": 0.33984375, "learning_rate": 0.00019809806498855166, "loss": 0.03903592824935913, "loss_d0": 0.03945017009973526, "step": 4160 }, { "epoch": 417.0, "grad_norm": 0.8515625, "learning_rate": 0.00019740208519186726, "loss": 0.05429054498672485, "loss_d0": 0.05633100271224976, "step": 4170 }, { "epoch": 418.0, "grad_norm": 0.67578125, "learning_rate": 0.0001965996194176357, "loss": 0.05051107406616211, "loss_d0": 0.05182218365371227, "step": 4180 }, { "epoch": 419.0, "grad_norm": 0.357421875, "learning_rate": 0.00019569164534679248, "loss": 0.038011634349823, "loss_d0": 0.038474849238991735, "step": 4190 }, { "epoch": 420.0, "grad_norm": 0.318359375, "learning_rate": 0.0001946792692058803, "loss": 0.036821508407592775, "loss_d0": 0.037107934802770616, "step": 4200 }, { "epoch": 421.0, "grad_norm": 0.2890625, "learning_rate": 0.00019356372441928221, "loss": 0.03671710193157196, "loss_d0": 0.03702742531895638, "step": 4210 }, { "epoch": 422.0, "grad_norm": 0.267578125, "learning_rate": 0.00019234637010648426, "loss": 0.03749249279499054, "loss_d0": 0.03791201822459698, "step": 4220 }, { "epoch": 423.0, "grad_norm": 0.259765625, "learning_rate": 0.00019102868942619743, "loss": 0.039152055978775024, "loss_d0": 0.0393472570925951, "step": 4230 }, { "epoch": 424.0, "grad_norm": 0.310546875, "learning_rate": 0.00018961228776935755, "loss": 0.03920052945613861, "loss_d0": 0.03954476937651634, "step": 4240 }, { "epoch": 425.0, "grad_norm": 0.39453125, "learning_rate": 0.00018809889080320357, "loss": 0.04085721671581268, "loss_d0": 0.0409322090446949, "step": 4250 }, { "epoch": 426.0, "grad_norm": 0.419921875, "learning_rate": 0.00018649034236881777, "loss": 0.03923974931240082, "loss_d0": 0.03957121372222901, "step": 4260 }, { "epoch": 427.0, "grad_norm": 0.3828125, "learning_rate": 0.00018478860223468955, "loss": 0.03778021037578583, "loss_d0": 0.03809101954102516, "step": 4270 }, { "epoch": 428.0, "grad_norm": 0.46875, "learning_rate": 0.0001829957437090394, "loss": 0.03898613452911377, "loss_d0": 0.03935887552797794, "step": 4280 }, { "epoch": 429.0, "grad_norm": 0.259765625, "learning_rate": 0.00018111395111381214, "loss": 0.03973522186279297, "loss_d0": 0.03986812345683575, "step": 4290 }, { "epoch": 430.0, "grad_norm": 0.345703125, "learning_rate": 0.00017914551712341713, "loss": 0.038596144318580626, "loss_d0": 0.03911666721105576, "step": 4300 }, { "epoch": 431.0, "grad_norm": 0.279296875, "learning_rate": 0.0001770928399714576, "loss": 0.03771106004714966, "loss_d0": 0.038051551580429076, "step": 4310 }, { "epoch": 432.0, "grad_norm": 0.345703125, "learning_rate": 0.0001749584205288526, "loss": 0.03960946798324585, "loss_d0": 0.03986733630299568, "step": 4320 }, { "epoch": 433.0, "grad_norm": 0.57421875, "learning_rate": 0.00017274485925691083, "loss": 0.03941147327423096, "loss_d0": 0.039736605063080785, "step": 4330 }, { "epoch": 434.0, "grad_norm": 0.3671875, "learning_rate": 0.00017045485303906913, "loss": 0.0394733875989914, "loss_d0": 0.03990803770720959, "step": 4340 }, { "epoch": 435.0, "grad_norm": 0.3671875, "learning_rate": 0.00016809119189515557, "loss": 0.03905892372131348, "loss_d0": 0.03944002017378807, "step": 4350 }, { "epoch": 436.0, "grad_norm": 0.28125, "learning_rate": 0.00016565675558217989, "loss": 0.037955057621002194, "loss_d0": 0.038193025067448615, "step": 4360 }, { "epoch": 437.0, "grad_norm": 0.90625, "learning_rate": 0.00016315451008579328, "loss": 0.05242310762405396, "loss_d0": 0.05061047412455082, "step": 4370 }, { "epoch": 438.0, "grad_norm": 0.23828125, "learning_rate": 0.00016058750400669178, "loss": 0.0368131011724472, "loss_d0": 0.03710653893649578, "step": 4380 }, { "epoch": 439.0, "grad_norm": 0.423828125, "learning_rate": 0.0001579588648463657, "loss": 0.036599275469779965, "loss_d0": 0.03693968802690506, "step": 4390 }, { "epoch": 440.0, "grad_norm": 0.28125, "learning_rate": 0.00015527179519672117, "loss": 0.036560848355293274, "loss_d0": 0.03687223196029663, "step": 4400 }, { "epoch": 441.0, "grad_norm": 0.28125, "learning_rate": 0.00015252956883821488, "loss": 0.03625948429107666, "loss_d0": 0.03659016117453575, "step": 4410 }, { "epoch": 442.0, "grad_norm": 0.298828125, "learning_rate": 0.00014973552675125708, "loss": 0.036302709579467775, "loss_d0": 0.03660444766283035, "step": 4420 }, { "epoch": 443.0, "grad_norm": 0.275390625, "learning_rate": 0.00014689307304574154, "loss": 0.03645941019058228, "loss_d0": 0.036814498528838155, "step": 4430 }, { "epoch": 444.0, "grad_norm": 0.318359375, "learning_rate": 0.00014400567081366205, "loss": 0.03634356260299683, "loss_d0": 0.03664385080337525, "step": 4440 }, { "epoch": 445.0, "grad_norm": 0.296875, "learning_rate": 0.00014107683790986813, "loss": 0.03630726635456085, "loss_d0": 0.03658915832638741, "step": 4450 }, { "epoch": 446.0, "grad_norm": 0.283203125, "learning_rate": 0.00013811014266610096, "loss": 0.036189505457878114, "loss_d0": 0.03651743419468403, "step": 4460 }, { "epoch": 447.0, "grad_norm": 0.341796875, "learning_rate": 0.00013510919954353066, "loss": 0.03628252744674683, "loss_d0": 0.03659649156033993, "step": 4470 }, { "epoch": 448.0, "grad_norm": 0.2392578125, "learning_rate": 0.00013207766472909225, "loss": 0.03624842762947082, "loss_d0": 0.0365591075271368, "step": 4480 }, { "epoch": 449.0, "grad_norm": 0.2578125, "learning_rate": 0.000129019231680985, "loss": 0.03611701428890228, "loss_d0": 0.03644072562456131, "step": 4490 }, { "epoch": 450.0, "grad_norm": 0.3046875, "learning_rate": 0.0001259376266287625, "loss": 0.036150026321411136, "loss_d0": 0.0364865392446518, "step": 4500 }, { "epoch": 450.0, "eval_loss": 11.761486053466797, "eval_runtime": 0.6889, "eval_samples_per_second": 725.846, "eval_steps_per_second": 72.585, "step": 4500 }, { "epoch": 451.0, "grad_norm": 0.2734375, "learning_rate": 0.00012283660403349607, "loss": 0.036095789074897765, "loss_d0": 0.03643478117883205, "step": 4510 }, { "epoch": 452.0, "grad_norm": 0.2392578125, "learning_rate": 0.00011971994201354204, "loss": 0.03615381121635437, "loss_d0": 0.036472433060407636, "step": 4520 }, { "epoch": 453.0, "grad_norm": 0.267578125, "learning_rate": 0.00011659143774148684, "loss": 0.03610163033008575, "loss_d0": 0.036404192447662354, "step": 4530 }, { "epoch": 454.0, "grad_norm": 0.328125, "learning_rate": 0.0001134549028178768, "loss": 0.03613078892230988, "loss_d0": 0.036461538076400755, "step": 4540 }, { "epoch": 455.0, "grad_norm": 0.2333984375, "learning_rate": 0.00011031415862737014, "loss": 0.03611861169338226, "loss_d0": 0.03640886433422565, "step": 4550 }, { "epoch": 456.0, "grad_norm": 0.2470703125, "learning_rate": 0.00010717303168296846, "loss": 0.03604468107223511, "loss_d0": 0.03640021868050099, "step": 4560 }, { "epoch": 457.0, "grad_norm": 0.302734375, "learning_rate": 0.000104035348964, "loss": 0.036168360710144044, "loss_d0": 0.036474670842289926, "step": 4570 }, { "epoch": 458.0, "grad_norm": 0.2392578125, "learning_rate": 0.00010090493325353484, "loss": 0.03600202202796936, "loss_d0": 0.03632246777415275, "step": 4580 }, { "epoch": 459.0, "grad_norm": 0.3671875, "learning_rate": 9.778559848091261e-05, "loss": 0.03613144755363464, "loss_d0": 0.03646283820271492, "step": 4590 }, { "epoch": 460.0, "grad_norm": 0.2734375, "learning_rate": 9.468114507505707e-05, "loss": 0.03605700135231018, "loss_d0": 0.03638906553387642, "step": 4600 }, { "epoch": 461.0, "grad_norm": 0.251953125, "learning_rate": 9.15953553342389e-05, "loss": 0.035967972874641416, "loss_d0": 0.036280662193894385, "step": 4610 }, { "epoch": 462.0, "grad_norm": 0.283203125, "learning_rate": 8.853198881792772e-05, "loss": 0.03607074022293091, "loss_d0": 0.036401886865496634, "step": 4620 }, { "epoch": 463.0, "grad_norm": 0.2392578125, "learning_rate": 8.549477776634832e-05, "loss": 0.0359768807888031, "loss_d0": 0.0362836092710495, "step": 4630 }, { "epoch": 464.0, "grad_norm": 0.314453125, "learning_rate": 8.24874225533205e-05, "loss": 0.03588842451572418, "loss_d0": 0.03622284643352032, "step": 4640 }, { "epoch": 465.0, "grad_norm": 0.34375, "learning_rate": 7.951358717792378e-05, "loss": 0.03593695759773254, "loss_d0": 0.036245567724108696, "step": 4650 }, { "epoch": 466.0, "grad_norm": 0.2578125, "learning_rate": 7.657689480047888e-05, "loss": 0.03589689433574676, "loss_d0": 0.03622194863855839, "step": 4660 }, { "epoch": 467.0, "grad_norm": 0.28125, "learning_rate": 7.368092332828491e-05, "loss": 0.03584821224212646, "loss_d0": 0.03617323003709316, "step": 4670 }, { "epoch": 468.0, "grad_norm": 0.33984375, "learning_rate": 7.082920105649054e-05, "loss": 0.03588172793388367, "loss_d0": 0.03619707673788071, "step": 4680 }, { "epoch": 469.0, "grad_norm": 0.255859375, "learning_rate": 6.80252023694098e-05, "loss": 0.03584883213043213, "loss_d0": 0.03617900386452675, "step": 4690 }, { "epoch": 470.0, "grad_norm": 0.3359375, "learning_rate": 6.527234350752003e-05, "loss": 0.035852047801017764, "loss_d0": 0.0361775953322649, "step": 4700 }, { "epoch": 471.0, "grad_norm": 0.275390625, "learning_rate": 6.257397840529903e-05, "loss": 0.03582252562046051, "loss_d0": 0.03615486063063145, "step": 4710 }, { "epoch": 472.0, "grad_norm": 0.2734375, "learning_rate": 5.993339460497257e-05, "loss": 0.03581757247447968, "loss_d0": 0.036142122372984885, "step": 4720 }, { "epoch": 473.0, "grad_norm": 0.259765625, "learning_rate": 5.7353809251150606e-05, "loss": 0.0358079195022583, "loss_d0": 0.036134665831923485, "step": 4730 }, { "epoch": 474.0, "grad_norm": 0.2373046875, "learning_rate": 5.483836517123214e-05, "loss": 0.035815265774726865, "loss_d0": 0.036152683570981024, "step": 4740 }, { "epoch": 475.0, "grad_norm": 0.322265625, "learning_rate": 5.239012704635402e-05, "loss": 0.03577219545841217, "loss_d0": 0.036099201813340184, "step": 4750 }, { "epoch": 476.0, "grad_norm": 0.271484375, "learning_rate": 5.0012077677549283e-05, "loss": 0.03577747642993927, "loss_d0": 0.03610123656690121, "step": 4760 }, { "epoch": 477.0, "grad_norm": 0.23828125, "learning_rate": 4.77071143516634e-05, "loss": 0.03580273985862732, "loss_d0": 0.03613555021584034, "step": 4770 }, { "epoch": 478.0, "grad_norm": 0.27734375, "learning_rate": 4.547804531145656e-05, "loss": 0.035796952247619626, "loss_d0": 0.036111927777528766, "step": 4780 }, { "epoch": 479.0, "grad_norm": 0.35546875, "learning_rate": 4.332758633419252e-05, "loss": 0.035767361521720886, "loss_d0": 0.03609406426548958, "step": 4790 }, { "epoch": 480.0, "grad_norm": 0.265625, "learning_rate": 4.12583574228822e-05, "loss": 0.03574168682098389, "loss_d0": 0.03606498539447785, "step": 4800 }, { "epoch": 481.0, "grad_norm": 0.259765625, "learning_rate": 3.927287961421382e-05, "loss": 0.035773900151252744, "loss_d0": 0.03608821220695972, "step": 4810 }, { "epoch": 482.0, "grad_norm": 0.275390625, "learning_rate": 3.737357190705782e-05, "loss": 0.03574726283550263, "loss_d0": 0.03607319518923759, "step": 4820 }, { "epoch": 483.0, "grad_norm": 0.287109375, "learning_rate": 3.556274831528945e-05, "loss": 0.03574813306331635, "loss_d0": 0.0360788069665432, "step": 4830 }, { "epoch": 484.0, "grad_norm": 0.26953125, "learning_rate": 3.3842615048519255e-05, "loss": 0.03571727573871612, "loss_d0": 0.03603735640645027, "step": 4840 }, { "epoch": 485.0, "grad_norm": 0.263671875, "learning_rate": 3.221526782416659e-05, "loss": 0.035741984844207764, "loss_d0": 0.0360604640096426, "step": 4850 }, { "epoch": 486.0, "grad_norm": 0.30859375, "learning_rate": 3.068268931415069e-05, "loss": 0.035722389817237854, "loss_d0": 0.03604618720710277, "step": 4860 }, { "epoch": 487.0, "grad_norm": 0.259765625, "learning_rate": 2.9246746729310446e-05, "loss": 0.03571443259716034, "loss_d0": 0.03603012822568417, "step": 4870 }, { "epoch": 488.0, "grad_norm": 0.2275390625, "learning_rate": 2.7909189544495435e-05, "loss": 0.03573389947414398, "loss_d0": 0.036041321232914925, "step": 4880 }, { "epoch": 489.0, "grad_norm": 0.240234375, "learning_rate": 2.6671647367100477e-05, "loss": 0.035701331496238706, "loss_d0": 0.03603383935987949, "step": 4890 }, { "epoch": 490.0, "grad_norm": 0.328125, "learning_rate": 2.553562795163998e-05, "loss": 0.035741209983825684, "loss_d0": 0.0360419649630785, "step": 4900 }, { "epoch": 491.0, "grad_norm": 0.30078125, "learning_rate": 2.450251536278129e-05, "loss": 0.035731592774391176, "loss_d0": 0.0360304169356823, "step": 4910 }, { "epoch": 492.0, "grad_norm": 0.244140625, "learning_rate": 2.3573568289075136e-05, "loss": 0.03570793569087982, "loss_d0": 0.036030732467770575, "step": 4920 }, { "epoch": 493.0, "grad_norm": 0.302734375, "learning_rate": 2.2749918509437493e-05, "loss": 0.03569709360599518, "loss_d0": 0.03602620549499989, "step": 4930 }, { "epoch": 494.0, "grad_norm": 0.2294921875, "learning_rate": 2.2032569514251373e-05, "loss": 0.03570819199085236, "loss_d0": 0.03603471517562866, "step": 4940 }, { "epoch": 495.0, "grad_norm": 0.294921875, "learning_rate": 2.1422395282768234e-05, "loss": 0.035699674487113954, "loss_d0": 0.03603287264704704, "step": 4950 }, { "epoch": 496.0, "grad_norm": 0.32421875, "learning_rate": 2.092013921829899e-05, "loss": 0.03576536178588867, "loss_d0": 0.03607108183205128, "step": 4960 }, { "epoch": 497.0, "grad_norm": 0.232421875, "learning_rate": 2.0526413242491617e-05, "loss": 0.035713717341423035, "loss_d0": 0.03603534735739231, "step": 4970 }, { "epoch": 498.0, "grad_norm": 0.337890625, "learning_rate": 2.0241697049798773e-05, "loss": 0.03570127785205841, "loss_d0": 0.03601216375827789, "step": 4980 }, { "epoch": 499.0, "grad_norm": 0.2421875, "learning_rate": 2.0066337523044098e-05, "loss": 0.03573695719242096, "loss_d0": 0.03605118878185749, "step": 4990 }, { "epoch": 500.0, "grad_norm": 0.255859375, "learning_rate": 2.0000548310798866e-05, "loss": 0.03572871088981629, "loss_d0": 0.03601981587707996, "step": 5000 }, { "epoch": 500.0, "eval_loss": 12.606892585754395, "eval_runtime": 0.6871, "eval_samples_per_second": 727.659, "eval_steps_per_second": 72.766, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.980892276588544e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }