Files
OLMo-0H-1D-100F/trainer_state.json
ModelHub XC 29aa7ee467 初始化项目,由ModelHub XC社区提供模型
Model: Lamsheeper/OLMo-0H-1D-100F
Source: Original Platform
2026-06-18 21:21:22 +08:00

4115 lines
96 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 500.0,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 9.25,
"learning_rate": 1.8e-05,
"loss": 2.605172348022461,
"loss_d0": 2.6139824271202086,
"step": 10
},
{
"epoch": 2.0,
"grad_norm": 4.8125,
"learning_rate": 3.8e-05,
"loss": 1.1845547676086425,
"loss_d0": 1.1885395765304565,
"step": 20
},
{
"epoch": 3.0,
"grad_norm": 3.65625,
"learning_rate": 5.8e-05,
"loss": 0.8508452415466309,
"loss_d0": 0.8536352932453155,
"step": 30
},
{
"epoch": 4.0,
"grad_norm": 3.46875,
"learning_rate": 7.800000000000001e-05,
"loss": 0.6966594219207763,
"loss_d0": 0.6979476511478424,
"step": 40
},
{
"epoch": 5.0,
"grad_norm": 3.296875,
"learning_rate": 9.8e-05,
"loss": 0.5743978500366211,
"loss_d0": 0.5776701003313065,
"step": 50
},
{
"epoch": 6.0,
"grad_norm": 3.71875,
"learning_rate": 0.000118,
"loss": 0.4979101657867432,
"loss_d0": 0.4996922880411148,
"step": 60
},
{
"epoch": 7.0,
"grad_norm": 3.25,
"learning_rate": 0.000138,
"loss": 0.4397528648376465,
"loss_d0": 0.4406041353940964,
"step": 70
},
{
"epoch": 8.0,
"grad_norm": 2.5625,
"learning_rate": 0.00015800000000000002,
"loss": 0.3697507381439209,
"loss_d0": 0.3696742236614227,
"step": 80
},
{
"epoch": 9.0,
"grad_norm": 2.9375,
"learning_rate": 0.00017800000000000002,
"loss": 0.31113204956054685,
"loss_d0": 0.31142298579216005,
"step": 90
},
{
"epoch": 10.0,
"grad_norm": 2.25,
"learning_rate": 0.00019800000000000002,
"loss": 0.2800392389297485,
"loss_d0": 0.27933542132377626,
"step": 100
},
{
"epoch": 11.0,
"grad_norm": 1.8203125,
"learning_rate": 0.0002,
"loss": 0.2221465826034546,
"loss_d0": 0.22203450053930282,
"step": 110
},
{
"epoch": 12.0,
"grad_norm": 2.0625,
"learning_rate": 0.0002,
"loss": 0.1953430414199829,
"loss_d0": 0.19439931064844132,
"step": 120
},
{
"epoch": 13.0,
"grad_norm": 1.2421875,
"learning_rate": 0.0002,
"loss": 0.14305418729782104,
"loss_d0": 0.14241147190332412,
"step": 130
},
{
"epoch": 14.0,
"grad_norm": 1.609375,
"learning_rate": 0.0002,
"loss": 0.1502935767173767,
"loss_d0": 0.14951273798942566,
"step": 140
},
{
"epoch": 15.0,
"grad_norm": 2.15625,
"learning_rate": 0.0002,
"loss": 0.14393000602722167,
"loss_d0": 0.14211773499846458,
"step": 150
},
{
"epoch": 16.0,
"grad_norm": 1.5390625,
"learning_rate": 0.0002,
"loss": 0.1161999225616455,
"loss_d0": 0.11462540775537491,
"step": 160
},
{
"epoch": 17.0,
"grad_norm": 0.7734375,
"learning_rate": 0.0002,
"loss": 0.1050883412361145,
"loss_d0": 0.10514769107103347,
"step": 170
},
{
"epoch": 18.0,
"grad_norm": 1.2421875,
"learning_rate": 0.0002,
"loss": 0.10532078742980958,
"loss_d0": 0.10545785427093506,
"step": 180
},
{
"epoch": 19.0,
"grad_norm": 1.375,
"learning_rate": 0.0002,
"loss": 0.11194120645523072,
"loss_d0": 0.11231792494654655,
"step": 190
},
{
"epoch": 20.0,
"grad_norm": 1.0078125,
"learning_rate": 0.0002,
"loss": 0.10956100225448609,
"loss_d0": 0.11055244281888008,
"step": 200
},
{
"epoch": 21.0,
"grad_norm": 1.109375,
"learning_rate": 0.0002,
"loss": 0.09398337006568909,
"loss_d0": 0.09433126747608185,
"step": 210
},
{
"epoch": 22.0,
"grad_norm": 0.66796875,
"learning_rate": 0.0002,
"loss": 0.08510669469833373,
"loss_d0": 0.08516838252544404,
"step": 220
},
{
"epoch": 23.0,
"grad_norm": 1.796875,
"learning_rate": 0.0002,
"loss": 0.07973664999008179,
"loss_d0": 0.0800891250371933,
"step": 230
},
{
"epoch": 24.0,
"grad_norm": 0.76953125,
"learning_rate": 0.0002,
"loss": 0.07982662916183472,
"loss_d0": 0.08119344227015972,
"step": 240
},
{
"epoch": 25.0,
"grad_norm": 0.578125,
"learning_rate": 0.0002,
"loss": 0.08493419289588929,
"loss_d0": 0.08543153777718544,
"step": 250
},
{
"epoch": 26.0,
"grad_norm": 0.6640625,
"learning_rate": 0.0002,
"loss": 0.06563451290130615,
"loss_d0": 0.06554836891591549,
"step": 260
},
{
"epoch": 27.0,
"grad_norm": 0.99609375,
"learning_rate": 0.0002,
"loss": 0.08964254260063172,
"loss_d0": 0.08905367143452167,
"step": 270
},
{
"epoch": 28.0,
"grad_norm": 1.4375,
"learning_rate": 0.0002,
"loss": 0.08700705170631409,
"loss_d0": 0.08580705337226391,
"step": 280
},
{
"epoch": 29.0,
"grad_norm": 0.97265625,
"learning_rate": 0.0002,
"loss": 0.08369559049606323,
"loss_d0": 0.08155160546302795,
"step": 290
},
{
"epoch": 30.0,
"grad_norm": 0.58203125,
"learning_rate": 0.0002,
"loss": 0.07587954998016358,
"loss_d0": 0.0753675114363432,
"step": 300
},
{
"epoch": 31.0,
"grad_norm": 0.9921875,
"learning_rate": 0.0002,
"loss": 0.06574047803878784,
"loss_d0": 0.06600831300020218,
"step": 310
},
{
"epoch": 32.0,
"grad_norm": 0.68359375,
"learning_rate": 0.0002,
"loss": 0.06255401968955994,
"loss_d0": 0.06289612613618374,
"step": 320
},
{
"epoch": 33.0,
"grad_norm": 0.5859375,
"learning_rate": 0.0002,
"loss": 0.06944599151611328,
"loss_d0": 0.07013467662036418,
"step": 330
},
{
"epoch": 34.0,
"grad_norm": 0.392578125,
"learning_rate": 0.0002,
"loss": 0.0734113335609436,
"loss_d0": 0.07388503737747669,
"step": 340
},
{
"epoch": 35.0,
"grad_norm": 1.625,
"learning_rate": 0.0002,
"loss": 0.06372126340866088,
"loss_d0": 0.06445319131016732,
"step": 350
},
{
"epoch": 36.0,
"grad_norm": 0.859375,
"learning_rate": 0.0002,
"loss": 0.06621803045272827,
"loss_d0": 0.0674049399793148,
"step": 360
},
{
"epoch": 37.0,
"grad_norm": 0.494140625,
"learning_rate": 0.0002,
"loss": 0.07585157752037049,
"loss_d0": 0.07674749866127968,
"step": 370
},
{
"epoch": 38.0,
"grad_norm": 0.8671875,
"learning_rate": 0.0002,
"loss": 0.07490594983100891,
"loss_d0": 0.07461650408804417,
"step": 380
},
{
"epoch": 39.0,
"grad_norm": 0.474609375,
"learning_rate": 0.0002,
"loss": 0.06459608674049377,
"loss_d0": 0.06494694538414478,
"step": 390
},
{
"epoch": 40.0,
"grad_norm": 0.875,
"learning_rate": 0.0002,
"loss": 0.0653274655342102,
"loss_d0": 0.06574108265340328,
"step": 400
},
{
"epoch": 41.0,
"grad_norm": 1.8046875,
"learning_rate": 0.0002,
"loss": 0.08319691419601441,
"loss_d0": 0.0816740058362484,
"step": 410
},
{
"epoch": 42.0,
"grad_norm": 0.427734375,
"learning_rate": 0.0002,
"loss": 0.058042091131210324,
"loss_d0": 0.058284175023436545,
"step": 420
},
{
"epoch": 43.0,
"grad_norm": 0.48828125,
"learning_rate": 0.0002,
"loss": 0.05881038308143616,
"loss_d0": 0.05877893678843975,
"step": 430
},
{
"epoch": 44.0,
"grad_norm": 0.376953125,
"learning_rate": 0.0002,
"loss": 0.05556913018226624,
"loss_d0": 0.05579867213964462,
"step": 440
},
{
"epoch": 45.0,
"grad_norm": 0.5,
"learning_rate": 0.0002,
"loss": 0.0559271514415741,
"loss_d0": 0.0562079343944788,
"step": 450
},
{
"epoch": 46.0,
"grad_norm": 0.47265625,
"learning_rate": 0.0002,
"loss": 0.0503437340259552,
"loss_d0": 0.050457949936389926,
"step": 460
},
{
"epoch": 47.0,
"grad_norm": 0.4296875,
"learning_rate": 0.0002,
"loss": 0.04916974902153015,
"loss_d0": 0.0493311133235693,
"step": 470
},
{
"epoch": 48.0,
"grad_norm": 0.8359375,
"learning_rate": 0.0002,
"loss": 0.060026037693023684,
"loss_d0": 0.059200653806328773,
"step": 480
},
{
"epoch": 49.0,
"grad_norm": 0.40625,
"learning_rate": 0.0002,
"loss": 0.06555094122886658,
"loss_d0": 0.06555219888687133,
"step": 490
},
{
"epoch": 50.0,
"grad_norm": 0.78125,
"learning_rate": 0.0002,
"loss": 0.07094892263412475,
"loss_d0": 0.07080870307981968,
"step": 500
},
{
"epoch": 50.0,
"eval_loss": 9.149801254272461,
"eval_runtime": 0.6889,
"eval_samples_per_second": 725.825,
"eval_steps_per_second": 72.582,
"step": 500
},
{
"epoch": 51.0,
"grad_norm": 0.68359375,
"learning_rate": 0.0002,
"loss": 0.07003722190856934,
"loss_d0": 0.07078699246048928,
"step": 510
},
{
"epoch": 52.0,
"grad_norm": 0.65234375,
"learning_rate": 0.0002,
"loss": 0.0648545503616333,
"loss_d0": 0.06463338956236839,
"step": 520
},
{
"epoch": 53.0,
"grad_norm": 0.54296875,
"learning_rate": 0.0002,
"loss": 0.05929445028305054,
"loss_d0": 0.0596495222300291,
"step": 530
},
{
"epoch": 54.0,
"grad_norm": 0.5078125,
"learning_rate": 0.0002,
"loss": 0.05196449756622314,
"loss_d0": 0.05247226879000664,
"step": 540
},
{
"epoch": 55.0,
"grad_norm": 0.7734375,
"learning_rate": 0.0002,
"loss": 0.05879771709442139,
"loss_d0": 0.05922210738062859,
"step": 550
},
{
"epoch": 56.0,
"grad_norm": 0.5078125,
"learning_rate": 0.0002,
"loss": 0.06885148882865906,
"loss_d0": 0.07016028575599194,
"step": 560
},
{
"epoch": 57.0,
"grad_norm": 0.83203125,
"learning_rate": 0.0002,
"loss": 0.057416903972625735,
"loss_d0": 0.05888371020555496,
"step": 570
},
{
"epoch": 58.0,
"grad_norm": 0.6171875,
"learning_rate": 0.0002,
"loss": 0.057390010356903075,
"loss_d0": 0.05848095864057541,
"step": 580
},
{
"epoch": 59.0,
"grad_norm": 0.3984375,
"learning_rate": 0.0002,
"loss": 0.049796289205551146,
"loss_d0": 0.05020042285323143,
"step": 590
},
{
"epoch": 60.0,
"grad_norm": 0.3828125,
"learning_rate": 0.0002,
"loss": 0.05940237045288086,
"loss_d0": 0.06075261794030666,
"step": 600
},
{
"epoch": 61.0,
"grad_norm": 0.35546875,
"learning_rate": 0.0002,
"loss": 0.057738131284713744,
"loss_d0": 0.05881649628281593,
"step": 610
},
{
"epoch": 62.0,
"grad_norm": 0.470703125,
"learning_rate": 0.0002,
"loss": 0.062183260917663574,
"loss_d0": 0.06322281733155251,
"step": 620
},
{
"epoch": 63.0,
"grad_norm": 0.78515625,
"learning_rate": 0.0002,
"loss": 0.05927368402481079,
"loss_d0": 0.0597139336168766,
"step": 630
},
{
"epoch": 64.0,
"grad_norm": 0.58203125,
"learning_rate": 0.0002,
"loss": 0.058104443550109866,
"loss_d0": 0.05876607708632946,
"step": 640
},
{
"epoch": 65.0,
"grad_norm": 0.3828125,
"learning_rate": 0.0002,
"loss": 0.05966512560844421,
"loss_d0": 0.060669278353452684,
"step": 650
},
{
"epoch": 66.0,
"grad_norm": 0.498046875,
"learning_rate": 0.0002,
"loss": 0.05417026281356811,
"loss_d0": 0.054636499658226964,
"step": 660
},
{
"epoch": 67.0,
"grad_norm": 0.392578125,
"learning_rate": 0.0002,
"loss": 0.05017418265342712,
"loss_d0": 0.050396521016955374,
"step": 670
},
{
"epoch": 68.0,
"grad_norm": 0.240234375,
"learning_rate": 0.0002,
"loss": 0.04814895987510681,
"loss_d0": 0.04857309609651565,
"step": 680
},
{
"epoch": 69.0,
"grad_norm": 0.396484375,
"learning_rate": 0.0002,
"loss": 0.05414179563522339,
"loss_d0": 0.05384636260569096,
"step": 690
},
{
"epoch": 70.0,
"grad_norm": 0.5703125,
"learning_rate": 0.0002,
"loss": 0.05307164788246155,
"loss_d0": 0.05355789102613926,
"step": 700
},
{
"epoch": 71.0,
"grad_norm": 0.33984375,
"learning_rate": 0.0002,
"loss": 0.05436263084411621,
"loss_d0": 0.05512550659477711,
"step": 710
},
{
"epoch": 72.0,
"grad_norm": 0.5703125,
"learning_rate": 0.0002,
"loss": 0.05032788515090943,
"loss_d0": 0.050449307262897494,
"step": 720
},
{
"epoch": 73.0,
"grad_norm": 0.400390625,
"learning_rate": 0.0002,
"loss": 0.051280814409255984,
"loss_d0": 0.05181795097887516,
"step": 730
},
{
"epoch": 74.0,
"grad_norm": 0.3828125,
"learning_rate": 0.0002,
"loss": 0.05536478161811829,
"loss_d0": 0.05605713278055191,
"step": 740
},
{
"epoch": 75.0,
"grad_norm": 0.5078125,
"learning_rate": 0.0002,
"loss": 0.05402403473854065,
"loss_d0": 0.054415644705295564,
"step": 750
},
{
"epoch": 76.0,
"grad_norm": 0.54296875,
"learning_rate": 0.0002,
"loss": 0.057246971130371097,
"loss_d0": 0.057024940848350525,
"step": 760
},
{
"epoch": 77.0,
"grad_norm": 0.353515625,
"learning_rate": 0.0002,
"loss": 0.053191614151000974,
"loss_d0": 0.053360605239868165,
"step": 770
},
{
"epoch": 78.0,
"grad_norm": 0.5546875,
"learning_rate": 0.0002,
"loss": 0.05366742014884949,
"loss_d0": 0.05328587256371975,
"step": 780
},
{
"epoch": 79.0,
"grad_norm": 0.5,
"learning_rate": 0.0002,
"loss": 0.06317275166511535,
"loss_d0": 0.06282185427844525,
"step": 790
},
{
"epoch": 80.0,
"grad_norm": 0.5703125,
"learning_rate": 0.0002,
"loss": 0.07268043756484985,
"loss_d0": 0.07160068228840828,
"step": 800
},
{
"epoch": 81.0,
"grad_norm": 0.47265625,
"learning_rate": 0.0002,
"loss": 0.05127843022346497,
"loss_d0": 0.051354449987411496,
"step": 810
},
{
"epoch": 82.0,
"grad_norm": 0.44140625,
"learning_rate": 0.0002,
"loss": 0.045409074425697325,
"loss_d0": 0.045082954317331315,
"step": 820
},
{
"epoch": 83.0,
"grad_norm": 0.4296875,
"learning_rate": 0.0002,
"loss": 0.04334630072116852,
"loss_d0": 0.04350667372345925,
"step": 830
},
{
"epoch": 84.0,
"grad_norm": 0.2392578125,
"learning_rate": 0.0002,
"loss": 0.042599648237228394,
"loss_d0": 0.042730527743697164,
"step": 840
},
{
"epoch": 85.0,
"grad_norm": 0.484375,
"learning_rate": 0.0002,
"loss": 0.04336960911750794,
"loss_d0": 0.043284989148378375,
"step": 850
},
{
"epoch": 86.0,
"grad_norm": 0.484375,
"learning_rate": 0.0002,
"loss": 0.04642752707004547,
"loss_d0": 0.04698342382907868,
"step": 860
},
{
"epoch": 87.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.04236462116241455,
"loss_d0": 0.04291092492640018,
"step": 870
},
{
"epoch": 88.0,
"grad_norm": 0.44140625,
"learning_rate": 0.0002,
"loss": 0.0467838853597641,
"loss_d0": 0.04719291441142559,
"step": 880
},
{
"epoch": 89.0,
"grad_norm": 0.431640625,
"learning_rate": 0.0002,
"loss": 0.056033474206924436,
"loss_d0": 0.0564144778996706,
"step": 890
},
{
"epoch": 90.0,
"grad_norm": 0.359375,
"learning_rate": 0.0002,
"loss": 0.05397940874099731,
"loss_d0": 0.05476293601095676,
"step": 900
},
{
"epoch": 91.0,
"grad_norm": 0.44140625,
"learning_rate": 0.0002,
"loss": 0.04703973531723023,
"loss_d0": 0.04744415730237961,
"step": 910
},
{
"epoch": 92.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.05714722275733948,
"loss_d0": 0.05776938088238239,
"step": 920
},
{
"epoch": 93.0,
"grad_norm": 0.423828125,
"learning_rate": 0.0002,
"loss": 0.049974143505096436,
"loss_d0": 0.05034521222114563,
"step": 930
},
{
"epoch": 94.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.05092711448669433,
"loss_d0": 0.0512014877051115,
"step": 940
},
{
"epoch": 95.0,
"grad_norm": 0.400390625,
"learning_rate": 0.0002,
"loss": 0.04760122001171112,
"loss_d0": 0.048098673298954966,
"step": 950
},
{
"epoch": 96.0,
"grad_norm": 0.333984375,
"learning_rate": 0.0002,
"loss": 0.0453918844461441,
"loss_d0": 0.04554104544222355,
"step": 960
},
{
"epoch": 97.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.045323750376701354,
"loss_d0": 0.04565862752497196,
"step": 970
},
{
"epoch": 98.0,
"grad_norm": 0.466796875,
"learning_rate": 0.0002,
"loss": 0.04769502282142639,
"loss_d0": 0.047993503510951996,
"step": 980
},
{
"epoch": 99.0,
"grad_norm": 0.384765625,
"learning_rate": 0.0002,
"loss": 0.04582420587539673,
"loss_d0": 0.045967242866754535,
"step": 990
},
{
"epoch": 100.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.04648930430412292,
"loss_d0": 0.04681434221565724,
"step": 1000
},
{
"epoch": 100.0,
"eval_loss": 8.996453285217285,
"eval_runtime": 0.6897,
"eval_samples_per_second": 724.945,
"eval_steps_per_second": 72.494,
"step": 1000
},
{
"epoch": 101.0,
"grad_norm": 0.50390625,
"learning_rate": 0.0002,
"loss": 0.047931820154190063,
"loss_d0": 0.048335249349474904,
"step": 1010
},
{
"epoch": 102.0,
"grad_norm": 0.328125,
"learning_rate": 0.0002,
"loss": 0.04393635094165802,
"loss_d0": 0.04436287619173527,
"step": 1020
},
{
"epoch": 103.0,
"grad_norm": 0.380859375,
"learning_rate": 0.0002,
"loss": 0.052803754806518555,
"loss_d0": 0.05395218767225742,
"step": 1030
},
{
"epoch": 104.0,
"grad_norm": 0.427734375,
"learning_rate": 0.0002,
"loss": 0.046474286913871767,
"loss_d0": 0.046956886723637584,
"step": 1040
},
{
"epoch": 105.0,
"grad_norm": 0.359375,
"learning_rate": 0.0002,
"loss": 0.04853596985340118,
"loss_d0": 0.0489469937980175,
"step": 1050
},
{
"epoch": 106.0,
"grad_norm": 0.62890625,
"learning_rate": 0.0002,
"loss": 0.050303131341934204,
"loss_d0": 0.050602763146162036,
"step": 1060
},
{
"epoch": 107.0,
"grad_norm": 0.515625,
"learning_rate": 0.0002,
"loss": 0.062167507410049436,
"loss_d0": 0.06302939765155316,
"step": 1070
},
{
"epoch": 108.0,
"grad_norm": 0.375,
"learning_rate": 0.0002,
"loss": 0.05572482943534851,
"loss_d0": 0.0563100803643465,
"step": 1080
},
{
"epoch": 109.0,
"grad_norm": 0.328125,
"learning_rate": 0.0002,
"loss": 0.04392791986465454,
"loss_d0": 0.04417993500828743,
"step": 1090
},
{
"epoch": 110.0,
"grad_norm": 0.3515625,
"learning_rate": 0.0002,
"loss": 0.048283118009567264,
"loss_d0": 0.048720812797546385,
"step": 1100
},
{
"epoch": 111.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.042201068997383115,
"loss_d0": 0.04266498349606991,
"step": 1110
},
{
"epoch": 112.0,
"grad_norm": 0.228515625,
"learning_rate": 0.0002,
"loss": 0.04001366794109344,
"loss_d0": 0.040179040282964706,
"step": 1120
},
{
"epoch": 113.0,
"grad_norm": 0.2578125,
"learning_rate": 0.0002,
"loss": 0.04639661908149719,
"loss_d0": 0.04656643345952034,
"step": 1130
},
{
"epoch": 114.0,
"grad_norm": 0.48828125,
"learning_rate": 0.0002,
"loss": 0.04307742714881897,
"loss_d0": 0.04315165765583515,
"step": 1140
},
{
"epoch": 115.0,
"grad_norm": 0.5390625,
"learning_rate": 0.0002,
"loss": 0.047644132375717164,
"loss_d0": 0.04783033281564712,
"step": 1150
},
{
"epoch": 116.0,
"grad_norm": 0.58984375,
"learning_rate": 0.0002,
"loss": 0.05251736044883728,
"loss_d0": 0.05270914658904076,
"step": 1160
},
{
"epoch": 117.0,
"grad_norm": 0.640625,
"learning_rate": 0.0002,
"loss": 0.0490668922662735,
"loss_d0": 0.049511789530515674,
"step": 1170
},
{
"epoch": 118.0,
"grad_norm": 0.44921875,
"learning_rate": 0.0002,
"loss": 0.0441941112279892,
"loss_d0": 0.04463861547410488,
"step": 1180
},
{
"epoch": 119.0,
"grad_norm": 0.30859375,
"learning_rate": 0.0002,
"loss": 0.04082232713699341,
"loss_d0": 0.0409642331302166,
"step": 1190
},
{
"epoch": 120.0,
"grad_norm": 0.2734375,
"learning_rate": 0.0002,
"loss": 0.04445215463638306,
"loss_d0": 0.044935400038957594,
"step": 1200
},
{
"epoch": 121.0,
"grad_norm": 0.484375,
"learning_rate": 0.0002,
"loss": 0.04362497329711914,
"loss_d0": 0.04389031082391739,
"step": 1210
},
{
"epoch": 122.0,
"grad_norm": 0.58984375,
"learning_rate": 0.0002,
"loss": 0.05719525814056396,
"loss_d0": 0.05802029110491276,
"step": 1220
},
{
"epoch": 123.0,
"grad_norm": 0.953125,
"learning_rate": 0.0002,
"loss": 0.058700555562973024,
"loss_d0": 0.059645514190196994,
"step": 1230
},
{
"epoch": 124.0,
"grad_norm": 0.78515625,
"learning_rate": 0.0002,
"loss": 0.049953502416610715,
"loss_d0": 0.05056100562214851,
"step": 1240
},
{
"epoch": 125.0,
"grad_norm": 0.494140625,
"learning_rate": 0.0002,
"loss": 0.04485623240470886,
"loss_d0": 0.045016249269247056,
"step": 1250
},
{
"epoch": 126.0,
"grad_norm": 0.4140625,
"learning_rate": 0.0002,
"loss": 0.04192852973937988,
"loss_d0": 0.0420475821942091,
"step": 1260
},
{
"epoch": 127.0,
"grad_norm": 0.455078125,
"learning_rate": 0.0002,
"loss": 0.0426062673330307,
"loss_d0": 0.042881960049271584,
"step": 1270
},
{
"epoch": 128.0,
"grad_norm": 0.23828125,
"learning_rate": 0.0002,
"loss": 0.04536706209182739,
"loss_d0": 0.04527593217790127,
"step": 1280
},
{
"epoch": 129.0,
"grad_norm": 0.318359375,
"learning_rate": 0.0002,
"loss": 0.05174095630645752,
"loss_d0": 0.05235871635377407,
"step": 1290
},
{
"epoch": 130.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.042698603868484494,
"loss_d0": 0.04338001646101475,
"step": 1300
},
{
"epoch": 131.0,
"grad_norm": 0.255859375,
"learning_rate": 0.0002,
"loss": 0.04018429815769196,
"loss_d0": 0.04045051150023937,
"step": 1310
},
{
"epoch": 132.0,
"grad_norm": 0.3515625,
"learning_rate": 0.0002,
"loss": 0.039610669016838074,
"loss_d0": 0.03991545438766479,
"step": 1320
},
{
"epoch": 133.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.04048936069011688,
"loss_d0": 0.04079539142549038,
"step": 1330
},
{
"epoch": 134.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.042930704355239865,
"loss_d0": 0.043228012323379514,
"step": 1340
},
{
"epoch": 135.0,
"grad_norm": 0.49609375,
"learning_rate": 0.0002,
"loss": 0.047755110263824466,
"loss_d0": 0.047895029187202454,
"step": 1350
},
{
"epoch": 136.0,
"grad_norm": 0.388671875,
"learning_rate": 0.0002,
"loss": 0.055011457204818724,
"loss_d0": 0.05567521676421165,
"step": 1360
},
{
"epoch": 137.0,
"grad_norm": 0.5390625,
"learning_rate": 0.0002,
"loss": 0.05911533832550049,
"loss_d0": 0.0588922031223774,
"step": 1370
},
{
"epoch": 138.0,
"grad_norm": 1.2265625,
"learning_rate": 0.0002,
"loss": 0.07207316756248475,
"loss_d0": 0.07146050035953522,
"step": 1380
},
{
"epoch": 139.0,
"grad_norm": 0.6875,
"learning_rate": 0.0002,
"loss": 0.051120662689208986,
"loss_d0": 0.05109778419137001,
"step": 1390
},
{
"epoch": 140.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.05012030005455017,
"loss_d0": 0.0504388976842165,
"step": 1400
},
{
"epoch": 141.0,
"grad_norm": 0.369140625,
"learning_rate": 0.0002,
"loss": 0.042303305864334104,
"loss_d0": 0.042646681889891624,
"step": 1410
},
{
"epoch": 142.0,
"grad_norm": 0.318359375,
"learning_rate": 0.0002,
"loss": 0.04310626089572907,
"loss_d0": 0.04322606287896633,
"step": 1420
},
{
"epoch": 143.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.041630563139915464,
"loss_d0": 0.04180505834519863,
"step": 1430
},
{
"epoch": 144.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.04187849760055542,
"loss_d0": 0.042144588008522985,
"step": 1440
},
{
"epoch": 145.0,
"grad_norm": 0.416015625,
"learning_rate": 0.0002,
"loss": 0.04202188551425934,
"loss_d0": 0.04231737479567528,
"step": 1450
},
{
"epoch": 146.0,
"grad_norm": 0.435546875,
"learning_rate": 0.0002,
"loss": 0.0439466267824173,
"loss_d0": 0.044225719198584555,
"step": 1460
},
{
"epoch": 147.0,
"grad_norm": 0.431640625,
"learning_rate": 0.0002,
"loss": 0.04756100177764892,
"loss_d0": 0.04815598018467426,
"step": 1470
},
{
"epoch": 148.0,
"grad_norm": 0.37890625,
"learning_rate": 0.0002,
"loss": 0.04747256338596344,
"loss_d0": 0.04799098074436188,
"step": 1480
},
{
"epoch": 149.0,
"grad_norm": 0.431640625,
"learning_rate": 0.0002,
"loss": 0.049269622564315795,
"loss_d0": 0.04934872798621655,
"step": 1490
},
{
"epoch": 150.0,
"grad_norm": 0.5,
"learning_rate": 0.0002,
"loss": 0.04529989957809448,
"loss_d0": 0.045717564225196836,
"step": 1500
},
{
"epoch": 150.0,
"eval_loss": 9.852365493774414,
"eval_runtime": 0.6886,
"eval_samples_per_second": 726.092,
"eval_steps_per_second": 72.609,
"step": 1500
},
{
"epoch": 151.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.043574199080467224,
"loss_d0": 0.04385456591844559,
"step": 1510
},
{
"epoch": 152.0,
"grad_norm": 0.390625,
"learning_rate": 0.0002,
"loss": 0.04235072135925293,
"loss_d0": 0.04293657392263413,
"step": 1520
},
{
"epoch": 153.0,
"grad_norm": 0.4140625,
"learning_rate": 0.0002,
"loss": 0.04195387065410614,
"loss_d0": 0.042300010845065114,
"step": 1530
},
{
"epoch": 154.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0002,
"loss": 0.04051635265350342,
"loss_d0": 0.040704548731446265,
"step": 1540
},
{
"epoch": 155.0,
"grad_norm": 0.251953125,
"learning_rate": 0.0002,
"loss": 0.04044683873653412,
"loss_d0": 0.04077310748398304,
"step": 1550
},
{
"epoch": 156.0,
"grad_norm": 0.359375,
"learning_rate": 0.0002,
"loss": 0.04552633166313171,
"loss_d0": 0.0455584455281496,
"step": 1560
},
{
"epoch": 157.0,
"grad_norm": 0.359375,
"learning_rate": 0.0002,
"loss": 0.0467070460319519,
"loss_d0": 0.04717182517051697,
"step": 1570
},
{
"epoch": 158.0,
"grad_norm": 0.474609375,
"learning_rate": 0.0002,
"loss": 0.04355217814445496,
"loss_d0": 0.04367000050842762,
"step": 1580
},
{
"epoch": 159.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0002,
"loss": 0.04726326167583465,
"loss_d0": 0.047322430461645124,
"step": 1590
},
{
"epoch": 160.0,
"grad_norm": 0.3984375,
"learning_rate": 0.0002,
"loss": 0.05183404088020325,
"loss_d0": 0.051499960198998454,
"step": 1600
},
{
"epoch": 161.0,
"grad_norm": 0.62109375,
"learning_rate": 0.0002,
"loss": 0.052065759897232056,
"loss_d0": 0.05186050981283188,
"step": 1610
},
{
"epoch": 162.0,
"grad_norm": 0.41796875,
"learning_rate": 0.0002,
"loss": 0.04411465525627136,
"loss_d0": 0.04437471702694893,
"step": 1620
},
{
"epoch": 163.0,
"grad_norm": 0.296875,
"learning_rate": 0.0002,
"loss": 0.04183531403541565,
"loss_d0": 0.04211582764983177,
"step": 1630
},
{
"epoch": 164.0,
"grad_norm": 0.35546875,
"learning_rate": 0.0002,
"loss": 0.04270436465740204,
"loss_d0": 0.04318705834448337,
"step": 1640
},
{
"epoch": 165.0,
"grad_norm": 0.33984375,
"learning_rate": 0.0002,
"loss": 0.04538227915763855,
"loss_d0": 0.04579868800938129,
"step": 1650
},
{
"epoch": 166.0,
"grad_norm": 0.451171875,
"learning_rate": 0.0002,
"loss": 0.04416545033454895,
"loss_d0": 0.04440648853778839,
"step": 1660
},
{
"epoch": 167.0,
"grad_norm": 0.41796875,
"learning_rate": 0.0002,
"loss": 0.0445246160030365,
"loss_d0": 0.04493884444236755,
"step": 1670
},
{
"epoch": 168.0,
"grad_norm": 0.287109375,
"learning_rate": 0.0002,
"loss": 0.04588344693183899,
"loss_d0": 0.046438657119870184,
"step": 1680
},
{
"epoch": 169.0,
"grad_norm": 0.275390625,
"learning_rate": 0.0002,
"loss": 0.04363830387592316,
"loss_d0": 0.04401036873459816,
"step": 1690
},
{
"epoch": 170.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.041232901811599734,
"loss_d0": 0.04151614680886269,
"step": 1700
},
{
"epoch": 171.0,
"grad_norm": 0.2412109375,
"learning_rate": 0.0002,
"loss": 0.03885977864265442,
"loss_d0": 0.039185041561722755,
"step": 1710
},
{
"epoch": 172.0,
"grad_norm": 0.314453125,
"learning_rate": 0.0002,
"loss": 0.0402399480342865,
"loss_d0": 0.040488839522004126,
"step": 1720
},
{
"epoch": 173.0,
"grad_norm": 0.326171875,
"learning_rate": 0.0002,
"loss": 0.04318315982818603,
"loss_d0": 0.04342842325568199,
"step": 1730
},
{
"epoch": 174.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.040678870677947995,
"loss_d0": 0.04102524146437645,
"step": 1740
},
{
"epoch": 175.0,
"grad_norm": 0.341796875,
"learning_rate": 0.0002,
"loss": 0.040768089890480044,
"loss_d0": 0.04122583419084549,
"step": 1750
},
{
"epoch": 176.0,
"grad_norm": 0.33984375,
"learning_rate": 0.0002,
"loss": 0.042663860321044925,
"loss_d0": 0.04297072477638721,
"step": 1760
},
{
"epoch": 177.0,
"grad_norm": 0.412109375,
"learning_rate": 0.0002,
"loss": 0.04289932250976562,
"loss_d0": 0.04324173927307129,
"step": 1770
},
{
"epoch": 178.0,
"grad_norm": 0.392578125,
"learning_rate": 0.0002,
"loss": 0.04065240621566772,
"loss_d0": 0.04108826108276844,
"step": 1780
},
{
"epoch": 179.0,
"grad_norm": 0.310546875,
"learning_rate": 0.0002,
"loss": 0.04117431342601776,
"loss_d0": 0.04153142869472504,
"step": 1790
},
{
"epoch": 180.0,
"grad_norm": 0.23828125,
"learning_rate": 0.0002,
"loss": 0.03892770111560821,
"loss_d0": 0.03927576504647732,
"step": 1800
},
{
"epoch": 181.0,
"grad_norm": 0.396484375,
"learning_rate": 0.0002,
"loss": 0.04309303760528564,
"loss_d0": 0.04333780445158482,
"step": 1810
},
{
"epoch": 182.0,
"grad_norm": 0.3515625,
"learning_rate": 0.0002,
"loss": 0.04459149837493896,
"loss_d0": 0.04491332247853279,
"step": 1820
},
{
"epoch": 183.0,
"grad_norm": 0.48046875,
"learning_rate": 0.0002,
"loss": 0.046898290514945984,
"loss_d0": 0.047189544141292575,
"step": 1830
},
{
"epoch": 184.0,
"grad_norm": 0.353515625,
"learning_rate": 0.0002,
"loss": 0.04611060917377472,
"loss_d0": 0.04624026641249657,
"step": 1840
},
{
"epoch": 185.0,
"grad_norm": 0.357421875,
"learning_rate": 0.0002,
"loss": 0.04319146275520325,
"loss_d0": 0.043569745123386384,
"step": 1850
},
{
"epoch": 186.0,
"grad_norm": 0.59375,
"learning_rate": 0.0002,
"loss": 0.04202109277248382,
"loss_d0": 0.04252335019409657,
"step": 1860
},
{
"epoch": 187.0,
"grad_norm": 0.294921875,
"learning_rate": 0.0002,
"loss": 0.04427561163902283,
"loss_d0": 0.04493751563131809,
"step": 1870
},
{
"epoch": 188.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0002,
"loss": 0.04414681792259216,
"loss_d0": 0.044181046262383464,
"step": 1880
},
{
"epoch": 189.0,
"grad_norm": 0.314453125,
"learning_rate": 0.0002,
"loss": 0.05471844673156738,
"loss_d0": 0.05483967214822769,
"step": 1890
},
{
"epoch": 190.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.04385135769844055,
"loss_d0": 0.04406722001731396,
"step": 1900
},
{
"epoch": 191.0,
"grad_norm": 0.66015625,
"learning_rate": 0.0002,
"loss": 0.04954864680767059,
"loss_d0": 0.04996456205844879,
"step": 1910
},
{
"epoch": 192.0,
"grad_norm": 0.42578125,
"learning_rate": 0.0002,
"loss": 0.04157604873180389,
"loss_d0": 0.042001275718212126,
"step": 1920
},
{
"epoch": 193.0,
"grad_norm": 0.61328125,
"learning_rate": 0.0002,
"loss": 0.03958460390567779,
"loss_d0": 0.040064525604248044,
"step": 1930
},
{
"epoch": 194.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.03781185150146484,
"loss_d0": 0.03814610652625561,
"step": 1940
},
{
"epoch": 195.0,
"grad_norm": 0.3359375,
"learning_rate": 0.0002,
"loss": 0.038428235054016116,
"loss_d0": 0.038876712694764136,
"step": 1950
},
{
"epoch": 196.0,
"grad_norm": 0.44140625,
"learning_rate": 0.0002,
"loss": 0.039642858505249026,
"loss_d0": 0.040048804879188535,
"step": 1960
},
{
"epoch": 197.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.040806761384010314,
"loss_d0": 0.04136274456977844,
"step": 1970
},
{
"epoch": 198.0,
"grad_norm": 0.328125,
"learning_rate": 0.0002,
"loss": 0.040471208095550534,
"loss_d0": 0.04059889316558838,
"step": 1980
},
{
"epoch": 199.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.039885866641998294,
"loss_d0": 0.040052902325987814,
"step": 1990
},
{
"epoch": 200.0,
"grad_norm": 0.4609375,
"learning_rate": 0.0002,
"loss": 0.0434266984462738,
"loss_d0": 0.043684659898281096,
"step": 2000
},
{
"epoch": 200.0,
"eval_loss": 10.189282417297363,
"eval_runtime": 0.6861,
"eval_samples_per_second": 728.743,
"eval_steps_per_second": 72.874,
"step": 2000
},
{
"epoch": 201.0,
"grad_norm": 0.98046875,
"learning_rate": 0.0002,
"loss": 0.04809910655021667,
"loss_d0": 0.04849519394338131,
"step": 2010
},
{
"epoch": 202.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.041577893495559695,
"loss_d0": 0.041943120583891866,
"step": 2020
},
{
"epoch": 203.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0002,
"loss": 0.04084473252296448,
"loss_d0": 0.04086658768355846,
"step": 2030
},
{
"epoch": 204.0,
"grad_norm": 0.322265625,
"learning_rate": 0.0002,
"loss": 0.03976602852344513,
"loss_d0": 0.03992565609514713,
"step": 2040
},
{
"epoch": 205.0,
"grad_norm": 0.2734375,
"learning_rate": 0.0002,
"loss": 0.04404784142971039,
"loss_d0": 0.043900683894753455,
"step": 2050
},
{
"epoch": 206.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.04731652736663818,
"loss_d0": 0.04724227301776409,
"step": 2060
},
{
"epoch": 207.0,
"grad_norm": 0.2421875,
"learning_rate": 0.0002,
"loss": 0.04152216017246246,
"loss_d0": 0.04180925637483597,
"step": 2070
},
{
"epoch": 208.0,
"grad_norm": 0.314453125,
"learning_rate": 0.0002,
"loss": 0.04118179380893707,
"loss_d0": 0.04148880951106548,
"step": 2080
},
{
"epoch": 209.0,
"grad_norm": 0.412109375,
"learning_rate": 0.0002,
"loss": 0.04137121140956879,
"loss_d0": 0.04159573912620544,
"step": 2090
},
{
"epoch": 210.0,
"grad_norm": 0.447265625,
"learning_rate": 0.0002,
"loss": 0.044093775749206546,
"loss_d0": 0.0443379782140255,
"step": 2100
},
{
"epoch": 211.0,
"grad_norm": 0.470703125,
"learning_rate": 0.0002,
"loss": 0.04798535108566284,
"loss_d0": 0.04776673950254917,
"step": 2110
},
{
"epoch": 212.0,
"grad_norm": 0.419921875,
"learning_rate": 0.0002,
"loss": 0.05737084746360779,
"loss_d0": 0.05612550266087055,
"step": 2120
},
{
"epoch": 213.0,
"grad_norm": 0.39453125,
"learning_rate": 0.0002,
"loss": 0.046094492077827454,
"loss_d0": 0.04582822136580944,
"step": 2130
},
{
"epoch": 214.0,
"grad_norm": 0.271484375,
"learning_rate": 0.0002,
"loss": 0.04114535450935364,
"loss_d0": 0.04123819507658481,
"step": 2140
},
{
"epoch": 215.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.039699801802635194,
"loss_d0": 0.040003697574138644,
"step": 2150
},
{
"epoch": 216.0,
"grad_norm": 0.291015625,
"learning_rate": 0.0002,
"loss": 0.03862698972225189,
"loss_d0": 0.03893596157431602,
"step": 2160
},
{
"epoch": 217.0,
"grad_norm": 0.255859375,
"learning_rate": 0.0002,
"loss": 0.043020579218864444,
"loss_d0": 0.0432659212499857,
"step": 2170
},
{
"epoch": 218.0,
"grad_norm": 0.294921875,
"learning_rate": 0.0002,
"loss": 0.04398588240146637,
"loss_d0": 0.04368347264826298,
"step": 2180
},
{
"epoch": 219.0,
"grad_norm": 0.390625,
"learning_rate": 0.0002,
"loss": 0.040211325883865355,
"loss_d0": 0.040444132313132285,
"step": 2190
},
{
"epoch": 220.0,
"grad_norm": 0.26171875,
"learning_rate": 0.0002,
"loss": 0.04232992231845856,
"loss_d0": 0.04273700416088104,
"step": 2200
},
{
"epoch": 221.0,
"grad_norm": 0.267578125,
"learning_rate": 0.0002,
"loss": 0.03988331258296966,
"loss_d0": 0.040199489891529085,
"step": 2210
},
{
"epoch": 222.0,
"grad_norm": 0.279296875,
"learning_rate": 0.0002,
"loss": 0.03946065902709961,
"loss_d0": 0.03989113420248032,
"step": 2220
},
{
"epoch": 223.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.038988009095191956,
"loss_d0": 0.03922968059778213,
"step": 2230
},
{
"epoch": 224.0,
"grad_norm": 0.5234375,
"learning_rate": 0.0002,
"loss": 0.039998382329940796,
"loss_d0": 0.040379610285162924,
"step": 2240
},
{
"epoch": 225.0,
"grad_norm": 0.337890625,
"learning_rate": 0.0002,
"loss": 0.041840368509292604,
"loss_d0": 0.042378640919923785,
"step": 2250
},
{
"epoch": 226.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.04227463901042938,
"loss_d0": 0.04284324869513512,
"step": 2260
},
{
"epoch": 227.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.04585360586643219,
"loss_d0": 0.045701490342617036,
"step": 2270
},
{
"epoch": 228.0,
"grad_norm": 0.279296875,
"learning_rate": 0.0002,
"loss": 0.0420985072851181,
"loss_d0": 0.042375285923480985,
"step": 2280
},
{
"epoch": 229.0,
"grad_norm": 0.279296875,
"learning_rate": 0.0002,
"loss": 0.04348675310611725,
"loss_d0": 0.043607931956648825,
"step": 2290
},
{
"epoch": 230.0,
"grad_norm": 0.447265625,
"learning_rate": 0.0002,
"loss": 0.043425142765045166,
"loss_d0": 0.04386321604251862,
"step": 2300
},
{
"epoch": 231.0,
"grad_norm": 0.34375,
"learning_rate": 0.0002,
"loss": 0.05011132955551147,
"loss_d0": 0.05058671832084656,
"step": 2310
},
{
"epoch": 232.0,
"grad_norm": 0.337890625,
"learning_rate": 0.0002,
"loss": 0.04243658483028412,
"loss_d0": 0.04289327785372734,
"step": 2320
},
{
"epoch": 233.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.042480701208114625,
"loss_d0": 0.04278766848146916,
"step": 2330
},
{
"epoch": 234.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.03948282897472381,
"loss_d0": 0.03976398035883903,
"step": 2340
},
{
"epoch": 235.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.03911280632019043,
"loss_d0": 0.03939221054315567,
"step": 2350
},
{
"epoch": 236.0,
"grad_norm": 0.240234375,
"learning_rate": 0.0002,
"loss": 0.04534493386745453,
"loss_d0": 0.0454285766929388,
"step": 2360
},
{
"epoch": 237.0,
"grad_norm": 0.322265625,
"learning_rate": 0.0002,
"loss": 0.04498372673988342,
"loss_d0": 0.04524303488433361,
"step": 2370
},
{
"epoch": 238.0,
"grad_norm": 0.306640625,
"learning_rate": 0.0002,
"loss": 0.04036388099193573,
"loss_d0": 0.04051109738647938,
"step": 2380
},
{
"epoch": 239.0,
"grad_norm": 0.255859375,
"learning_rate": 0.0002,
"loss": 0.03902736306190491,
"loss_d0": 0.039412683621048925,
"step": 2390
},
{
"epoch": 240.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.03764301538467407,
"loss_d0": 0.037954670190811154,
"step": 2400
},
{
"epoch": 241.0,
"grad_norm": 0.283203125,
"learning_rate": 0.0002,
"loss": 0.03973473310470581,
"loss_d0": 0.03991707712411881,
"step": 2410
},
{
"epoch": 242.0,
"grad_norm": 0.236328125,
"learning_rate": 0.0002,
"loss": 0.03933502435684204,
"loss_d0": 0.03948218524456024,
"step": 2420
},
{
"epoch": 243.0,
"grad_norm": 0.408203125,
"learning_rate": 0.0002,
"loss": 0.04040732085704803,
"loss_d0": 0.040525125712156294,
"step": 2430
},
{
"epoch": 244.0,
"grad_norm": 0.40625,
"learning_rate": 0.0002,
"loss": 0.04274722635746002,
"loss_d0": 0.04287994578480721,
"step": 2440
},
{
"epoch": 245.0,
"grad_norm": 0.23828125,
"learning_rate": 0.0002,
"loss": 0.04024344384670257,
"loss_d0": 0.04048874229192734,
"step": 2450
},
{
"epoch": 246.0,
"grad_norm": 0.30859375,
"learning_rate": 0.0002,
"loss": 0.040749162435531616,
"loss_d0": 0.04102331958711147,
"step": 2460
},
{
"epoch": 247.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0002,
"loss": 0.0426899254322052,
"loss_d0": 0.04291442297399044,
"step": 2470
},
{
"epoch": 248.0,
"grad_norm": 0.396484375,
"learning_rate": 0.0002,
"loss": 0.039026832580566405,
"loss_d0": 0.03924530446529388,
"step": 2480
},
{
"epoch": 249.0,
"grad_norm": 0.296875,
"learning_rate": 0.0002,
"loss": 0.0383389413356781,
"loss_d0": 0.03859546259045601,
"step": 2490
},
{
"epoch": 250.0,
"grad_norm": 0.369140625,
"learning_rate": 0.0002,
"loss": 0.038316363096237184,
"loss_d0": 0.03865836299955845,
"step": 2500
},
{
"epoch": 250.0,
"eval_loss": 10.885120391845703,
"eval_runtime": 0.6922,
"eval_samples_per_second": 722.309,
"eval_steps_per_second": 72.231,
"step": 2500
},
{
"epoch": 251.0,
"grad_norm": 0.2294921875,
"learning_rate": 0.0002,
"loss": 0.03913344144821167,
"loss_d0": 0.039387579634785654,
"step": 2510
},
{
"epoch": 252.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.03892936110496521,
"loss_d0": 0.03930997662246227,
"step": 2520
},
{
"epoch": 253.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0002,
"loss": 0.037770673632621765,
"loss_d0": 0.038118017837405205,
"step": 2530
},
{
"epoch": 254.0,
"grad_norm": 0.2734375,
"learning_rate": 0.0002,
"loss": 0.037772169709205626,
"loss_d0": 0.03807541318237782,
"step": 2540
},
{
"epoch": 255.0,
"grad_norm": 0.7890625,
"learning_rate": 0.0002,
"loss": 0.03999180793762207,
"loss_d0": 0.04036150127649307,
"step": 2550
},
{
"epoch": 256.0,
"grad_norm": 0.4765625,
"learning_rate": 0.0002,
"loss": 0.052530336380004886,
"loss_d0": 0.05343040004372597,
"step": 2560
},
{
"epoch": 257.0,
"grad_norm": 0.40625,
"learning_rate": 0.0002,
"loss": 0.04317347705364227,
"loss_d0": 0.04373372159898281,
"step": 2570
},
{
"epoch": 258.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.03994796574115753,
"loss_d0": 0.04031400717794895,
"step": 2580
},
{
"epoch": 259.0,
"grad_norm": 0.267578125,
"learning_rate": 0.0002,
"loss": 0.038785922527313235,
"loss_d0": 0.039084702357649805,
"step": 2590
},
{
"epoch": 260.0,
"grad_norm": 0.373046875,
"learning_rate": 0.0002,
"loss": 0.039844360947608945,
"loss_d0": 0.040255676582455636,
"step": 2600
},
{
"epoch": 261.0,
"grad_norm": 0.337890625,
"learning_rate": 0.0002,
"loss": 0.04429091215133667,
"loss_d0": 0.04446314424276352,
"step": 2610
},
{
"epoch": 262.0,
"grad_norm": 0.4453125,
"learning_rate": 0.0002,
"loss": 0.04119807183742523,
"loss_d0": 0.0414251770824194,
"step": 2620
},
{
"epoch": 263.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.03950552344322204,
"loss_d0": 0.039706287905573845,
"step": 2630
},
{
"epoch": 264.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.0390357106924057,
"loss_d0": 0.039455119892954825,
"step": 2640
},
{
"epoch": 265.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.03879677653312683,
"loss_d0": 0.0390281654894352,
"step": 2650
},
{
"epoch": 266.0,
"grad_norm": 0.51953125,
"learning_rate": 0.0002,
"loss": 0.039793723821640016,
"loss_d0": 0.04023738354444504,
"step": 2660
},
{
"epoch": 267.0,
"grad_norm": 0.2412109375,
"learning_rate": 0.0002,
"loss": 0.037570255994796756,
"loss_d0": 0.037908059731125834,
"step": 2670
},
{
"epoch": 268.0,
"grad_norm": 0.23046875,
"learning_rate": 0.0002,
"loss": 0.03747315108776093,
"loss_d0": 0.03773516528308392,
"step": 2680
},
{
"epoch": 269.0,
"grad_norm": 0.322265625,
"learning_rate": 0.0002,
"loss": 0.03748213052749634,
"loss_d0": 0.03784133456647396,
"step": 2690
},
{
"epoch": 270.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.0373074471950531,
"loss_d0": 0.03761020861566067,
"step": 2700
},
{
"epoch": 271.0,
"grad_norm": 0.26171875,
"learning_rate": 0.0002,
"loss": 0.03723294138908386,
"loss_d0": 0.03757789246737957,
"step": 2710
},
{
"epoch": 272.0,
"grad_norm": 0.25,
"learning_rate": 0.0002,
"loss": 0.0372806191444397,
"loss_d0": 0.03759502917528153,
"step": 2720
},
{
"epoch": 273.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.037115806341171266,
"loss_d0": 0.037448635697364806,
"step": 2730
},
{
"epoch": 274.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.0369686633348465,
"loss_d0": 0.037282370403409,
"step": 2740
},
{
"epoch": 275.0,
"grad_norm": 0.283203125,
"learning_rate": 0.0002,
"loss": 0.03698193728923797,
"loss_d0": 0.03725597597658634,
"step": 2750
},
{
"epoch": 276.0,
"grad_norm": 0.306640625,
"learning_rate": 0.0002,
"loss": 0.037520098686218264,
"loss_d0": 0.03782733231782913,
"step": 2760
},
{
"epoch": 277.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.03804133534431457,
"loss_d0": 0.038362907245755196,
"step": 2770
},
{
"epoch": 278.0,
"grad_norm": 0.330078125,
"learning_rate": 0.0002,
"loss": 0.037969928979873654,
"loss_d0": 0.03825241588056087,
"step": 2780
},
{
"epoch": 279.0,
"grad_norm": 0.26953125,
"learning_rate": 0.0002,
"loss": 0.037792128324508664,
"loss_d0": 0.03807285577058792,
"step": 2790
},
{
"epoch": 280.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.03775832355022431,
"loss_d0": 0.03801813460886479,
"step": 2800
},
{
"epoch": 281.0,
"grad_norm": 0.26171875,
"learning_rate": 0.0002,
"loss": 0.03686901330947876,
"loss_d0": 0.037187918275594714,
"step": 2810
},
{
"epoch": 282.0,
"grad_norm": 0.271484375,
"learning_rate": 0.0002,
"loss": 0.03696819245815277,
"loss_d0": 0.037317240983247756,
"step": 2820
},
{
"epoch": 283.0,
"grad_norm": 0.30078125,
"learning_rate": 0.0002,
"loss": 0.037478744983673096,
"loss_d0": 0.03774147853255272,
"step": 2830
},
{
"epoch": 284.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0002,
"loss": 0.040481334924697875,
"loss_d0": 0.04085813723504543,
"step": 2840
},
{
"epoch": 285.0,
"grad_norm": 0.26171875,
"learning_rate": 0.0002,
"loss": 0.04263193607330322,
"loss_d0": 0.04322305843234062,
"step": 2850
},
{
"epoch": 286.0,
"grad_norm": 0.400390625,
"learning_rate": 0.0002,
"loss": 0.04221307337284088,
"loss_d0": 0.04266056790947914,
"step": 2860
},
{
"epoch": 287.0,
"grad_norm": 0.26171875,
"learning_rate": 0.0002,
"loss": 0.04012386798858643,
"loss_d0": 0.04027114436030388,
"step": 2870
},
{
"epoch": 288.0,
"grad_norm": 0.6328125,
"learning_rate": 0.0002,
"loss": 0.04546632468700409,
"loss_d0": 0.045037579536437986,
"step": 2880
},
{
"epoch": 289.0,
"grad_norm": 0.333984375,
"learning_rate": 0.0002,
"loss": 0.04220171272754669,
"loss_d0": 0.04206137731671333,
"step": 2890
},
{
"epoch": 290.0,
"grad_norm": 0.349609375,
"learning_rate": 0.0002,
"loss": 0.03849797248840332,
"loss_d0": 0.03877202942967415,
"step": 2900
},
{
"epoch": 291.0,
"grad_norm": 0.275390625,
"learning_rate": 0.0002,
"loss": 0.03816980719566345,
"loss_d0": 0.038458903506398204,
"step": 2910
},
{
"epoch": 292.0,
"grad_norm": 0.349609375,
"learning_rate": 0.0002,
"loss": 0.039384329319000246,
"loss_d0": 0.03953510671854019,
"step": 2920
},
{
"epoch": 293.0,
"grad_norm": 0.267578125,
"learning_rate": 0.0002,
"loss": 0.03914114236831665,
"loss_d0": 0.03927744776010513,
"step": 2930
},
{
"epoch": 294.0,
"grad_norm": 0.462890625,
"learning_rate": 0.0002,
"loss": 0.037627822160720824,
"loss_d0": 0.03790898621082306,
"step": 2940
},
{
"epoch": 295.0,
"grad_norm": 0.25,
"learning_rate": 0.0002,
"loss": 0.037601858377456665,
"loss_d0": 0.03786470964550972,
"step": 2950
},
{
"epoch": 296.0,
"grad_norm": 0.275390625,
"learning_rate": 0.0002,
"loss": 0.03714042901992798,
"loss_d0": 0.03739793673157692,
"step": 2960
},
{
"epoch": 297.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.037202891707420346,
"loss_d0": 0.03755674138665199,
"step": 2970
},
{
"epoch": 298.0,
"grad_norm": 0.390625,
"learning_rate": 0.0002,
"loss": 0.03793781399726868,
"loss_d0": 0.03822383023798466,
"step": 2980
},
{
"epoch": 299.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.038024306297302246,
"loss_d0": 0.03834304548799992,
"step": 2990
},
{
"epoch": 300.0,
"grad_norm": 0.30859375,
"learning_rate": 0.0002,
"loss": 0.03749783039093017,
"loss_d0": 0.03785845525562763,
"step": 3000
},
{
"epoch": 300.0,
"eval_loss": 11.146424293518066,
"eval_runtime": 0.6864,
"eval_samples_per_second": 728.387,
"eval_steps_per_second": 72.839,
"step": 3000
},
{
"epoch": 301.0,
"grad_norm": 0.33984375,
"learning_rate": 0.0002,
"loss": 0.038469833135604856,
"loss_d0": 0.03886338211596012,
"step": 3010
},
{
"epoch": 302.0,
"grad_norm": 0.271484375,
"learning_rate": 0.0002,
"loss": 0.04090344309806824,
"loss_d0": 0.04116071537137032,
"step": 3020
},
{
"epoch": 303.0,
"grad_norm": 0.36328125,
"learning_rate": 0.0002,
"loss": 0.040218299627304076,
"loss_d0": 0.04055294916033745,
"step": 3030
},
{
"epoch": 304.0,
"grad_norm": 0.462890625,
"learning_rate": 0.0002,
"loss": 0.04446632564067841,
"loss_d0": 0.04473265036940575,
"step": 3040
},
{
"epoch": 305.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0002,
"loss": 0.04086683392524719,
"loss_d0": 0.04126431494951248,
"step": 3050
},
{
"epoch": 306.0,
"grad_norm": 0.34765625,
"learning_rate": 0.0002,
"loss": 0.048713570833206175,
"loss_d0": 0.04855058118700981,
"step": 3060
},
{
"epoch": 307.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.046805566549301146,
"loss_d0": 0.04710230566561222,
"step": 3070
},
{
"epoch": 308.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.03934524655342102,
"loss_d0": 0.039762004464864734,
"step": 3080
},
{
"epoch": 309.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.03915072977542877,
"loss_d0": 0.03944177031517029,
"step": 3090
},
{
"epoch": 310.0,
"grad_norm": 0.3671875,
"learning_rate": 0.0002,
"loss": 0.04065064489841461,
"loss_d0": 0.04100923091173172,
"step": 3100
},
{
"epoch": 311.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.037496811151504515,
"loss_d0": 0.03782621137797833,
"step": 3110
},
{
"epoch": 312.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.037671661376953124,
"loss_d0": 0.038110511004924776,
"step": 3120
},
{
"epoch": 313.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.0369190514087677,
"loss_d0": 0.03722741194069386,
"step": 3130
},
{
"epoch": 314.0,
"grad_norm": 0.326171875,
"learning_rate": 0.0002,
"loss": 0.036951732635498044,
"loss_d0": 0.03724584951996803,
"step": 3140
},
{
"epoch": 315.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.03681913614273071,
"loss_d0": 0.03710218816995621,
"step": 3150
},
{
"epoch": 316.0,
"grad_norm": 0.2578125,
"learning_rate": 0.0002,
"loss": 0.036740392446517944,
"loss_d0": 0.03705124892294407,
"step": 3160
},
{
"epoch": 317.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.03693808317184448,
"loss_d0": 0.03728438019752502,
"step": 3170
},
{
"epoch": 318.0,
"grad_norm": 0.53125,
"learning_rate": 0.0002,
"loss": 0.03715154826641083,
"loss_d0": 0.03744188435375691,
"step": 3180
},
{
"epoch": 319.0,
"grad_norm": 0.271484375,
"learning_rate": 0.0002,
"loss": 0.03702226579189301,
"loss_d0": 0.037342607975006104,
"step": 3190
},
{
"epoch": 320.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.03733651638031006,
"loss_d0": 0.037650084123015404,
"step": 3200
},
{
"epoch": 321.0,
"grad_norm": 0.283203125,
"learning_rate": 0.0002,
"loss": 0.03700153231620788,
"loss_d0": 0.03734440542757511,
"step": 3210
},
{
"epoch": 322.0,
"grad_norm": 0.2392578125,
"learning_rate": 0.0002,
"loss": 0.03706228733062744,
"loss_d0": 0.037381384521722794,
"step": 3220
},
{
"epoch": 323.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.036909821629524234,
"loss_d0": 0.03724093846976757,
"step": 3230
},
{
"epoch": 324.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.03717797100543976,
"loss_d0": 0.03748043179512024,
"step": 3240
},
{
"epoch": 325.0,
"grad_norm": 0.25390625,
"learning_rate": 0.0002,
"loss": 0.03671996295452118,
"loss_d0": 0.03704087920486927,
"step": 3250
},
{
"epoch": 326.0,
"grad_norm": 0.251953125,
"learning_rate": 0.0002,
"loss": 0.03725916743278503,
"loss_d0": 0.03758593760430813,
"step": 3260
},
{
"epoch": 327.0,
"grad_norm": 0.24609375,
"learning_rate": 0.0002,
"loss": 0.037097156047821045,
"loss_d0": 0.03740981854498386,
"step": 3270
},
{
"epoch": 328.0,
"grad_norm": 0.3515625,
"learning_rate": 0.0002,
"loss": 0.037044870853424075,
"loss_d0": 0.03732852153480053,
"step": 3280
},
{
"epoch": 329.0,
"grad_norm": 0.26953125,
"learning_rate": 0.0002,
"loss": 0.037512749433517456,
"loss_d0": 0.03780471496284008,
"step": 3290
},
{
"epoch": 330.0,
"grad_norm": 0.423828125,
"learning_rate": 0.0002,
"loss": 0.03871379792690277,
"loss_d0": 0.03908683769404888,
"step": 3300
},
{
"epoch": 331.0,
"grad_norm": 0.341796875,
"learning_rate": 0.0002,
"loss": 0.038561710715293886,
"loss_d0": 0.038924089074134825,
"step": 3310
},
{
"epoch": 332.0,
"grad_norm": 0.88671875,
"learning_rate": 0.0002,
"loss": 0.04439712464809418,
"loss_d0": 0.04472551830112934,
"step": 3320
},
{
"epoch": 333.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.04164994060993195,
"loss_d0": 0.041940994933247565,
"step": 3330
},
{
"epoch": 334.0,
"grad_norm": 0.412109375,
"learning_rate": 0.0002,
"loss": 0.040698114037513736,
"loss_d0": 0.04085970595479012,
"step": 3340
},
{
"epoch": 335.0,
"grad_norm": 0.326171875,
"learning_rate": 0.0002,
"loss": 0.03932968080043793,
"loss_d0": 0.03956272974610329,
"step": 3350
},
{
"epoch": 336.0,
"grad_norm": 0.244140625,
"learning_rate": 0.0002,
"loss": 0.0391847550868988,
"loss_d0": 0.03948534913361072,
"step": 3360
},
{
"epoch": 337.0,
"grad_norm": 0.328125,
"learning_rate": 0.0002,
"loss": 0.047348752617836,
"loss_d0": 0.046868476271629336,
"step": 3370
},
{
"epoch": 338.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0002,
"loss": 0.0379530131816864,
"loss_d0": 0.0382020853459835,
"step": 3380
},
{
"epoch": 339.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.04256679117679596,
"loss_d0": 0.04314272291958332,
"step": 3390
},
{
"epoch": 340.0,
"grad_norm": 0.423828125,
"learning_rate": 0.0002,
"loss": 0.056311219930648804,
"loss_d0": 0.05685732625424862,
"step": 3400
},
{
"epoch": 341.0,
"grad_norm": 0.337890625,
"learning_rate": 0.0002,
"loss": 0.04154669046401978,
"loss_d0": 0.04171246141195297,
"step": 3410
},
{
"epoch": 342.0,
"grad_norm": 0.287109375,
"learning_rate": 0.0002,
"loss": 0.04307686388492584,
"loss_d0": 0.04326325096189976,
"step": 3420
},
{
"epoch": 343.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.04260125458240509,
"loss_d0": 0.04288202822208405,
"step": 3430
},
{
"epoch": 344.0,
"grad_norm": 0.28125,
"learning_rate": 0.0002,
"loss": 0.04342672824859619,
"loss_d0": 0.043964647501707074,
"step": 3440
},
{
"epoch": 345.0,
"grad_norm": 0.30859375,
"learning_rate": 0.0002,
"loss": 0.041104856133461,
"loss_d0": 0.04152129665017128,
"step": 3450
},
{
"epoch": 346.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0002,
"loss": 0.0384725421667099,
"loss_d0": 0.03876579888164997,
"step": 3460
},
{
"epoch": 347.0,
"grad_norm": 0.380859375,
"learning_rate": 0.0002,
"loss": 0.04155246317386627,
"loss_d0": 0.04204757548868656,
"step": 3470
},
{
"epoch": 348.0,
"grad_norm": 0.376953125,
"learning_rate": 0.0002,
"loss": 0.04175013601779938,
"loss_d0": 0.04206421263515949,
"step": 3480
},
{
"epoch": 349.0,
"grad_norm": 0.3984375,
"learning_rate": 0.0002,
"loss": 0.03953442573547363,
"loss_d0": 0.03970748074352741,
"step": 3490
},
{
"epoch": 350.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.03868723213672638,
"loss_d0": 0.03908204138278961,
"step": 3500
},
{
"epoch": 350.0,
"eval_loss": 10.020953178405762,
"eval_runtime": 0.6866,
"eval_samples_per_second": 728.252,
"eval_steps_per_second": 72.825,
"step": 3500
},
{
"epoch": 351.0,
"grad_norm": 0.333984375,
"learning_rate": 0.0002,
"loss": 0.04133128821849823,
"loss_d0": 0.041662900149822234,
"step": 3510
},
{
"epoch": 352.0,
"grad_norm": 0.310546875,
"learning_rate": 0.0002,
"loss": 0.04269187152385712,
"loss_d0": 0.04334259107708931,
"step": 3520
},
{
"epoch": 353.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0002,
"loss": 0.03852761685848236,
"loss_d0": 0.03878095783293247,
"step": 3530
},
{
"epoch": 354.0,
"grad_norm": 0.32421875,
"learning_rate": 0.0002,
"loss": 0.04052022397518158,
"loss_d0": 0.04095298685133457,
"step": 3540
},
{
"epoch": 355.0,
"grad_norm": 0.2890625,
"learning_rate": 0.0002,
"loss": 0.04115490317344665,
"loss_d0": 0.041552980244159696,
"step": 3550
},
{
"epoch": 356.0,
"grad_norm": 0.39453125,
"learning_rate": 0.0002,
"loss": 0.042296862602233885,
"loss_d0": 0.04288714602589607,
"step": 3560
},
{
"epoch": 357.0,
"grad_norm": 0.34765625,
"learning_rate": 0.0002,
"loss": 0.041248321533203125,
"loss_d0": 0.04149158634245396,
"step": 3570
},
{
"epoch": 358.0,
"grad_norm": 0.30078125,
"learning_rate": 0.0002,
"loss": 0.04039554595947266,
"loss_d0": 0.04083836451172829,
"step": 3580
},
{
"epoch": 359.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.039502471685409546,
"loss_d0": 0.04000399447977543,
"step": 3590
},
{
"epoch": 360.0,
"grad_norm": 0.60546875,
"learning_rate": 0.0002,
"loss": 0.043854904174804685,
"loss_d0": 0.04427521526813507,
"step": 3600
},
{
"epoch": 361.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.040008130669593814,
"loss_d0": 0.04016649015247822,
"step": 3610
},
{
"epoch": 362.0,
"grad_norm": 0.396484375,
"learning_rate": 0.0002,
"loss": 0.03769044280052185,
"loss_d0": 0.03798259571194649,
"step": 3620
},
{
"epoch": 363.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.039430353045463565,
"loss_d0": 0.039879053831100464,
"step": 3630
},
{
"epoch": 364.0,
"grad_norm": 0.33203125,
"learning_rate": 0.0002,
"loss": 0.03827457427978516,
"loss_d0": 0.03869166634976864,
"step": 3640
},
{
"epoch": 365.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.037254220247268675,
"loss_d0": 0.03753783367574215,
"step": 3650
},
{
"epoch": 366.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.036964389681816104,
"loss_d0": 0.037302806973457336,
"step": 3660
},
{
"epoch": 367.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.036746549606323245,
"loss_d0": 0.03708359859883785,
"step": 3670
},
{
"epoch": 368.0,
"grad_norm": 0.263671875,
"learning_rate": 0.0002,
"loss": 0.0370238333940506,
"loss_d0": 0.037373238056898114,
"step": 3680
},
{
"epoch": 369.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.03776443004608154,
"loss_d0": 0.038052943721413615,
"step": 3690
},
{
"epoch": 370.0,
"grad_norm": 0.40625,
"learning_rate": 0.0002,
"loss": 0.037547925114631654,
"loss_d0": 0.037854228913784024,
"step": 3700
},
{
"epoch": 371.0,
"grad_norm": 0.27734375,
"learning_rate": 0.0002,
"loss": 0.03720632791519165,
"loss_d0": 0.03753346242010593,
"step": 3710
},
{
"epoch": 372.0,
"grad_norm": 0.251953125,
"learning_rate": 0.0002,
"loss": 0.0377034991979599,
"loss_d0": 0.037998438253998755,
"step": 3720
},
{
"epoch": 373.0,
"grad_norm": 0.279296875,
"learning_rate": 0.0002,
"loss": 0.03689497411251068,
"loss_d0": 0.03722812980413437,
"step": 3730
},
{
"epoch": 374.0,
"grad_norm": 0.349609375,
"learning_rate": 0.0002,
"loss": 0.036770951747894284,
"loss_d0": 0.03706214055418968,
"step": 3740
},
{
"epoch": 375.0,
"grad_norm": 0.3671875,
"learning_rate": 0.0002,
"loss": 0.03705790340900421,
"loss_d0": 0.03742879740893841,
"step": 3750
},
{
"epoch": 376.0,
"grad_norm": 0.3203125,
"learning_rate": 0.0002,
"loss": 0.0367428719997406,
"loss_d0": 0.037059960514307023,
"step": 3760
},
{
"epoch": 377.0,
"grad_norm": 0.3359375,
"learning_rate": 0.0002,
"loss": 0.036790531873703,
"loss_d0": 0.03712297640740871,
"step": 3770
},
{
"epoch": 378.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.03700532913208008,
"loss_d0": 0.03728572316467762,
"step": 3780
},
{
"epoch": 379.0,
"grad_norm": 0.341796875,
"learning_rate": 0.0002,
"loss": 0.03674401640892029,
"loss_d0": 0.03708376474678517,
"step": 3790
},
{
"epoch": 380.0,
"grad_norm": 0.3828125,
"learning_rate": 0.0002,
"loss": 0.037165766954421996,
"loss_d0": 0.037469035014510155,
"step": 3800
},
{
"epoch": 381.0,
"grad_norm": 0.275390625,
"learning_rate": 0.0002,
"loss": 0.03820574879646301,
"loss_d0": 0.038526909053325654,
"step": 3810
},
{
"epoch": 382.0,
"grad_norm": 0.23828125,
"learning_rate": 0.0002,
"loss": 0.03871320784091949,
"loss_d0": 0.03914758861064911,
"step": 3820
},
{
"epoch": 383.0,
"grad_norm": 0.240234375,
"learning_rate": 0.0002,
"loss": 0.03733502924442291,
"loss_d0": 0.03770691566169262,
"step": 3830
},
{
"epoch": 384.0,
"grad_norm": 0.265625,
"learning_rate": 0.0002,
"loss": 0.03720555305480957,
"loss_d0": 0.03752491697669029,
"step": 3840
},
{
"epoch": 385.0,
"grad_norm": 0.275390625,
"learning_rate": 0.0002,
"loss": 0.039854270219802854,
"loss_d0": 0.04018958024680615,
"step": 3850
},
{
"epoch": 386.0,
"grad_norm": 0.2490234375,
"learning_rate": 0.0002,
"loss": 0.03766619563102722,
"loss_d0": 0.037941229343414304,
"step": 3860
},
{
"epoch": 387.0,
"grad_norm": 0.291015625,
"learning_rate": 0.0002,
"loss": 0.03740113973617554,
"loss_d0": 0.0377108845859766,
"step": 3870
},
{
"epoch": 388.0,
"grad_norm": 0.26953125,
"learning_rate": 0.0002,
"loss": 0.0372713029384613,
"loss_d0": 0.0375568337738514,
"step": 3880
},
{
"epoch": 389.0,
"grad_norm": 0.267578125,
"learning_rate": 0.0002,
"loss": 0.03675893843173981,
"loss_d0": 0.037108558043837545,
"step": 3890
},
{
"epoch": 390.0,
"grad_norm": 0.29296875,
"learning_rate": 0.0002,
"loss": 0.037296104431152347,
"loss_d0": 0.0375242929905653,
"step": 3900
},
{
"epoch": 391.0,
"grad_norm": 0.2578125,
"learning_rate": 0.0002,
"loss": 0.03685269951820373,
"loss_d0": 0.0371894758194685,
"step": 3910
},
{
"epoch": 392.0,
"grad_norm": 0.25,
"learning_rate": 0.0002,
"loss": 0.036673200130462644,
"loss_d0": 0.036988198012113574,
"step": 3920
},
{
"epoch": 393.0,
"grad_norm": 0.28515625,
"learning_rate": 0.0002,
"loss": 0.036616355180740356,
"loss_d0": 0.03695385381579399,
"step": 3930
},
{
"epoch": 394.0,
"grad_norm": 0.296875,
"learning_rate": 0.0002,
"loss": 0.036866238713264464,
"loss_d0": 0.03716698214411736,
"step": 3940
},
{
"epoch": 395.0,
"grad_norm": 0.298828125,
"learning_rate": 0.0002,
"loss": 0.03705916702747345,
"loss_d0": 0.03739931918680668,
"step": 3950
},
{
"epoch": 396.0,
"grad_norm": 0.287109375,
"learning_rate": 0.0002,
"loss": 0.0367851734161377,
"loss_d0": 0.037085448205471036,
"step": 3960
},
{
"epoch": 397.0,
"grad_norm": 0.2734375,
"learning_rate": 0.0002,
"loss": 0.03684147596359253,
"loss_d0": 0.03716961406171322,
"step": 3970
},
{
"epoch": 398.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0002,
"loss": 0.03675464391708374,
"loss_d0": 0.03707803189754486,
"step": 3980
},
{
"epoch": 399.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0002,
"loss": 0.03699171245098114,
"loss_d0": 0.03729988299310207,
"step": 3990
},
{
"epoch": 400.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0002,
"loss": 0.03692775666713714,
"loss_d0": 0.037251610308885574,
"step": 4000
},
{
"epoch": 400.0,
"eval_loss": 11.901162147521973,
"eval_runtime": 0.6874,
"eval_samples_per_second": 727.37,
"eval_steps_per_second": 72.737,
"step": 4000
},
{
"epoch": 401.0,
"grad_norm": 0.2578125,
"learning_rate": 0.0002,
"loss": 0.03675454556941986,
"loss_d0": 0.03709004819393158,
"step": 4010
},
{
"epoch": 402.0,
"grad_norm": 0.3359375,
"learning_rate": 0.0002,
"loss": 0.036843341588973996,
"loss_d0": 0.03714859746396541,
"step": 4020
},
{
"epoch": 403.0,
"grad_norm": 0.302734375,
"learning_rate": 0.0002,
"loss": 0.03684849143028259,
"loss_d0": 0.03714573718607426,
"step": 4030
},
{
"epoch": 404.0,
"grad_norm": 0.271484375,
"learning_rate": 0.0002,
"loss": 0.03681612014770508,
"loss_d0": 0.037144556641578674,
"step": 4040
},
{
"epoch": 405.0,
"grad_norm": 0.2431640625,
"learning_rate": 0.0002,
"loss": 0.03668281435966492,
"loss_d0": 0.03700208105146885,
"step": 4050
},
{
"epoch": 406.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0002,
"loss": 0.03672113716602325,
"loss_d0": 0.03706220649182797,
"step": 4060
},
{
"epoch": 407.0,
"grad_norm": 0.244140625,
"learning_rate": 0.0002,
"loss": 0.03671231269836426,
"loss_d0": 0.03705513551831245,
"step": 4070
},
{
"epoch": 408.0,
"grad_norm": 0.2353515625,
"learning_rate": 0.0002,
"loss": 0.0371286153793335,
"loss_d0": 0.037407181411981585,
"step": 4080
},
{
"epoch": 409.0,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 0.036925724148750304,
"loss_d0": 0.037227736040949824,
"step": 4090
},
{
"epoch": 410.0,
"grad_norm": 0.3125,
"learning_rate": 0.0002,
"loss": 0.03719911873340607,
"loss_d0": 0.03746572397649288,
"step": 4100
},
{
"epoch": 411.0,
"grad_norm": 0.31640625,
"learning_rate": 0.00019995559043291586,
"loss": 0.03746194541454315,
"loss_d0": 0.03776235654950142,
"step": 4110
},
{
"epoch": 412.0,
"grad_norm": 0.248046875,
"learning_rate": 0.0001998021321462845,
"loss": 0.03715557157993317,
"loss_d0": 0.03750314898788929,
"step": 4120
},
{
"epoch": 413.0,
"grad_norm": 0.2578125,
"learning_rate": 0.00019953926379459095,
"loss": 0.037144222855567934,
"loss_d0": 0.03752333410084248,
"step": 4130
},
{
"epoch": 414.0,
"grad_norm": 0.337890625,
"learning_rate": 0.00019916730564242994,
"loss": 0.037147408723831175,
"loss_d0": 0.03745650127530098,
"step": 4140
},
{
"epoch": 415.0,
"grad_norm": 0.62109375,
"learning_rate": 0.00019868671086351413,
"loss": 0.03663991689682007,
"loss_d0": 0.03694714643061161,
"step": 4150
},
{
"epoch": 416.0,
"grad_norm": 0.33984375,
"learning_rate": 0.00019809806498855166,
"loss": 0.03903592824935913,
"loss_d0": 0.03945017009973526,
"step": 4160
},
{
"epoch": 417.0,
"grad_norm": 0.8515625,
"learning_rate": 0.00019740208519186726,
"loss": 0.05429054498672485,
"loss_d0": 0.05633100271224976,
"step": 4170
},
{
"epoch": 418.0,
"grad_norm": 0.67578125,
"learning_rate": 0.0001965996194176357,
"loss": 0.05051107406616211,
"loss_d0": 0.05182218365371227,
"step": 4180
},
{
"epoch": 419.0,
"grad_norm": 0.357421875,
"learning_rate": 0.00019569164534679248,
"loss": 0.038011634349823,
"loss_d0": 0.038474849238991735,
"step": 4190
},
{
"epoch": 420.0,
"grad_norm": 0.318359375,
"learning_rate": 0.0001946792692058803,
"loss": 0.036821508407592775,
"loss_d0": 0.037107934802770616,
"step": 4200
},
{
"epoch": 421.0,
"grad_norm": 0.2890625,
"learning_rate": 0.00019356372441928221,
"loss": 0.03671710193157196,
"loss_d0": 0.03702742531895638,
"step": 4210
},
{
"epoch": 422.0,
"grad_norm": 0.267578125,
"learning_rate": 0.00019234637010648426,
"loss": 0.03749249279499054,
"loss_d0": 0.03791201822459698,
"step": 4220
},
{
"epoch": 423.0,
"grad_norm": 0.259765625,
"learning_rate": 0.00019102868942619743,
"loss": 0.039152055978775024,
"loss_d0": 0.0393472570925951,
"step": 4230
},
{
"epoch": 424.0,
"grad_norm": 0.310546875,
"learning_rate": 0.00018961228776935755,
"loss": 0.03920052945613861,
"loss_d0": 0.03954476937651634,
"step": 4240
},
{
"epoch": 425.0,
"grad_norm": 0.39453125,
"learning_rate": 0.00018809889080320357,
"loss": 0.04085721671581268,
"loss_d0": 0.0409322090446949,
"step": 4250
},
{
"epoch": 426.0,
"grad_norm": 0.419921875,
"learning_rate": 0.00018649034236881777,
"loss": 0.03923974931240082,
"loss_d0": 0.03957121372222901,
"step": 4260
},
{
"epoch": 427.0,
"grad_norm": 0.3828125,
"learning_rate": 0.00018478860223468955,
"loss": 0.03778021037578583,
"loss_d0": 0.03809101954102516,
"step": 4270
},
{
"epoch": 428.0,
"grad_norm": 0.46875,
"learning_rate": 0.0001829957437090394,
"loss": 0.03898613452911377,
"loss_d0": 0.03935887552797794,
"step": 4280
},
{
"epoch": 429.0,
"grad_norm": 0.259765625,
"learning_rate": 0.00018111395111381214,
"loss": 0.03973522186279297,
"loss_d0": 0.03986812345683575,
"step": 4290
},
{
"epoch": 430.0,
"grad_norm": 0.345703125,
"learning_rate": 0.00017914551712341713,
"loss": 0.038596144318580626,
"loss_d0": 0.03911666721105576,
"step": 4300
},
{
"epoch": 431.0,
"grad_norm": 0.279296875,
"learning_rate": 0.0001770928399714576,
"loss": 0.03771106004714966,
"loss_d0": 0.038051551580429076,
"step": 4310
},
{
"epoch": 432.0,
"grad_norm": 0.345703125,
"learning_rate": 0.0001749584205288526,
"loss": 0.03960946798324585,
"loss_d0": 0.03986733630299568,
"step": 4320
},
{
"epoch": 433.0,
"grad_norm": 0.57421875,
"learning_rate": 0.00017274485925691083,
"loss": 0.03941147327423096,
"loss_d0": 0.039736605063080785,
"step": 4330
},
{
"epoch": 434.0,
"grad_norm": 0.3671875,
"learning_rate": 0.00017045485303906913,
"loss": 0.0394733875989914,
"loss_d0": 0.03990803770720959,
"step": 4340
},
{
"epoch": 435.0,
"grad_norm": 0.3671875,
"learning_rate": 0.00016809119189515557,
"loss": 0.03905892372131348,
"loss_d0": 0.03944002017378807,
"step": 4350
},
{
"epoch": 436.0,
"grad_norm": 0.28125,
"learning_rate": 0.00016565675558217989,
"loss": 0.037955057621002194,
"loss_d0": 0.038193025067448615,
"step": 4360
},
{
"epoch": 437.0,
"grad_norm": 0.90625,
"learning_rate": 0.00016315451008579328,
"loss": 0.05242310762405396,
"loss_d0": 0.05061047412455082,
"step": 4370
},
{
"epoch": 438.0,
"grad_norm": 0.23828125,
"learning_rate": 0.00016058750400669178,
"loss": 0.0368131011724472,
"loss_d0": 0.03710653893649578,
"step": 4380
},
{
"epoch": 439.0,
"grad_norm": 0.423828125,
"learning_rate": 0.0001579588648463657,
"loss": 0.036599275469779965,
"loss_d0": 0.03693968802690506,
"step": 4390
},
{
"epoch": 440.0,
"grad_norm": 0.28125,
"learning_rate": 0.00015527179519672117,
"loss": 0.036560848355293274,
"loss_d0": 0.03687223196029663,
"step": 4400
},
{
"epoch": 441.0,
"grad_norm": 0.28125,
"learning_rate": 0.00015252956883821488,
"loss": 0.03625948429107666,
"loss_d0": 0.03659016117453575,
"step": 4410
},
{
"epoch": 442.0,
"grad_norm": 0.298828125,
"learning_rate": 0.00014973552675125708,
"loss": 0.036302709579467775,
"loss_d0": 0.03660444766283035,
"step": 4420
},
{
"epoch": 443.0,
"grad_norm": 0.275390625,
"learning_rate": 0.00014689307304574154,
"loss": 0.03645941019058228,
"loss_d0": 0.036814498528838155,
"step": 4430
},
{
"epoch": 444.0,
"grad_norm": 0.318359375,
"learning_rate": 0.00014400567081366205,
"loss": 0.03634356260299683,
"loss_d0": 0.03664385080337525,
"step": 4440
},
{
"epoch": 445.0,
"grad_norm": 0.296875,
"learning_rate": 0.00014107683790986813,
"loss": 0.03630726635456085,
"loss_d0": 0.03658915832638741,
"step": 4450
},
{
"epoch": 446.0,
"grad_norm": 0.283203125,
"learning_rate": 0.00013811014266610096,
"loss": 0.036189505457878114,
"loss_d0": 0.03651743419468403,
"step": 4460
},
{
"epoch": 447.0,
"grad_norm": 0.341796875,
"learning_rate": 0.00013510919954353066,
"loss": 0.03628252744674683,
"loss_d0": 0.03659649156033993,
"step": 4470
},
{
"epoch": 448.0,
"grad_norm": 0.2392578125,
"learning_rate": 0.00013207766472909225,
"loss": 0.03624842762947082,
"loss_d0": 0.0365591075271368,
"step": 4480
},
{
"epoch": 449.0,
"grad_norm": 0.2578125,
"learning_rate": 0.000129019231680985,
"loss": 0.03611701428890228,
"loss_d0": 0.03644072562456131,
"step": 4490
},
{
"epoch": 450.0,
"grad_norm": 0.3046875,
"learning_rate": 0.0001259376266287625,
"loss": 0.036150026321411136,
"loss_d0": 0.0364865392446518,
"step": 4500
},
{
"epoch": 450.0,
"eval_loss": 11.761486053466797,
"eval_runtime": 0.6889,
"eval_samples_per_second": 725.846,
"eval_steps_per_second": 72.585,
"step": 4500
},
{
"epoch": 451.0,
"grad_norm": 0.2734375,
"learning_rate": 0.00012283660403349607,
"loss": 0.036095789074897765,
"loss_d0": 0.03643478117883205,
"step": 4510
},
{
"epoch": 452.0,
"grad_norm": 0.2392578125,
"learning_rate": 0.00011971994201354204,
"loss": 0.03615381121635437,
"loss_d0": 0.036472433060407636,
"step": 4520
},
{
"epoch": 453.0,
"grad_norm": 0.267578125,
"learning_rate": 0.00011659143774148684,
"loss": 0.03610163033008575,
"loss_d0": 0.036404192447662354,
"step": 4530
},
{
"epoch": 454.0,
"grad_norm": 0.328125,
"learning_rate": 0.0001134549028178768,
"loss": 0.03613078892230988,
"loss_d0": 0.036461538076400755,
"step": 4540
},
{
"epoch": 455.0,
"grad_norm": 0.2333984375,
"learning_rate": 0.00011031415862737014,
"loss": 0.03611861169338226,
"loss_d0": 0.03640886433422565,
"step": 4550
},
{
"epoch": 456.0,
"grad_norm": 0.2470703125,
"learning_rate": 0.00010717303168296846,
"loss": 0.03604468107223511,
"loss_d0": 0.03640021868050099,
"step": 4560
},
{
"epoch": 457.0,
"grad_norm": 0.302734375,
"learning_rate": 0.000104035348964,
"loss": 0.036168360710144044,
"loss_d0": 0.036474670842289926,
"step": 4570
},
{
"epoch": 458.0,
"grad_norm": 0.2392578125,
"learning_rate": 0.00010090493325353484,
"loss": 0.03600202202796936,
"loss_d0": 0.03632246777415275,
"step": 4580
},
{
"epoch": 459.0,
"grad_norm": 0.3671875,
"learning_rate": 9.778559848091261e-05,
"loss": 0.03613144755363464,
"loss_d0": 0.03646283820271492,
"step": 4590
},
{
"epoch": 460.0,
"grad_norm": 0.2734375,
"learning_rate": 9.468114507505707e-05,
"loss": 0.03605700135231018,
"loss_d0": 0.03638906553387642,
"step": 4600
},
{
"epoch": 461.0,
"grad_norm": 0.251953125,
"learning_rate": 9.15953553342389e-05,
"loss": 0.035967972874641416,
"loss_d0": 0.036280662193894385,
"step": 4610
},
{
"epoch": 462.0,
"grad_norm": 0.283203125,
"learning_rate": 8.853198881792772e-05,
"loss": 0.03607074022293091,
"loss_d0": 0.036401886865496634,
"step": 4620
},
{
"epoch": 463.0,
"grad_norm": 0.2392578125,
"learning_rate": 8.549477776634832e-05,
"loss": 0.0359768807888031,
"loss_d0": 0.0362836092710495,
"step": 4630
},
{
"epoch": 464.0,
"grad_norm": 0.314453125,
"learning_rate": 8.24874225533205e-05,
"loss": 0.03588842451572418,
"loss_d0": 0.03622284643352032,
"step": 4640
},
{
"epoch": 465.0,
"grad_norm": 0.34375,
"learning_rate": 7.951358717792378e-05,
"loss": 0.03593695759773254,
"loss_d0": 0.036245567724108696,
"step": 4650
},
{
"epoch": 466.0,
"grad_norm": 0.2578125,
"learning_rate": 7.657689480047888e-05,
"loss": 0.03589689433574676,
"loss_d0": 0.03622194863855839,
"step": 4660
},
{
"epoch": 467.0,
"grad_norm": 0.28125,
"learning_rate": 7.368092332828491e-05,
"loss": 0.03584821224212646,
"loss_d0": 0.03617323003709316,
"step": 4670
},
{
"epoch": 468.0,
"grad_norm": 0.33984375,
"learning_rate": 7.082920105649054e-05,
"loss": 0.03588172793388367,
"loss_d0": 0.03619707673788071,
"step": 4680
},
{
"epoch": 469.0,
"grad_norm": 0.255859375,
"learning_rate": 6.80252023694098e-05,
"loss": 0.03584883213043213,
"loss_d0": 0.03617900386452675,
"step": 4690
},
{
"epoch": 470.0,
"grad_norm": 0.3359375,
"learning_rate": 6.527234350752003e-05,
"loss": 0.035852047801017764,
"loss_d0": 0.0361775953322649,
"step": 4700
},
{
"epoch": 471.0,
"grad_norm": 0.275390625,
"learning_rate": 6.257397840529903e-05,
"loss": 0.03582252562046051,
"loss_d0": 0.03615486063063145,
"step": 4710
},
{
"epoch": 472.0,
"grad_norm": 0.2734375,
"learning_rate": 5.993339460497257e-05,
"loss": 0.03581757247447968,
"loss_d0": 0.036142122372984885,
"step": 4720
},
{
"epoch": 473.0,
"grad_norm": 0.259765625,
"learning_rate": 5.7353809251150606e-05,
"loss": 0.0358079195022583,
"loss_d0": 0.036134665831923485,
"step": 4730
},
{
"epoch": 474.0,
"grad_norm": 0.2373046875,
"learning_rate": 5.483836517123214e-05,
"loss": 0.035815265774726865,
"loss_d0": 0.036152683570981024,
"step": 4740
},
{
"epoch": 475.0,
"grad_norm": 0.322265625,
"learning_rate": 5.239012704635402e-05,
"loss": 0.03577219545841217,
"loss_d0": 0.036099201813340184,
"step": 4750
},
{
"epoch": 476.0,
"grad_norm": 0.271484375,
"learning_rate": 5.0012077677549283e-05,
"loss": 0.03577747642993927,
"loss_d0": 0.03610123656690121,
"step": 4760
},
{
"epoch": 477.0,
"grad_norm": 0.23828125,
"learning_rate": 4.77071143516634e-05,
"loss": 0.03580273985862732,
"loss_d0": 0.03613555021584034,
"step": 4770
},
{
"epoch": 478.0,
"grad_norm": 0.27734375,
"learning_rate": 4.547804531145656e-05,
"loss": 0.035796952247619626,
"loss_d0": 0.036111927777528766,
"step": 4780
},
{
"epoch": 479.0,
"grad_norm": 0.35546875,
"learning_rate": 4.332758633419252e-05,
"loss": 0.035767361521720886,
"loss_d0": 0.03609406426548958,
"step": 4790
},
{
"epoch": 480.0,
"grad_norm": 0.265625,
"learning_rate": 4.12583574228822e-05,
"loss": 0.03574168682098389,
"loss_d0": 0.03606498539447785,
"step": 4800
},
{
"epoch": 481.0,
"grad_norm": 0.259765625,
"learning_rate": 3.927287961421382e-05,
"loss": 0.035773900151252744,
"loss_d0": 0.03608821220695972,
"step": 4810
},
{
"epoch": 482.0,
"grad_norm": 0.275390625,
"learning_rate": 3.737357190705782e-05,
"loss": 0.03574726283550263,
"loss_d0": 0.03607319518923759,
"step": 4820
},
{
"epoch": 483.0,
"grad_norm": 0.287109375,
"learning_rate": 3.556274831528945e-05,
"loss": 0.03574813306331635,
"loss_d0": 0.0360788069665432,
"step": 4830
},
{
"epoch": 484.0,
"grad_norm": 0.26953125,
"learning_rate": 3.3842615048519255e-05,
"loss": 0.03571727573871612,
"loss_d0": 0.03603735640645027,
"step": 4840
},
{
"epoch": 485.0,
"grad_norm": 0.263671875,
"learning_rate": 3.221526782416659e-05,
"loss": 0.035741984844207764,
"loss_d0": 0.0360604640096426,
"step": 4850
},
{
"epoch": 486.0,
"grad_norm": 0.30859375,
"learning_rate": 3.068268931415069e-05,
"loss": 0.035722389817237854,
"loss_d0": 0.03604618720710277,
"step": 4860
},
{
"epoch": 487.0,
"grad_norm": 0.259765625,
"learning_rate": 2.9246746729310446e-05,
"loss": 0.03571443259716034,
"loss_d0": 0.03603012822568417,
"step": 4870
},
{
"epoch": 488.0,
"grad_norm": 0.2275390625,
"learning_rate": 2.7909189544495435e-05,
"loss": 0.03573389947414398,
"loss_d0": 0.036041321232914925,
"step": 4880
},
{
"epoch": 489.0,
"grad_norm": 0.240234375,
"learning_rate": 2.6671647367100477e-05,
"loss": 0.035701331496238706,
"loss_d0": 0.03603383935987949,
"step": 4890
},
{
"epoch": 490.0,
"grad_norm": 0.328125,
"learning_rate": 2.553562795163998e-05,
"loss": 0.035741209983825684,
"loss_d0": 0.0360419649630785,
"step": 4900
},
{
"epoch": 491.0,
"grad_norm": 0.30078125,
"learning_rate": 2.450251536278129e-05,
"loss": 0.035731592774391176,
"loss_d0": 0.0360304169356823,
"step": 4910
},
{
"epoch": 492.0,
"grad_norm": 0.244140625,
"learning_rate": 2.3573568289075136e-05,
"loss": 0.03570793569087982,
"loss_d0": 0.036030732467770575,
"step": 4920
},
{
"epoch": 493.0,
"grad_norm": 0.302734375,
"learning_rate": 2.2749918509437493e-05,
"loss": 0.03569709360599518,
"loss_d0": 0.03602620549499989,
"step": 4930
},
{
"epoch": 494.0,
"grad_norm": 0.2294921875,
"learning_rate": 2.2032569514251373e-05,
"loss": 0.03570819199085236,
"loss_d0": 0.03603471517562866,
"step": 4940
},
{
"epoch": 495.0,
"grad_norm": 0.294921875,
"learning_rate": 2.1422395282768234e-05,
"loss": 0.035699674487113954,
"loss_d0": 0.03603287264704704,
"step": 4950
},
{
"epoch": 496.0,
"grad_norm": 0.32421875,
"learning_rate": 2.092013921829899e-05,
"loss": 0.03576536178588867,
"loss_d0": 0.03607108183205128,
"step": 4960
},
{
"epoch": 497.0,
"grad_norm": 0.232421875,
"learning_rate": 2.0526413242491617e-05,
"loss": 0.035713717341423035,
"loss_d0": 0.03603534735739231,
"step": 4970
},
{
"epoch": 498.0,
"grad_norm": 0.337890625,
"learning_rate": 2.0241697049798773e-05,
"loss": 0.03570127785205841,
"loss_d0": 0.03601216375827789,
"step": 4980
},
{
"epoch": 499.0,
"grad_norm": 0.2421875,
"learning_rate": 2.0066337523044098e-05,
"loss": 0.03573695719242096,
"loss_d0": 0.03605118878185749,
"step": 4990
},
{
"epoch": 500.0,
"grad_norm": 0.255859375,
"learning_rate": 2.0000548310798866e-05,
"loss": 0.03572871088981629,
"loss_d0": 0.03601981587707996,
"step": 5000
},
{
"epoch": 500.0,
"eval_loss": 12.606892585754395,
"eval_runtime": 0.6871,
"eval_samples_per_second": 727.659,
"eval_steps_per_second": 72.766,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 500,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.980892276588544e+16,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}