Files
Qwen3-1.7B-msmarco-text-100…/trainer_state.json
ModelHub XC 464b5686b9 初始化项目,由ModelHub XC社区提供模型
Model: Abner0803/Qwen3-1.7B-msmarco-text-100k-with_pseudo_queries
Source: Original Platform
2026-05-14 09:11:05 +08:00

5035 lines
152 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.727387276450121,
"eval_steps": 500,
"global_step": 25000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011454917739371984,
"grad_norm": 53.10772705078125,
"learning_rate": 1.1228230980751604e-06,
"loss": 5.3499,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 50
},
{
"epoch": 0.022909835478743968,
"grad_norm": 33.95736312866211,
"learning_rate": 2.268560953253896e-06,
"loss": 5.0386,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 100
},
{
"epoch": 0.034364753218115954,
"grad_norm": 16.349882125854492,
"learning_rate": 3.414298808432631e-06,
"loss": 3.9819,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 150
},
{
"epoch": 0.045819670957487936,
"grad_norm": 15.363502502441406,
"learning_rate": 4.5600366636113664e-06,
"loss": 3.164,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 200
},
{
"epoch": 0.05727458869685992,
"grad_norm": 17.262718200683594,
"learning_rate": 5.705774518790101e-06,
"loss": 2.8121,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 250
},
{
"epoch": 0.06872950643623191,
"grad_norm": 14.994147300720215,
"learning_rate": 6.8515123739688366e-06,
"loss": 2.4217,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 300
},
{
"epoch": 0.08018442417560388,
"grad_norm": 11.715180397033691,
"learning_rate": 7.997250229147571e-06,
"loss": 2.1894,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 350
},
{
"epoch": 0.09163934191497587,
"grad_norm": 10.08484172821045,
"learning_rate": 9.142988084326307e-06,
"loss": 2.1338,
"memory/device_mem_reserved(gib)": 49.37,
"memory/max_mem_active(gib)": 44.85,
"memory/max_mem_allocated(gib)": 44.85,
"step": 400
},
{
"epoch": 0.10309425965434786,
"grad_norm": 9.557114601135254,
"learning_rate": 1.0288725939505042e-05,
"loss": 2.0711,
"memory/device_mem_reserved(gib)": 51.36,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 450
},
{
"epoch": 0.11454917739371984,
"grad_norm": 9.078670501708984,
"learning_rate": 1.1434463794683776e-05,
"loss": 2.0381,
"memory/device_mem_reserved(gib)": 51.36,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 500
},
{
"epoch": 0.12600409513309183,
"grad_norm": 9.677817344665527,
"learning_rate": 1.2580201649862511e-05,
"loss": 2.0244,
"memory/device_mem_reserved(gib)": 51.36,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 550
},
{
"epoch": 0.13745901287246381,
"grad_norm": 10.270977973937988,
"learning_rate": 1.3725939505041247e-05,
"loss": 1.9982,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 600
},
{
"epoch": 0.1489139306118358,
"grad_norm": 8.053842544555664,
"learning_rate": 1.4871677360219982e-05,
"loss": 2.0071,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 650
},
{
"epoch": 0.16036884835120777,
"grad_norm": 8.858375549316406,
"learning_rate": 1.6017415215398718e-05,
"loss": 1.9546,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 700
},
{
"epoch": 0.17182376609057975,
"grad_norm": 9.538141250610352,
"learning_rate": 1.7163153070577455e-05,
"loss": 1.9464,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 750
},
{
"epoch": 0.18327868382995174,
"grad_norm": 7.541695594787598,
"learning_rate": 1.830889092575619e-05,
"loss": 1.9085,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 800
},
{
"epoch": 0.19473360156932373,
"grad_norm": 7.665754318237305,
"learning_rate": 1.9454628780934923e-05,
"loss": 1.9153,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 850
},
{
"epoch": 0.20618851930869572,
"grad_norm": 9.04691219329834,
"learning_rate": 2.0600366636113656e-05,
"loss": 1.8734,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 900
},
{
"epoch": 0.2176434370480677,
"grad_norm": 7.098514080047607,
"learning_rate": 2.1746104491292394e-05,
"loss": 1.8809,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 950
},
{
"epoch": 0.22909835478743967,
"grad_norm": 7.5708513259887695,
"learning_rate": 2.2891842346471127e-05,
"loss": 1.8459,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1000
},
{
"epoch": 0.24055327252681166,
"grad_norm": 8.422965049743652,
"learning_rate": 2.4037580201649865e-05,
"loss": 1.8414,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1050
},
{
"epoch": 0.25200819026618365,
"grad_norm": 7.765232563018799,
"learning_rate": 2.51833180568286e-05,
"loss": 1.857,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1100
},
{
"epoch": 0.2634631080055556,
"grad_norm": 7.53985595703125,
"learning_rate": 2.6329055912007332e-05,
"loss": 1.8113,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1150
},
{
"epoch": 0.27491802574492763,
"grad_norm": 7.5806450843811035,
"learning_rate": 2.747479376718607e-05,
"loss": 1.793,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1200
},
{
"epoch": 0.2863729434842996,
"grad_norm": 6.706181526184082,
"learning_rate": 2.8620531622364803e-05,
"loss": 1.8109,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1250
},
{
"epoch": 0.2978278612236716,
"grad_norm": 7.132224082946777,
"learning_rate": 2.976626947754354e-05,
"loss": 1.7923,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1300
},
{
"epoch": 0.30928277896304357,
"grad_norm": 7.725433826446533,
"learning_rate": 3.091200733272228e-05,
"loss": 1.7504,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1350
},
{
"epoch": 0.32073769670241553,
"grad_norm": 7.6306843757629395,
"learning_rate": 3.205774518790101e-05,
"loss": 1.7802,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1400
},
{
"epoch": 0.33219261444178755,
"grad_norm": 7.927916049957275,
"learning_rate": 3.3203483043079745e-05,
"loss": 1.7454,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1450
},
{
"epoch": 0.3436475321811595,
"grad_norm": 7.468013286590576,
"learning_rate": 3.434922089825848e-05,
"loss": 1.716,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1500
},
{
"epoch": 0.3551024499205315,
"grad_norm": 6.887967586517334,
"learning_rate": 3.549495875343721e-05,
"loss": 1.7054,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1550
},
{
"epoch": 0.3665573676599035,
"grad_norm": 7.042320251464844,
"learning_rate": 3.6640696608615946e-05,
"loss": 1.716,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1600
},
{
"epoch": 0.37801228539927545,
"grad_norm": 7.46671199798584,
"learning_rate": 3.778643446379469e-05,
"loss": 1.7074,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1650
},
{
"epoch": 0.38946720313864747,
"grad_norm": 4.348405838012695,
"learning_rate": 3.893217231897342e-05,
"loss": 1.6607,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1700
},
{
"epoch": 0.40092212087801943,
"grad_norm": 7.3193511962890625,
"learning_rate": 4.0077910174152155e-05,
"loss": 1.6516,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1750
},
{
"epoch": 0.41237703861739144,
"grad_norm": 7.363260746002197,
"learning_rate": 4.122364802933089e-05,
"loss": 1.6137,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1800
},
{
"epoch": 0.4238319563567634,
"grad_norm": 7.189822673797607,
"learning_rate": 4.236938588450963e-05,
"loss": 1.5882,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1850
},
{
"epoch": 0.4352868740961354,
"grad_norm": 7.271198272705078,
"learning_rate": 4.351512373968836e-05,
"loss": 1.5759,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1900
},
{
"epoch": 0.4467417918355074,
"grad_norm": 10.216059684753418,
"learning_rate": 4.4660861594867096e-05,
"loss": 1.5663,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 1950
},
{
"epoch": 0.45819670957487935,
"grad_norm": 6.804873943328857,
"learning_rate": 4.580659945004584e-05,
"loss": 1.5447,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2000
},
{
"epoch": 0.46965162731425136,
"grad_norm": 7.637989044189453,
"learning_rate": 4.695233730522457e-05,
"loss": 1.553,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2050
},
{
"epoch": 0.4811065450536233,
"grad_norm": 6.641468048095703,
"learning_rate": 4.80980751604033e-05,
"loss": 1.5486,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2100
},
{
"epoch": 0.49256146279299534,
"grad_norm": 7.134258270263672,
"learning_rate": 4.924381301558204e-05,
"loss": 1.5126,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2150
},
{
"epoch": 0.5040163805323673,
"grad_norm": 6.905734062194824,
"learning_rate": 5.038955087076077e-05,
"loss": 1.4918,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2200
},
{
"epoch": 0.5154712982717393,
"grad_norm": 7.143308162689209,
"learning_rate": 5.153528872593951e-05,
"loss": 1.4778,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2250
},
{
"epoch": 0.5269262160111112,
"grad_norm": 6.968287467956543,
"learning_rate": 5.268102658111824e-05,
"loss": 1.431,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2300
},
{
"epoch": 0.5383811337504832,
"grad_norm": 7.385350704193115,
"learning_rate": 5.3826764436296974e-05,
"loss": 1.4638,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2350
},
{
"epoch": 0.5498360514898553,
"grad_norm": 6.7367095947265625,
"learning_rate": 5.4972502291475714e-05,
"loss": 1.4236,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2400
},
{
"epoch": 0.5612909692292273,
"grad_norm": 7.013253211975098,
"learning_rate": 5.611824014665444e-05,
"loss": 1.3933,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2450
},
{
"epoch": 0.5727458869685992,
"grad_norm": 7.49541711807251,
"learning_rate": 5.726397800183319e-05,
"loss": 1.3847,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2500
},
{
"epoch": 0.5842008047079712,
"grad_norm": 7.078319549560547,
"learning_rate": 5.8409715857011915e-05,
"loss": 1.3825,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2550
},
{
"epoch": 0.5956557224473432,
"grad_norm": 7.429485321044922,
"learning_rate": 5.9555453712190656e-05,
"loss": 1.3629,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2600
},
{
"epoch": 0.6071106401867151,
"grad_norm": 7.05700159072876,
"learning_rate": 6.070119156736939e-05,
"loss": 1.3372,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2650
},
{
"epoch": 0.6185655579260871,
"grad_norm": 7.29513692855835,
"learning_rate": 6.184692942254812e-05,
"loss": 1.3185,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2700
},
{
"epoch": 0.6300204756654592,
"grad_norm": 6.8477911949157715,
"learning_rate": 6.299266727772686e-05,
"loss": 1.3066,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2750
},
{
"epoch": 0.6414753934048311,
"grad_norm": 7.389026641845703,
"learning_rate": 6.41384051329056e-05,
"loss": 1.2829,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2800
},
{
"epoch": 0.6529303111442031,
"grad_norm": 6.852631568908691,
"learning_rate": 6.528414298808432e-05,
"loss": 1.274,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2850
},
{
"epoch": 0.6643852288835751,
"grad_norm": 7.158923625946045,
"learning_rate": 6.642988084326306e-05,
"loss": 1.2486,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2900
},
{
"epoch": 0.6758401466229471,
"grad_norm": 7.069329261779785,
"learning_rate": 6.75756186984418e-05,
"loss": 1.2517,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 2950
},
{
"epoch": 0.687295064362319,
"grad_norm": 6.942631721496582,
"learning_rate": 6.872135655362053e-05,
"loss": 1.1829,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 3000
},
{
"epoch": 0.698749982101691,
"grad_norm": 7.831090450286865,
"learning_rate": 6.986709440879927e-05,
"loss": 1.2037,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 3050
},
{
"epoch": 0.710204899841063,
"grad_norm": 6.641531467437744,
"learning_rate": 7.101283226397801e-05,
"loss": 1.1565,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 3100
},
{
"epoch": 0.721659817580435,
"grad_norm": 7.846933841705322,
"learning_rate": 7.215857011915674e-05,
"loss": 1.1647,
"memory/device_mem_reserved(gib)": 51.42,
"memory/max_mem_active(gib)": 46.59,
"memory/max_mem_allocated(gib)": 46.59,
"step": 3150
},
{
"epoch": 0.733114735319807,
"grad_norm": 6.905858039855957,
"learning_rate": 7.330430797433548e-05,
"loss": 1.1853,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3200
},
{
"epoch": 0.744569653059179,
"grad_norm": 7.997142314910889,
"learning_rate": 7.445004582951421e-05,
"loss": 1.1541,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3250
},
{
"epoch": 0.7560245707985509,
"grad_norm": 6.95665979385376,
"learning_rate": 7.559578368469294e-05,
"loss": 1.1223,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3300
},
{
"epoch": 0.7674794885379229,
"grad_norm": 7.185131549835205,
"learning_rate": 7.674152153987169e-05,
"loss": 1.1113,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3350
},
{
"epoch": 0.7789344062772949,
"grad_norm": 6.778895854949951,
"learning_rate": 7.788725939505041e-05,
"loss": 1.0765,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3400
},
{
"epoch": 0.790389324016667,
"grad_norm": 7.30415153503418,
"learning_rate": 7.903299725022914e-05,
"loss": 1.0601,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3450
},
{
"epoch": 0.8018442417560389,
"grad_norm": 6.911710739135742,
"learning_rate": 8.017873510540789e-05,
"loss": 1.0406,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3500
},
{
"epoch": 0.8132991594954109,
"grad_norm": 7.257194995880127,
"learning_rate": 8.132447296058661e-05,
"loss": 1.0284,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3550
},
{
"epoch": 0.8247540772347829,
"grad_norm": 8.09947395324707,
"learning_rate": 8.247021081576536e-05,
"loss": 1.0178,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3600
},
{
"epoch": 0.8362089949741548,
"grad_norm": 7.630951404571533,
"learning_rate": 8.361594867094409e-05,
"loss": 1.0031,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3650
},
{
"epoch": 0.8476639127135268,
"grad_norm": 7.508652210235596,
"learning_rate": 8.476168652612283e-05,
"loss": 0.9628,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3700
},
{
"epoch": 0.8591188304528988,
"grad_norm": 8.247767448425293,
"learning_rate": 8.590742438130156e-05,
"loss": 0.9669,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3750
},
{
"epoch": 0.8705737481922708,
"grad_norm": 7.914950370788574,
"learning_rate": 8.705316223648031e-05,
"loss": 0.9656,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3800
},
{
"epoch": 0.8820286659316428,
"grad_norm": 7.725244045257568,
"learning_rate": 8.819890009165903e-05,
"loss": 0.9541,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3850
},
{
"epoch": 0.8934835836710148,
"grad_norm": 6.968287467956543,
"learning_rate": 8.934463794683778e-05,
"loss": 0.9421,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3900
},
{
"epoch": 0.9049385014103868,
"grad_norm": 6.712941646575928,
"learning_rate": 9.049037580201651e-05,
"loss": 0.921,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 3950
},
{
"epoch": 0.9163934191497587,
"grad_norm": 6.738905429840088,
"learning_rate": 9.163611365719523e-05,
"loss": 0.9133,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4000
},
{
"epoch": 0.9278483368891307,
"grad_norm": 8.376337051391602,
"learning_rate": 9.278185151237398e-05,
"loss": 0.9016,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4050
},
{
"epoch": 0.9393032546285027,
"grad_norm": 7.274137020111084,
"learning_rate": 9.392758936755271e-05,
"loss": 0.8829,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4100
},
{
"epoch": 0.9507581723678746,
"grad_norm": 7.919043064117432,
"learning_rate": 9.507332722273144e-05,
"loss": 0.8555,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4150
},
{
"epoch": 0.9622130901072466,
"grad_norm": 6.632596015930176,
"learning_rate": 9.621906507791018e-05,
"loss": 0.8945,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4200
},
{
"epoch": 0.9736680078466187,
"grad_norm": 7.122948169708252,
"learning_rate": 9.736480293308891e-05,
"loss": 0.8447,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4250
},
{
"epoch": 0.9851229255859907,
"grad_norm": 6.747700214385986,
"learning_rate": 9.851054078826765e-05,
"loss": 0.8283,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4300
},
{
"epoch": 0.9965778433253626,
"grad_norm": 6.4440765380859375,
"learning_rate": 9.965627864344639e-05,
"loss": 0.8052,
"memory/device_mem_reserved(gib)": 53.21,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4350
},
{
"epoch": 1.0080184424175604,
"grad_norm": 6.0668182373046875,
"learning_rate": 9.99998041506907e-05,
"loss": 0.7038,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4400
},
{
"epoch": 1.0194733601569324,
"grad_norm": 8.027034759521484,
"learning_rate": 9.999884489246108e-05,
"loss": 0.6596,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4450
},
{
"epoch": 1.0309282778963043,
"grad_norm": 6.251341819763184,
"learning_rate": 9.999708626830618e-05,
"loss": 0.6702,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4500
},
{
"epoch": 1.0423831956356764,
"grad_norm": 6.4562506675720215,
"learning_rate": 9.999452830634232e-05,
"loss": 0.6421,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4550
},
{
"epoch": 1.0538381133750483,
"grad_norm": 6.475341796875,
"learning_rate": 9.999117104746543e-05,
"loss": 0.6355,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4600
},
{
"epoch": 1.0652930311144204,
"grad_norm": 6.397785663604736,
"learning_rate": 9.998701454535029e-05,
"loss": 0.638,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4650
},
{
"epoch": 1.0767479488537923,
"grad_norm": 6.843462944030762,
"learning_rate": 9.998205886644977e-05,
"loss": 0.6332,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4700
},
{
"epoch": 1.0882028665931642,
"grad_norm": 6.432698726654053,
"learning_rate": 9.997630408999371e-05,
"loss": 0.6187,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4750
},
{
"epoch": 1.0996577843325364,
"grad_norm": 7.654291152954102,
"learning_rate": 9.996975030798767e-05,
"loss": 0.6118,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4800
},
{
"epoch": 1.1111127020719083,
"grad_norm": 5.9475812911987305,
"learning_rate": 9.996239762521151e-05,
"loss": 0.6068,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4850
},
{
"epoch": 1.1225676198112802,
"grad_norm": 7.744262218475342,
"learning_rate": 9.995424615921757e-05,
"loss": 0.6021,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4900
},
{
"epoch": 1.1340225375506523,
"grad_norm": 6.4447197914123535,
"learning_rate": 9.9945296040329e-05,
"loss": 0.6119,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 4950
},
{
"epoch": 1.1454774552900242,
"grad_norm": 6.5645432472229,
"learning_rate": 9.993554741163749e-05,
"loss": 0.5836,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5000
},
{
"epoch": 1.156932373029396,
"grad_norm": 6.169116020202637,
"learning_rate": 9.992500042900104e-05,
"loss": 0.585,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5050
},
{
"epoch": 1.1683872907687682,
"grad_norm": 8.242532730102539,
"learning_rate": 9.991365526104154e-05,
"loss": 0.5657,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5100
},
{
"epoch": 1.1798422085081401,
"grad_norm": 7.146617412567139,
"learning_rate": 9.990151208914202e-05,
"loss": 0.5808,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5150
},
{
"epoch": 1.191297126247512,
"grad_norm": 6.222508430480957,
"learning_rate": 9.988857110744367e-05,
"loss": 0.554,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5200
},
{
"epoch": 1.2027520439868842,
"grad_norm": 6.2183146476745605,
"learning_rate": 9.987483252284291e-05,
"loss": 0.549,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5250
},
{
"epoch": 1.214206961726256,
"grad_norm": 6.201925754547119,
"learning_rate": 9.986029655498792e-05,
"loss": 0.5595,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5300
},
{
"epoch": 1.225661879465628,
"grad_norm": 6.70211124420166,
"learning_rate": 9.984496343627523e-05,
"loss": 0.557,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5350
},
{
"epoch": 1.2371167972050001,
"grad_norm": 6.915555477142334,
"learning_rate": 9.982883341184593e-05,
"loss": 0.5267,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5400
},
{
"epoch": 1.248571714944372,
"grad_norm": 6.054184913635254,
"learning_rate": 9.981190673958185e-05,
"loss": 0.5359,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5450
},
{
"epoch": 1.260026632683744,
"grad_norm": 6.70853328704834,
"learning_rate": 9.979418369010131e-05,
"loss": 0.5326,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5500
},
{
"epoch": 1.271481550423116,
"grad_norm": 6.052192211151123,
"learning_rate": 9.977566454675492e-05,
"loss": 0.5156,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5550
},
{
"epoch": 1.282936468162488,
"grad_norm": 6.540433406829834,
"learning_rate": 9.975634960562094e-05,
"loss": 0.5274,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5600
},
{
"epoch": 1.2943913859018599,
"grad_norm": 5.683777332305908,
"learning_rate": 9.973623917550065e-05,
"loss": 0.5169,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5650
},
{
"epoch": 1.305846303641232,
"grad_norm": 5.470891952514648,
"learning_rate": 9.97153335779133e-05,
"loss": 0.5018,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5700
},
{
"epoch": 1.317301221380604,
"grad_norm": 4.297957897186279,
"learning_rate": 9.969363314709107e-05,
"loss": 0.4915,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5750
},
{
"epoch": 1.3287561391199758,
"grad_norm": 6.072817325592041,
"learning_rate": 9.967113822997367e-05,
"loss": 0.4886,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5800
},
{
"epoch": 1.340211056859348,
"grad_norm": 5.685266017913818,
"learning_rate": 9.964784918620282e-05,
"loss": 0.4925,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5850
},
{
"epoch": 1.3516659745987198,
"grad_norm": 7.324371337890625,
"learning_rate": 9.962376638811648e-05,
"loss": 0.4557,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5900
},
{
"epoch": 1.363120892338092,
"grad_norm": 5.497219085693359,
"learning_rate": 9.959889022074291e-05,
"loss": 0.4731,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 5950
},
{
"epoch": 1.3745758100774639,
"grad_norm": 5.637268543243408,
"learning_rate": 9.95732210817945e-05,
"loss": 0.4653,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6000
},
{
"epoch": 1.3860307278168358,
"grad_norm": 5.0990705490112305,
"learning_rate": 9.954675938166145e-05,
"loss": 0.4563,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6050
},
{
"epoch": 1.397485645556208,
"grad_norm": 5.403497695922852,
"learning_rate": 9.951950554340515e-05,
"loss": 0.4427,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6100
},
{
"epoch": 1.4089405632955798,
"grad_norm": 5.238762378692627,
"learning_rate": 9.949146000275145e-05,
"loss": 0.4517,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6150
},
{
"epoch": 1.420395481034952,
"grad_norm": 3.6025900840759277,
"learning_rate": 9.946262320808371e-05,
"loss": 0.4287,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6200
},
{
"epoch": 1.4318503987743239,
"grad_norm": 4.929942607879639,
"learning_rate": 9.94329956204356e-05,
"loss": 0.4268,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6250
},
{
"epoch": 1.4433053165136958,
"grad_norm": 6.123436450958252,
"learning_rate": 9.940257771348375e-05,
"loss": 0.4254,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6300
},
{
"epoch": 1.4547602342530679,
"grad_norm": 6.038297653198242,
"learning_rate": 9.937136997354015e-05,
"loss": 0.4089,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6350
},
{
"epoch": 1.4662151519924398,
"grad_norm": 5.310572147369385,
"learning_rate": 9.93393728995444e-05,
"loss": 0.4142,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6400
},
{
"epoch": 1.4776700697318117,
"grad_norm": 5.765950679779053,
"learning_rate": 9.930658700305576e-05,
"loss": 0.4095,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6450
},
{
"epoch": 1.4891249874711838,
"grad_norm": 5.236095905303955,
"learning_rate": 9.927301280824489e-05,
"loss": 0.4068,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6500
},
{
"epoch": 1.5005799052105557,
"grad_norm": 5.353938102722168,
"learning_rate": 9.923865085188552e-05,
"loss": 0.4121,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6550
},
{
"epoch": 1.5120348229499276,
"grad_norm": 6.9634881019592285,
"learning_rate": 9.920350168334591e-05,
"loss": 0.393,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6600
},
{
"epoch": 1.5234897406892998,
"grad_norm": 4.650847911834717,
"learning_rate": 9.916756586457999e-05,
"loss": 0.385,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6650
},
{
"epoch": 1.5349446584286717,
"grad_norm": 5.3702311515808105,
"learning_rate": 9.91308439701184e-05,
"loss": 0.3965,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6700
},
{
"epoch": 1.5463995761680436,
"grad_norm": 5.833876132965088,
"learning_rate": 9.909333658705933e-05,
"loss": 0.3859,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6750
},
{
"epoch": 1.5578544939074157,
"grad_norm": 4.853622913360596,
"learning_rate": 9.905504431505912e-05,
"loss": 0.3788,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6800
},
{
"epoch": 1.5693094116467876,
"grad_norm": 4.673356056213379,
"learning_rate": 9.901596776632266e-05,
"loss": 0.3726,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6850
},
{
"epoch": 1.5807643293861595,
"grad_norm": 4.708242416381836,
"learning_rate": 9.897610756559361e-05,
"loss": 0.3624,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6900
},
{
"epoch": 1.5922192471255316,
"grad_norm": 5.4483962059021,
"learning_rate": 9.893546435014442e-05,
"loss": 0.371,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 6950
},
{
"epoch": 1.6036741648649036,
"grad_norm": 5.3623223304748535,
"learning_rate": 9.889403876976614e-05,
"loss": 0.3574,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7000
},
{
"epoch": 1.6151290826042755,
"grad_norm": 4.880136013031006,
"learning_rate": 9.8851831486758e-05,
"loss": 0.3654,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7050
},
{
"epoch": 1.6265840003436476,
"grad_norm": 4.732957363128662,
"learning_rate": 9.880884317591687e-05,
"loss": 0.3563,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7100
},
{
"epoch": 1.6380389180830195,
"grad_norm": 4.353087902069092,
"learning_rate": 9.876507452452646e-05,
"loss": 0.3523,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7150
},
{
"epoch": 1.6494938358223914,
"grad_norm": 5.005238056182861,
"learning_rate": 9.872052623234632e-05,
"loss": 0.3402,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7200
},
{
"epoch": 1.6609487535617635,
"grad_norm": 4.400302410125732,
"learning_rate": 9.867519901160059e-05,
"loss": 0.3522,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7250
},
{
"epoch": 1.6724036713011354,
"grad_norm": 5.095331192016602,
"learning_rate": 9.862909358696674e-05,
"loss": 0.3431,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7300
},
{
"epoch": 1.6838585890405073,
"grad_norm": 4.416248798370361,
"learning_rate": 9.858221069556395e-05,
"loss": 0.3373,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7350
},
{
"epoch": 1.6953135067798795,
"grad_norm": 4.021190166473389,
"learning_rate": 9.85345510869412e-05,
"loss": 0.3298,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7400
},
{
"epoch": 1.7067684245192514,
"grad_norm": 5.18602180480957,
"learning_rate": 9.848611552306548e-05,
"loss": 0.3405,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7450
},
{
"epoch": 1.7182233422586233,
"grad_norm": 4.7404608726501465,
"learning_rate": 9.843690477830945e-05,
"loss": 0.3278,
"memory/device_mem_reserved(gib)": 53.23,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7500
},
{
"epoch": 1.7296782599979954,
"grad_norm": 5.107292175292969,
"learning_rate": 9.838691963943912e-05,
"loss": 0.3351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7550
},
{
"epoch": 1.7411331777373675,
"grad_norm": 4.792062759399414,
"learning_rate": 9.83361609056013e-05,
"loss": 0.3212,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7600
},
{
"epoch": 1.7525880954767392,
"grad_norm": 5.694723606109619,
"learning_rate": 9.82846293883108e-05,
"loss": 0.3191,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7650
},
{
"epoch": 1.7640430132161113,
"grad_norm": 4.297928333282471,
"learning_rate": 9.823232591143741e-05,
"loss": 0.3096,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7700
},
{
"epoch": 1.7754979309554835,
"grad_norm": 4.557746887207031,
"learning_rate": 9.817925131119279e-05,
"loss": 0.3055,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7750
},
{
"epoch": 1.7869528486948552,
"grad_norm": 4.228251934051514,
"learning_rate": 9.81254064361171e-05,
"loss": 0.3149,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7800
},
{
"epoch": 1.7984077664342273,
"grad_norm": 4.910319805145264,
"learning_rate": 9.807079214706538e-05,
"loss": 0.3141,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7850
},
{
"epoch": 1.8098626841735994,
"grad_norm": 5.196345329284668,
"learning_rate": 9.801540931719384e-05,
"loss": 0.3035,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7900
},
{
"epoch": 1.8213176019129713,
"grad_norm": 4.111600875854492,
"learning_rate": 9.795925883194588e-05,
"loss": 0.3033,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 7950
},
{
"epoch": 1.8327725196523432,
"grad_norm": 4.21397590637207,
"learning_rate": 9.790234158903792e-05,
"loss": 0.3068,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8000
},
{
"epoch": 1.8442274373917154,
"grad_norm": 4.57835578918457,
"learning_rate": 9.784465849844511e-05,
"loss": 0.3,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8050
},
{
"epoch": 1.8556823551310873,
"grad_norm": 4.8795294761657715,
"learning_rate": 9.778621048238664e-05,
"loss": 0.2919,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8100
},
{
"epoch": 1.8671372728704592,
"grad_norm": 4.112079620361328,
"learning_rate": 9.77269984753112e-05,
"loss": 0.2866,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8150
},
{
"epoch": 1.8785921906098313,
"grad_norm": 5.471593856811523,
"learning_rate": 9.766702342388184e-05,
"loss": 0.2942,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8200
},
{
"epoch": 1.8900471083492032,
"grad_norm": 5.2102766036987305,
"learning_rate": 9.760628628696096e-05,
"loss": 0.2926,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8250
},
{
"epoch": 1.901502026088575,
"grad_norm": 4.992270469665527,
"learning_rate": 9.754478803559498e-05,
"loss": 0.2874,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8300
},
{
"epoch": 1.9129569438279472,
"grad_norm": 4.012945175170898,
"learning_rate": 9.748252965299872e-05,
"loss": 0.2774,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8350
},
{
"epoch": 1.9244118615673191,
"grad_norm": 4.634591102600098,
"learning_rate": 9.741951213453977e-05,
"loss": 0.2795,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8400
},
{
"epoch": 1.935866779306691,
"grad_norm": 4.384332656860352,
"learning_rate": 9.735573648772257e-05,
"loss": 0.2785,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8450
},
{
"epoch": 1.9473216970460632,
"grad_norm": 4.638082504272461,
"learning_rate": 9.72912037321722e-05,
"loss": 0.2803,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8500
},
{
"epoch": 1.958776614785435,
"grad_norm": 3.405381917953491,
"learning_rate": 9.722591489961827e-05,
"loss": 0.2729,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8550
},
{
"epoch": 1.970231532524807,
"grad_norm": 4.394991874694824,
"learning_rate": 9.715987103387823e-05,
"loss": 0.2751,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8600
},
{
"epoch": 1.9816864502641791,
"grad_norm": 5.380841255187988,
"learning_rate": 9.709307319084077e-05,
"loss": 0.2725,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8650
},
{
"epoch": 1.993141368003551,
"grad_norm": 3.7391974925994873,
"learning_rate": 9.702552243844899e-05,
"loss": 0.2659,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8700
},
{
"epoch": 2.0045819670957488,
"grad_norm": 3.6832714080810547,
"learning_rate": 9.69572198566832e-05,
"loss": 0.2254,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8750
},
{
"epoch": 2.016036884835121,
"grad_norm": 3.2387888431549072,
"learning_rate": 9.68881665375438e-05,
"loss": 0.1553,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8800
},
{
"epoch": 2.0274918025744926,
"grad_norm": 3.022691488265991,
"learning_rate": 9.681836358503367e-05,
"loss": 0.1662,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8850
},
{
"epoch": 2.0389467203138647,
"grad_norm": 3.7819292545318604,
"learning_rate": 9.674781211514063e-05,
"loss": 0.1651,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8900
},
{
"epoch": 2.050401638053237,
"grad_norm": 4.307174205780029,
"learning_rate": 9.667651325581955e-05,
"loss": 0.1595,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 8950
},
{
"epoch": 2.0618565557926085,
"grad_norm": 3.7441294193267822,
"learning_rate": 9.660446814697436e-05,
"loss": 0.1603,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9000
},
{
"epoch": 2.0733114735319806,
"grad_norm": 3.3949477672576904,
"learning_rate": 9.653167794043976e-05,
"loss": 0.1635,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9050
},
{
"epoch": 2.0847663912713528,
"grad_norm": 3.6564900875091553,
"learning_rate": 9.645814379996285e-05,
"loss": 0.1595,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9100
},
{
"epoch": 2.0962213090107245,
"grad_norm": 3.380403995513916,
"learning_rate": 9.638386690118452e-05,
"loss": 0.1552,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9150
},
{
"epoch": 2.1076762267500966,
"grad_norm": 3.9699547290802,
"learning_rate": 9.630884843162063e-05,
"loss": 0.1603,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9200
},
{
"epoch": 2.1191311444894687,
"grad_norm": 2.764639139175415,
"learning_rate": 9.623308959064306e-05,
"loss": 0.1587,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9250
},
{
"epoch": 2.130586062228841,
"grad_norm": 3.9039690494537354,
"learning_rate": 9.615659158946053e-05,
"loss": 0.1621,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9300
},
{
"epoch": 2.1420409799682125,
"grad_norm": 3.1429221630096436,
"learning_rate": 9.607935565109917e-05,
"loss": 0.161,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9350
},
{
"epoch": 2.1534958977075846,
"grad_norm": 3.3480520248413086,
"learning_rate": 9.600138301038311e-05,
"loss": 0.1645,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9400
},
{
"epoch": 2.1649508154469568,
"grad_norm": 3.3411660194396973,
"learning_rate": 9.592267491391452e-05,
"loss": 0.1637,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9450
},
{
"epoch": 2.1764057331863285,
"grad_norm": 3.773784637451172,
"learning_rate": 9.584323262005393e-05,
"loss": 0.1631,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9500
},
{
"epoch": 2.1878606509257006,
"grad_norm": 2.9222793579101562,
"learning_rate": 9.576305739889991e-05,
"loss": 0.1598,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9550
},
{
"epoch": 2.1993155686650727,
"grad_norm": 3.034086227416992,
"learning_rate": 9.568215053226888e-05,
"loss": 0.1602,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9600
},
{
"epoch": 2.2107704864044444,
"grad_norm": 4.284358501434326,
"learning_rate": 9.560051331367457e-05,
"loss": 0.1624,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9650
},
{
"epoch": 2.2222254041438165,
"grad_norm": 4.235621929168701,
"learning_rate": 9.551814704830734e-05,
"loss": 0.1593,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9700
},
{
"epoch": 2.2336803218831887,
"grad_norm": 3.487086057662964,
"learning_rate": 9.543505305301334e-05,
"loss": 0.155,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9750
},
{
"epoch": 2.2451352396225603,
"grad_norm": 3.9365508556365967,
"learning_rate": 9.535123265627343e-05,
"loss": 0.1608,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9800
},
{
"epoch": 2.2565901573619325,
"grad_norm": 4.065316200256348,
"learning_rate": 9.526668719818195e-05,
"loss": 0.1623,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9850
},
{
"epoch": 2.2680450751013046,
"grad_norm": 3.1943957805633545,
"learning_rate": 9.518141803042527e-05,
"loss": 0.1646,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9900
},
{
"epoch": 2.2794999928406763,
"grad_norm": 3.362541913986206,
"learning_rate": 9.509542651626027e-05,
"loss": 0.1591,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 9950
},
{
"epoch": 2.2909549105800484,
"grad_norm": 3.442073345184326,
"learning_rate": 9.500871403049239e-05,
"loss": 0.1604,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10000
},
{
"epoch": 2.3024098283194205,
"grad_norm": 3.4276912212371826,
"learning_rate": 9.492128195945383e-05,
"loss": 0.1571,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10050
},
{
"epoch": 2.313864746058792,
"grad_norm": 2.761948347091675,
"learning_rate": 9.483313170098121e-05,
"loss": 0.1535,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10100
},
{
"epoch": 2.3253196637981643,
"grad_norm": 3.1246402263641357,
"learning_rate": 9.474426466439337e-05,
"loss": 0.1579,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10150
},
{
"epoch": 2.3367745815375365,
"grad_norm": 3.328728437423706,
"learning_rate": 9.465468227046876e-05,
"loss": 0.1567,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10200
},
{
"epoch": 2.348229499276908,
"grad_norm": 4.195374965667725,
"learning_rate": 9.456438595142272e-05,
"loss": 0.1542,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10250
},
{
"epoch": 2.3596844170162803,
"grad_norm": 3.6173229217529297,
"learning_rate": 9.447337715088461e-05,
"loss": 0.1615,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10300
},
{
"epoch": 2.3711393347556524,
"grad_norm": 3.0115489959716797,
"learning_rate": 9.438165732387472e-05,
"loss": 0.1586,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10350
},
{
"epoch": 2.382594252495024,
"grad_norm": 4.064676284790039,
"learning_rate": 9.428922793678101e-05,
"loss": 0.1551,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10400
},
{
"epoch": 2.3940491702343962,
"grad_norm": 3.5949292182922363,
"learning_rate": 9.419609046733571e-05,
"loss": 0.1502,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10450
},
{
"epoch": 2.4055040879737684,
"grad_norm": 3.932413101196289,
"learning_rate": 9.410224640459156e-05,
"loss": 0.157,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10500
},
{
"epoch": 2.41695900571314,
"grad_norm": 3.8124208450317383,
"learning_rate": 9.400769724889817e-05,
"loss": 0.1495,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10550
},
{
"epoch": 2.428413923452512,
"grad_norm": 3.310115098953247,
"learning_rate": 9.391244451187793e-05,
"loss": 0.1572,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10600
},
{
"epoch": 2.4398688411918843,
"grad_norm": 3.140340566635132,
"learning_rate": 9.381648971640184e-05,
"loss": 0.1544,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10650
},
{
"epoch": 2.451323758931256,
"grad_norm": 3.2607996463775635,
"learning_rate": 9.371983439656524e-05,
"loss": 0.1515,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10700
},
{
"epoch": 2.462778676670628,
"grad_norm": 3.3957531452178955,
"learning_rate": 9.362248009766321e-05,
"loss": 0.1506,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10750
},
{
"epoch": 2.4742335944100002,
"grad_norm": 3.6932249069213867,
"learning_rate": 9.35244283761659e-05,
"loss": 0.1417,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10800
},
{
"epoch": 2.4856885121493724,
"grad_norm": 2.407801389694214,
"learning_rate": 9.342568079969363e-05,
"loss": 0.1507,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10850
},
{
"epoch": 2.497143429888744,
"grad_norm": 3.5010054111480713,
"learning_rate": 9.33262389469918e-05,
"loss": 0.1486,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10900
},
{
"epoch": 2.508598347628116,
"grad_norm": 3.2884604930877686,
"learning_rate": 9.322610440790572e-05,
"loss": 0.1545,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 10950
},
{
"epoch": 2.520053265367488,
"grad_norm": 3.1958744525909424,
"learning_rate": 9.312527878335518e-05,
"loss": 0.1431,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11000
},
{
"epoch": 2.53150818310686,
"grad_norm": 3.1914916038513184,
"learning_rate": 9.302376368530874e-05,
"loss": 0.147,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11050
},
{
"epoch": 2.542963100846232,
"grad_norm": 2.7763078212738037,
"learning_rate": 9.292156073675815e-05,
"loss": 0.1471,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11100
},
{
"epoch": 2.5544180185856042,
"grad_norm": 3.8447723388671875,
"learning_rate": 9.281867157169221e-05,
"loss": 0.1463,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11150
},
{
"epoch": 2.565872936324976,
"grad_norm": 3.5225303173065186,
"learning_rate": 9.27150978350708e-05,
"loss": 0.1462,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11200
},
{
"epoch": 2.577327854064348,
"grad_norm": 3.2575135231018066,
"learning_rate": 9.261084118279847e-05,
"loss": 0.139,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11250
},
{
"epoch": 2.5887827718037197,
"grad_norm": 3.11187481880188,
"learning_rate": 9.250590328169807e-05,
"loss": 0.1423,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11300
},
{
"epoch": 2.600237689543092,
"grad_norm": 3.156135082244873,
"learning_rate": 9.240028580948395e-05,
"loss": 0.1426,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11350
},
{
"epoch": 2.611692607282464,
"grad_norm": 3.4446299076080322,
"learning_rate": 9.229399045473532e-05,
"loss": 0.1459,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11400
},
{
"epoch": 2.623147525021836,
"grad_norm": 3.1665008068084717,
"learning_rate": 9.218701891686916e-05,
"loss": 0.1489,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11450
},
{
"epoch": 2.634602442761208,
"grad_norm": 2.7036280632019043,
"learning_rate": 9.207937290611298e-05,
"loss": 0.1407,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11500
},
{
"epoch": 2.64605736050058,
"grad_norm": 3.9781899452209473,
"learning_rate": 9.197105414347762e-05,
"loss": 0.1476,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11550
},
{
"epoch": 2.6575122782399516,
"grad_norm": 2.9390923976898193,
"learning_rate": 9.186206436072965e-05,
"loss": 0.1369,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11600
},
{
"epoch": 2.6689671959793237,
"grad_norm": 1.9289586544036865,
"learning_rate": 9.175240530036369e-05,
"loss": 0.1363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11650
},
{
"epoch": 2.680422113718696,
"grad_norm": 3.644439697265625,
"learning_rate": 9.164207871557456e-05,
"loss": 0.1415,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11700
},
{
"epoch": 2.691877031458068,
"grad_norm": 3.1818296909332275,
"learning_rate": 9.153108637022928e-05,
"loss": 0.1371,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11750
},
{
"epoch": 2.7033319491974397,
"grad_norm": 2.6996982097625732,
"learning_rate": 9.14194300388388e-05,
"loss": 0.1409,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11800
},
{
"epoch": 2.714786866936812,
"grad_norm": 3.8771860599517822,
"learning_rate": 9.13071115065297e-05,
"loss": 0.1395,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11850
},
{
"epoch": 2.726241784676184,
"grad_norm": 3.087873935699463,
"learning_rate": 9.119413256901563e-05,
"loss": 0.1374,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11900
},
{
"epoch": 2.7376967024155556,
"grad_norm": 3.33695650100708,
"learning_rate": 9.108049503256854e-05,
"loss": 0.1378,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 11950
},
{
"epoch": 2.7491516201549278,
"grad_norm": 3.057760715484619,
"learning_rate": 9.096620071398994e-05,
"loss": 0.1417,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12000
},
{
"epoch": 2.7606065378943,
"grad_norm": 4.001928329467773,
"learning_rate": 9.085125144058168e-05,
"loss": 0.1405,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12050
},
{
"epoch": 2.7720614556336716,
"grad_norm": 2.8355178833007812,
"learning_rate": 9.073564905011689e-05,
"loss": 0.1426,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12100
},
{
"epoch": 2.7835163733730437,
"grad_norm": 3.0020503997802734,
"learning_rate": 9.061939539081049e-05,
"loss": 0.1386,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12150
},
{
"epoch": 2.794971291112416,
"grad_norm": 4.463298797607422,
"learning_rate": 9.05024923212897e-05,
"loss": 0.1368,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12200
},
{
"epoch": 2.8064262088517875,
"grad_norm": 3.095207929611206,
"learning_rate": 9.03849417105643e-05,
"loss": 0.139,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12250
},
{
"epoch": 2.8178811265911596,
"grad_norm": 3.377472162246704,
"learning_rate": 9.026674543799676e-05,
"loss": 0.1356,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12300
},
{
"epoch": 2.8293360443305318,
"grad_norm": 3.876528739929199,
"learning_rate": 9.01479053932722e-05,
"loss": 0.1356,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12350
},
{
"epoch": 2.840790962069904,
"grad_norm": 2.9100306034088135,
"learning_rate": 9.002842347636815e-05,
"loss": 0.1353,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12400
},
{
"epoch": 2.8522458798092756,
"grad_norm": 2.7643377780914307,
"learning_rate": 8.990830159752422e-05,
"loss": 0.1338,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12450
},
{
"epoch": 2.8637007975486477,
"grad_norm": 2.872948169708252,
"learning_rate": 8.978754167721151e-05,
"loss": 0.1352,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12500
},
{
"epoch": 2.8751557152880194,
"grad_norm": 3.3348748683929443,
"learning_rate": 8.96661456461019e-05,
"loss": 0.1337,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12550
},
{
"epoch": 2.8866106330273915,
"grad_norm": 2.863382577896118,
"learning_rate": 8.954411544503729e-05,
"loss": 0.1291,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12600
},
{
"epoch": 2.8980655507667636,
"grad_norm": 3.632277250289917,
"learning_rate": 8.94214530249984e-05,
"loss": 0.1325,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12650
},
{
"epoch": 2.9095204685061358,
"grad_norm": 3.788857936859131,
"learning_rate": 8.929816034707375e-05,
"loss": 0.1331,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12700
},
{
"epoch": 2.9209753862455075,
"grad_norm": 2.73443865776062,
"learning_rate": 8.917423938242814e-05,
"loss": 0.1322,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12750
},
{
"epoch": 2.9324303039848796,
"grad_norm": 3.1101582050323486,
"learning_rate": 8.904969211227134e-05,
"loss": 0.1274,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12800
},
{
"epoch": 2.9438852217242513,
"grad_norm": 2.1412153244018555,
"learning_rate": 8.892452052782616e-05,
"loss": 0.1363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12850
},
{
"epoch": 2.9553401394636234,
"grad_norm": 2.4939417839050293,
"learning_rate": 8.879872663029689e-05,
"loss": 0.1317,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12900
},
{
"epoch": 2.9667950572029955,
"grad_norm": 2.754542589187622,
"learning_rate": 8.867231243083703e-05,
"loss": 0.1257,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 12950
},
{
"epoch": 2.9782499749423677,
"grad_norm": 2.955983877182007,
"learning_rate": 8.854527995051738e-05,
"loss": 0.1289,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13000
},
{
"epoch": 2.9897048926817393,
"grad_norm": 3.313758373260498,
"learning_rate": 8.841763122029358e-05,
"loss": 0.1308,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13050
},
{
"epoch": 3.0011454917739373,
"grad_norm": 1.7117892503738403,
"learning_rate": 8.828936828097368e-05,
"loss": 0.1221,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13100
},
{
"epoch": 3.012600409513309,
"grad_norm": 3.7318451404571533,
"learning_rate": 8.816049318318552e-05,
"loss": 0.0704,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13150
},
{
"epoch": 3.024055327252681,
"grad_norm": 2.1490225791931152,
"learning_rate": 8.803100798734391e-05,
"loss": 0.0698,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13200
},
{
"epoch": 3.0355102449920532,
"grad_norm": 2.4357903003692627,
"learning_rate": 8.790091476361777e-05,
"loss": 0.0717,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13250
},
{
"epoch": 3.046965162731425,
"grad_norm": 3.2305984497070312,
"learning_rate": 8.777021559189695e-05,
"loss": 0.0673,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13300
},
{
"epoch": 3.058420080470797,
"grad_norm": 2.8263580799102783,
"learning_rate": 8.763891256175902e-05,
"loss": 0.069,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13350
},
{
"epoch": 3.069874998210169,
"grad_norm": 3.3232004642486572,
"learning_rate": 8.750700777243583e-05,
"loss": 0.0723,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13400
},
{
"epoch": 3.0813299159495413,
"grad_norm": 2.5803654193878174,
"learning_rate": 8.737450333277996e-05,
"loss": 0.068,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13450
},
{
"epoch": 3.092784833688913,
"grad_norm": 3.2602574825286865,
"learning_rate": 8.724140136123106e-05,
"loss": 0.0682,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13500
},
{
"epoch": 3.104239751428285,
"grad_norm": 3.49511456489563,
"learning_rate": 8.710770398578189e-05,
"loss": 0.0744,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13550
},
{
"epoch": 3.1156946691676572,
"grad_norm": 3.492642879486084,
"learning_rate": 8.697341334394435e-05,
"loss": 0.0678,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13600
},
{
"epoch": 3.127149586907029,
"grad_norm": 2.680922269821167,
"learning_rate": 8.683853158271532e-05,
"loss": 0.0682,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13650
},
{
"epoch": 3.138604504646401,
"grad_norm": 2.501112699508667,
"learning_rate": 8.670306085854229e-05,
"loss": 0.0727,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13700
},
{
"epoch": 3.150059422385773,
"grad_norm": 1.7489196062088013,
"learning_rate": 8.65670033372889e-05,
"loss": 0.0706,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13750
},
{
"epoch": 3.161514340125145,
"grad_norm": 2.4260241985321045,
"learning_rate": 8.643036119420033e-05,
"loss": 0.0718,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13800
},
{
"epoch": 3.172969257864517,
"grad_norm": 3.021453380584717,
"learning_rate": 8.629313661386856e-05,
"loss": 0.0723,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13850
},
{
"epoch": 3.184424175603889,
"grad_norm": 2.5771586894989014,
"learning_rate": 8.615533179019726e-05,
"loss": 0.0712,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13900
},
{
"epoch": 3.195879093343261,
"grad_norm": 3.019286870956421,
"learning_rate": 8.6016948926367e-05,
"loss": 0.0705,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 13950
},
{
"epoch": 3.207334011082633,
"grad_norm": 2.4302775859832764,
"learning_rate": 8.587799023479982e-05,
"loss": 0.071,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14000
},
{
"epoch": 3.218788928822005,
"grad_norm": 1.8431477546691895,
"learning_rate": 8.573845793712383e-05,
"loss": 0.0727,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14050
},
{
"epoch": 3.2302438465613768,
"grad_norm": 2.839580774307251,
"learning_rate": 8.559835426413794e-05,
"loss": 0.0739,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14100
},
{
"epoch": 3.241698764300749,
"grad_norm": 3.9472312927246094,
"learning_rate": 8.545768145577589e-05,
"loss": 0.0689,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14150
},
{
"epoch": 3.253153682040121,
"grad_norm": 2.908961296081543,
"learning_rate": 8.531644176107066e-05,
"loss": 0.0701,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14200
},
{
"epoch": 3.2646085997794927,
"grad_norm": 1.9942492246627808,
"learning_rate": 8.517463743811836e-05,
"loss": 0.0708,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14250
},
{
"epoch": 3.276063517518865,
"grad_norm": 2.883118152618408,
"learning_rate": 8.503227075404227e-05,
"loss": 0.0751,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14300
},
{
"epoch": 3.287518435258237,
"grad_norm": 2.3924851417541504,
"learning_rate": 8.488934398495649e-05,
"loss": 0.0725,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14350
},
{
"epoch": 3.2989733529976086,
"grad_norm": 2.108149766921997,
"learning_rate": 8.474585941592959e-05,
"loss": 0.0754,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14400
},
{
"epoch": 3.3104282707369808,
"grad_norm": 1.8208028078079224,
"learning_rate": 8.460181934094809e-05,
"loss": 0.0713,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14450
},
{
"epoch": 3.321883188476353,
"grad_norm": 2.987584114074707,
"learning_rate": 8.445722606287971e-05,
"loss": 0.0727,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14500
},
{
"epoch": 3.3333381062157246,
"grad_norm": 3.576843023300171,
"learning_rate": 8.43120818934367e-05,
"loss": 0.0692,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14550
},
{
"epoch": 3.3447930239550967,
"grad_norm": 1.5616097450256348,
"learning_rate": 8.416638915313868e-05,
"loss": 0.071,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14600
},
{
"epoch": 3.356247941694469,
"grad_norm": 2.461344003677368,
"learning_rate": 8.402015017127571e-05,
"loss": 0.0728,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14650
},
{
"epoch": 3.3677028594338405,
"grad_norm": 2.740246534347534,
"learning_rate": 8.387336728587103e-05,
"loss": 0.0738,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14700
},
{
"epoch": 3.3791577771732126,
"grad_norm": 2.1253201961517334,
"learning_rate": 8.372604284364355e-05,
"loss": 0.0721,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14750
},
{
"epoch": 3.3906126949125848,
"grad_norm": 2.5474374294281006,
"learning_rate": 8.357817919997049e-05,
"loss": 0.0701,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14800
},
{
"epoch": 3.4020676126519565,
"grad_norm": 1.9206650257110596,
"learning_rate": 8.34297787188496e-05,
"loss": 0.0721,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14850
},
{
"epoch": 3.4135225303913286,
"grad_norm": 2.298408031463623,
"learning_rate": 8.328084377286149e-05,
"loss": 0.0719,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14900
},
{
"epoch": 3.4249774481307007,
"grad_norm": 2.9477977752685547,
"learning_rate": 8.313137674313158e-05,
"loss": 0.0724,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 14950
},
{
"epoch": 3.436432365870073,
"grad_norm": 2.4904532432556152,
"learning_rate": 8.298138001929206e-05,
"loss": 0.0726,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15000
},
{
"epoch": 3.4478872836094445,
"grad_norm": 2.2400805950164795,
"learning_rate": 8.283085599944376e-05,
"loss": 0.0713,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15050
},
{
"epoch": 3.4593422013488166,
"grad_norm": 2.3121421337127686,
"learning_rate": 8.267980709011769e-05,
"loss": 0.0668,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15100
},
{
"epoch": 3.4707971190881883,
"grad_norm": 2.701951026916504,
"learning_rate": 8.25282357062367e-05,
"loss": 0.0698,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15150
},
{
"epoch": 3.4822520368275605,
"grad_norm": 2.5985162258148193,
"learning_rate": 8.237614427107672e-05,
"loss": 0.0682,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15200
},
{
"epoch": 3.4937069545669326,
"grad_norm": 1.998067855834961,
"learning_rate": 8.222353521622819e-05,
"loss": 0.0716,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15250
},
{
"epoch": 3.5051618723063047,
"grad_norm": 2.705017328262329,
"learning_rate": 8.2070410981557e-05,
"loss": 0.0687,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15300
},
{
"epoch": 3.5166167900456764,
"grad_norm": 2.35690975189209,
"learning_rate": 8.191677401516565e-05,
"loss": 0.0693,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15350
},
{
"epoch": 3.5280717077850485,
"grad_norm": 2.5952446460723877,
"learning_rate": 8.176262677335398e-05,
"loss": 0.0712,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15400
},
{
"epoch": 3.53952662552442,
"grad_norm": 2.347503662109375,
"learning_rate": 8.160797172057998e-05,
"loss": 0.0724,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15450
},
{
"epoch": 3.5509815432637923,
"grad_norm": 2.6107993125915527,
"learning_rate": 8.145281132942037e-05,
"loss": 0.069,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15500
},
{
"epoch": 3.5624364610031645,
"grad_norm": 2.2941091060638428,
"learning_rate": 8.129714808053106e-05,
"loss": 0.069,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15550
},
{
"epoch": 3.5738913787425366,
"grad_norm": 3.4392402172088623,
"learning_rate": 8.114098446260745e-05,
"loss": 0.072,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15600
},
{
"epoch": 3.5853462964819083,
"grad_norm": 1.876505732536316,
"learning_rate": 8.098432297234473e-05,
"loss": 0.0694,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15650
},
{
"epoch": 3.5968012142212804,
"grad_norm": 1.9874284267425537,
"learning_rate": 8.082716611439793e-05,
"loss": 0.0685,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15700
},
{
"epoch": 3.608256131960652,
"grad_norm": 2.479461669921875,
"learning_rate": 8.066951640134181e-05,
"loss": 0.0696,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15750
},
{
"epoch": 3.619711049700024,
"grad_norm": 2.318502426147461,
"learning_rate": 8.051137635363078e-05,
"loss": 0.0712,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15800
},
{
"epoch": 3.6311659674393963,
"grad_norm": 2.2743539810180664,
"learning_rate": 8.035274849955858e-05,
"loss": 0.066,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15850
},
{
"epoch": 3.6426208851787685,
"grad_norm": 2.7927591800689697,
"learning_rate": 8.019363537521781e-05,
"loss": 0.0722,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15900
},
{
"epoch": 3.65407580291814,
"grad_norm": 2.3082404136657715,
"learning_rate": 8.003403952445942e-05,
"loss": 0.0727,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 15950
},
{
"epoch": 3.6655307206575123,
"grad_norm": 1.7190062999725342,
"learning_rate": 7.987396349885207e-05,
"loss": 0.0688,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16000
},
{
"epoch": 3.6769856383968844,
"grad_norm": 2.170894145965576,
"learning_rate": 7.97134098576413e-05,
"loss": 0.0643,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16050
},
{
"epoch": 3.688440556136256,
"grad_norm": 2.3685245513916016,
"learning_rate": 7.955238116770859e-05,
"loss": 0.0667,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16100
},
{
"epoch": 3.6998954738756282,
"grad_norm": 2.269733190536499,
"learning_rate": 7.939088000353038e-05,
"loss": 0.0653,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16150
},
{
"epoch": 3.7113503916150004,
"grad_norm": 2.966156005859375,
"learning_rate": 7.922890894713688e-05,
"loss": 0.0641,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16200
},
{
"epoch": 3.722805309354372,
"grad_norm": 2.5244526863098145,
"learning_rate": 7.906647058807078e-05,
"loss": 0.0673,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16250
},
{
"epoch": 3.734260227093744,
"grad_norm": 2.3612561225891113,
"learning_rate": 7.890356752334585e-05,
"loss": 0.0682,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16300
},
{
"epoch": 3.7457151448331163,
"grad_norm": 2.6866989135742188,
"learning_rate": 7.874020235740544e-05,
"loss": 0.0689,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16350
},
{
"epoch": 3.757170062572488,
"grad_norm": 2.266900062561035,
"learning_rate": 7.857637770208084e-05,
"loss": 0.0698,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16400
},
{
"epoch": 3.76862498031186,
"grad_norm": 2.235653877258301,
"learning_rate": 7.841209617654949e-05,
"loss": 0.0642,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16450
},
{
"epoch": 3.7800798980512322,
"grad_norm": 4.613194942474365,
"learning_rate": 7.824736040729315e-05,
"loss": 0.0646,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16500
},
{
"epoch": 3.7915348157906044,
"grad_norm": 1.9603101015090942,
"learning_rate": 7.808217302805587e-05,
"loss": 0.0686,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16550
},
{
"epoch": 3.802989733529976,
"grad_norm": 2.1632003784179688,
"learning_rate": 7.791653667980191e-05,
"loss": 0.0663,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16600
},
{
"epoch": 3.814444651269348,
"grad_norm": 2.5433571338653564,
"learning_rate": 7.77504540106735e-05,
"loss": 0.0664,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16650
},
{
"epoch": 3.82589956900872,
"grad_norm": 3.197382926940918,
"learning_rate": 7.758392767594853e-05,
"loss": 0.0679,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16700
},
{
"epoch": 3.837354486748092,
"grad_norm": 2.555476188659668,
"learning_rate": 7.741696033799804e-05,
"loss": 0.0681,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16750
},
{
"epoch": 3.848809404487464,
"grad_norm": 2.589463233947754,
"learning_rate": 7.724955466624371e-05,
"loss": 0.0677,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16800
},
{
"epoch": 3.8602643222268362,
"grad_norm": 2.2410428524017334,
"learning_rate": 7.708171333711517e-05,
"loss": 0.0688,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16850
},
{
"epoch": 3.871719239966208,
"grad_norm": 2.9268081188201904,
"learning_rate": 7.69134390340072e-05,
"loss": 0.0674,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16900
},
{
"epoch": 3.88317415770558,
"grad_norm": 2.1275105476379395,
"learning_rate": 7.674473444723684e-05,
"loss": 0.0677,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 16950
},
{
"epoch": 3.8946290754449517,
"grad_norm": 1.7868996858596802,
"learning_rate": 7.657560227400037e-05,
"loss": 0.0667,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17000
},
{
"epoch": 3.906083993184324,
"grad_norm": 2.705197811126709,
"learning_rate": 7.640604521833015e-05,
"loss": 0.0713,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17050
},
{
"epoch": 3.917538910923696,
"grad_norm": 1.5226702690124512,
"learning_rate": 7.62360659910515e-05,
"loss": 0.067,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17100
},
{
"epoch": 3.928993828663068,
"grad_norm": 2.7335004806518555,
"learning_rate": 7.60656673097392e-05,
"loss": 0.0653,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17150
},
{
"epoch": 3.94044874640244,
"grad_norm": 2.0359129905700684,
"learning_rate": 7.589485189867422e-05,
"loss": 0.067,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17200
},
{
"epoch": 3.951903664141812,
"grad_norm": 2.2404749393463135,
"learning_rate": 7.572362248880001e-05,
"loss": 0.0659,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17250
},
{
"epoch": 3.9633585818811836,
"grad_norm": 1.9133015871047974,
"learning_rate": 7.555198181767894e-05,
"loss": 0.0662,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17300
},
{
"epoch": 3.9748134996205557,
"grad_norm": 3.204033136367798,
"learning_rate": 7.537993262944849e-05,
"loss": 0.0644,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17350
},
{
"epoch": 3.986268417359928,
"grad_norm": 2.0416345596313477,
"learning_rate": 7.520747767477734e-05,
"loss": 0.0648,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17400
},
{
"epoch": 3.9977233350993,
"grad_norm": 2.1592066287994385,
"learning_rate": 7.50346197108215e-05,
"loss": 0.0629,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17450
},
{
"epoch": 4.0091639341914975,
"grad_norm": 2.386658191680908,
"learning_rate": 7.486136150118015e-05,
"loss": 0.0421,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17500
},
{
"epoch": 4.02061885193087,
"grad_norm": 1.3900179862976074,
"learning_rate": 7.468770581585146e-05,
"loss": 0.0324,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17550
},
{
"epoch": 4.032073769670242,
"grad_norm": 1.8588780164718628,
"learning_rate": 7.451365543118831e-05,
"loss": 0.0354,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17600
},
{
"epoch": 4.043528687409614,
"grad_norm": 1.4627822637557983,
"learning_rate": 7.433921312985393e-05,
"loss": 0.0328,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17650
},
{
"epoch": 4.054983605148985,
"grad_norm": 2.9422807693481445,
"learning_rate": 7.416438170077738e-05,
"loss": 0.0349,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17700
},
{
"epoch": 4.066438522888357,
"grad_norm": 1.9216961860656738,
"learning_rate": 7.398916393910895e-05,
"loss": 0.0364,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17750
},
{
"epoch": 4.077893440627729,
"grad_norm": 1.9999079704284668,
"learning_rate": 7.381356264617557e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17800
},
{
"epoch": 4.0893483583671015,
"grad_norm": 1.1669881343841553,
"learning_rate": 7.363758062943587e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17850
},
{
"epoch": 4.100803276106474,
"grad_norm": 1.4963182210922241,
"learning_rate": 7.346122070243539e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17900
},
{
"epoch": 4.112258193845846,
"grad_norm": 2.435983419418335,
"learning_rate": 7.328448568476163e-05,
"loss": 0.0353,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 17950
},
{
"epoch": 4.123713111585217,
"grad_norm": 1.783022403717041,
"learning_rate": 7.310737840199885e-05,
"loss": 0.0343,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18000
},
{
"epoch": 4.135168029324589,
"grad_norm": 1.7959028482437134,
"learning_rate": 7.292990168568302e-05,
"loss": 0.0344,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18050
},
{
"epoch": 4.146622947063961,
"grad_norm": 1.0920823812484741,
"learning_rate": 7.275205837325649e-05,
"loss": 0.0352,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18100
},
{
"epoch": 4.158077864803333,
"grad_norm": 2.1539368629455566,
"learning_rate": 7.257385130802261e-05,
"loss": 0.0362,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18150
},
{
"epoch": 4.1695327825427055,
"grad_norm": 2.0688672065734863,
"learning_rate": 7.239528333910031e-05,
"loss": 0.0358,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18200
},
{
"epoch": 4.180987700282078,
"grad_norm": 2.0575592517852783,
"learning_rate": 7.221635732137854e-05,
"loss": 0.037,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18250
},
{
"epoch": 4.192442618021449,
"grad_norm": 2.307478189468384,
"learning_rate": 7.203707611547066e-05,
"loss": 0.0383,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18300
},
{
"epoch": 4.203897535760821,
"grad_norm": 1.4493507146835327,
"learning_rate": 7.185744258766858e-05,
"loss": 0.0368,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18350
},
{
"epoch": 4.215352453500193,
"grad_norm": 1.858702301979065,
"learning_rate": 7.167745960989708e-05,
"loss": 0.0371,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18400
},
{
"epoch": 4.226807371239565,
"grad_norm": 2.091564893722534,
"learning_rate": 7.149713005966784e-05,
"loss": 0.037,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18450
},
{
"epoch": 4.238262288978937,
"grad_norm": 1.320420503616333,
"learning_rate": 7.13164568200334e-05,
"loss": 0.0395,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18500
},
{
"epoch": 4.2497172067183095,
"grad_norm": 1.7669836282730103,
"learning_rate": 7.113544277954116e-05,
"loss": 0.036,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18550
},
{
"epoch": 4.261172124457682,
"grad_norm": 1.7692891359329224,
"learning_rate": 7.095409083218705e-05,
"loss": 0.0363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18600
},
{
"epoch": 4.272627042197053,
"grad_norm": 1.4716825485229492,
"learning_rate": 7.077240387736943e-05,
"loss": 0.0387,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18650
},
{
"epoch": 4.284081959936425,
"grad_norm": 1.9312763214111328,
"learning_rate": 7.05903848198426e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18700
},
{
"epoch": 4.295536877675797,
"grad_norm": 1.417018175125122,
"learning_rate": 7.040803656967045e-05,
"loss": 0.0364,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18750
},
{
"epoch": 4.306991795415169,
"grad_norm": 2.400550365447998,
"learning_rate": 7.022536204217989e-05,
"loss": 0.0363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18800
},
{
"epoch": 4.318446713154541,
"grad_norm": 1.612289547920227,
"learning_rate": 7.004236415791421e-05,
"loss": 0.0371,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18850
},
{
"epoch": 4.3299016308939136,
"grad_norm": 2.4686686992645264,
"learning_rate": 6.985904584258649e-05,
"loss": 0.0401,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18900
},
{
"epoch": 4.341356548633285,
"grad_norm": 3.242429256439209,
"learning_rate": 6.967541002703274e-05,
"loss": 0.0353,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 18950
},
{
"epoch": 4.352811466372657,
"grad_norm": 2.2859609127044678,
"learning_rate": 6.949145964716505e-05,
"loss": 0.0365,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19000
},
{
"epoch": 4.364266384112029,
"grad_norm": 2.1360576152801514,
"learning_rate": 6.930719764392466e-05,
"loss": 0.0382,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19050
},
{
"epoch": 4.375721301851401,
"grad_norm": 1.6462370157241821,
"learning_rate": 6.912262696323497e-05,
"loss": 0.0358,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19100
},
{
"epoch": 4.387176219590773,
"grad_norm": 1.5075321197509766,
"learning_rate": 6.893775055595442e-05,
"loss": 0.0356,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19150
},
{
"epoch": 4.398631137330145,
"grad_norm": 1.614206075668335,
"learning_rate": 6.87525713778293e-05,
"loss": 0.0392,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19200
},
{
"epoch": 4.410086055069517,
"grad_norm": 1.9505984783172607,
"learning_rate": 6.856709238944649e-05,
"loss": 0.0354,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19250
},
{
"epoch": 4.421540972808889,
"grad_norm": 1.831098198890686,
"learning_rate": 6.838131655618618e-05,
"loss": 0.0355,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19300
},
{
"epoch": 4.432995890548261,
"grad_norm": 2.2867400646209717,
"learning_rate": 6.819524684817438e-05,
"loss": 0.037,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19350
},
{
"epoch": 4.444450808287633,
"grad_norm": 1.2839210033416748,
"learning_rate": 6.800888624023553e-05,
"loss": 0.0375,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19400
},
{
"epoch": 4.455905726027005,
"grad_norm": 1.812117099761963,
"learning_rate": 6.782223771184484e-05,
"loss": 0.0365,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19450
},
{
"epoch": 4.467360643766377,
"grad_norm": 1.3475086688995361,
"learning_rate": 6.763530424708072e-05,
"loss": 0.0356,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19500
},
{
"epoch": 4.4788155615057486,
"grad_norm": 1.6308741569519043,
"learning_rate": 6.744808883457707e-05,
"loss": 0.0367,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19550
},
{
"epoch": 4.490270479245121,
"grad_norm": 1.424625039100647,
"learning_rate": 6.726059446747545e-05,
"loss": 0.0384,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19600
},
{
"epoch": 4.501725396984493,
"grad_norm": 2.242457389831543,
"learning_rate": 6.707282414337728e-05,
"loss": 0.0352,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19650
},
{
"epoch": 4.513180314723865,
"grad_norm": 2.116205930709839,
"learning_rate": 6.688478086429589e-05,
"loss": 0.0374,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19700
},
{
"epoch": 4.524635232463237,
"grad_norm": 1.493812084197998,
"learning_rate": 6.669646763660855e-05,
"loss": 0.0339,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19750
},
{
"epoch": 4.536090150202609,
"grad_norm": 1.5812180042266846,
"learning_rate": 6.650788747100832e-05,
"loss": 0.0375,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19800
},
{
"epoch": 4.54754506794198,
"grad_norm": 1.9899191856384277,
"learning_rate": 6.631904338245607e-05,
"loss": 0.0373,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19850
},
{
"epoch": 4.558999985681353,
"grad_norm": 1.682928442955017,
"learning_rate": 6.612993839013211e-05,
"loss": 0.0363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19900
},
{
"epoch": 4.570454903420725,
"grad_norm": 1.5727615356445312,
"learning_rate": 6.594057551738803e-05,
"loss": 0.0368,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 19950
},
{
"epoch": 4.581909821160097,
"grad_norm": 1.2249151468276978,
"learning_rate": 6.575095779169836e-05,
"loss": 0.0374,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20000
},
{
"epoch": 4.593364738899469,
"grad_norm": 1.8625729084014893,
"learning_rate": 6.556108824461206e-05,
"loss": 0.0356,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20050
},
{
"epoch": 4.604819656638841,
"grad_norm": 1.3668529987335205,
"learning_rate": 6.537096991170423e-05,
"loss": 0.0331,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20100
},
{
"epoch": 4.616274574378213,
"grad_norm": 1.388374924659729,
"learning_rate": 6.518060583252741e-05,
"loss": 0.0355,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20150
},
{
"epoch": 4.627729492117584,
"grad_norm": 2.348038673400879,
"learning_rate": 6.498999905056309e-05,
"loss": 0.0369,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20200
},
{
"epoch": 4.639184409856957,
"grad_norm": 1.701794147491455,
"learning_rate": 6.479915261317298e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20250
},
{
"epoch": 4.650639327596329,
"grad_norm": 1.3405938148498535,
"learning_rate": 6.460806957155037e-05,
"loss": 0.0355,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20300
},
{
"epoch": 4.662094245335701,
"grad_norm": 1.725538730621338,
"learning_rate": 6.441675298067128e-05,
"loss": 0.0348,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20350
},
{
"epoch": 4.673549163075073,
"grad_norm": 1.583162784576416,
"learning_rate": 6.422520589924564e-05,
"loss": 0.0344,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20400
},
{
"epoch": 4.685004080814444,
"grad_norm": 1.4338629245758057,
"learning_rate": 6.403343138966841e-05,
"loss": 0.0353,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20450
},
{
"epoch": 4.696458998553816,
"grad_norm": 2.6246755123138428,
"learning_rate": 6.384143251797056e-05,
"loss": 0.0363,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20500
},
{
"epoch": 4.7079139162931884,
"grad_norm": 2.1834542751312256,
"learning_rate": 6.364921235377016e-05,
"loss": 0.0343,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20550
},
{
"epoch": 4.719368834032561,
"grad_norm": 1.6738187074661255,
"learning_rate": 6.345677397022315e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20600
},
{
"epoch": 4.730823751771933,
"grad_norm": 1.6495721340179443,
"learning_rate": 6.326412044397438e-05,
"loss": 0.0366,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20650
},
{
"epoch": 4.742278669511305,
"grad_norm": 1.7878650426864624,
"learning_rate": 6.307125485510828e-05,
"loss": 0.0338,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20700
},
{
"epoch": 4.753733587250677,
"grad_norm": 2.035374641418457,
"learning_rate": 6.287818028709967e-05,
"loss": 0.0371,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20750
},
{
"epoch": 4.765188504990048,
"grad_norm": 2.248223304748535,
"learning_rate": 6.268489982676446e-05,
"loss": 0.0374,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20800
},
{
"epoch": 4.77664342272942,
"grad_norm": 2.056480646133423,
"learning_rate": 6.249141656421035e-05,
"loss": 0.0353,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20850
},
{
"epoch": 4.7880983404687925,
"grad_norm": 1.4961349964141846,
"learning_rate": 6.229773359278735e-05,
"loss": 0.037,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20900
},
{
"epoch": 4.799553258208165,
"grad_norm": 1.207465410232544,
"learning_rate": 6.210385400903836e-05,
"loss": 0.0344,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 20950
},
{
"epoch": 4.811008175947537,
"grad_norm": 1.933811902999878,
"learning_rate": 6.190978091264959e-05,
"loss": 0.0338,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21000
},
{
"epoch": 4.822463093686909,
"grad_norm": 1.5286064147949219,
"learning_rate": 6.171551740640115e-05,
"loss": 0.033,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21050
},
{
"epoch": 4.83391801142628,
"grad_norm": 1.4746378660202026,
"learning_rate": 6.152106659611736e-05,
"loss": 0.035,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21100
},
{
"epoch": 4.845372929165652,
"grad_norm": 1.964225172996521,
"learning_rate": 6.132643159061707e-05,
"loss": 0.0336,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21150
},
{
"epoch": 4.856827846905024,
"grad_norm": 1.239408016204834,
"learning_rate": 6.1131615501664e-05,
"loss": 0.0321,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21200
},
{
"epoch": 4.8682827646443965,
"grad_norm": 2.219224452972412,
"learning_rate": 6.093662144391695e-05,
"loss": 0.0371,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21250
},
{
"epoch": 4.879737682383769,
"grad_norm": 1.2696152925491333,
"learning_rate": 6.074145253488006e-05,
"loss": 0.0338,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21300
},
{
"epoch": 4.891192600123141,
"grad_norm": 0.5583789944648743,
"learning_rate": 6.054611189485293e-05,
"loss": 0.0351,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21350
},
{
"epoch": 4.902647517862512,
"grad_norm": 1.4981776475906372,
"learning_rate": 6.035060264688075e-05,
"loss": 0.0321,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21400
},
{
"epoch": 4.914102435601884,
"grad_norm": 1.6405904293060303,
"learning_rate": 6.0154927916704304e-05,
"loss": 0.0339,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21450
},
{
"epoch": 4.925557353341256,
"grad_norm": 1.264320731163025,
"learning_rate": 5.9959090832710155e-05,
"loss": 0.0319,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21500
},
{
"epoch": 4.937012271080628,
"grad_norm": 2.039963722229004,
"learning_rate": 5.9763094525880426e-05,
"loss": 0.0344,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21550
},
{
"epoch": 4.9484671888200005,
"grad_norm": 1.5706747770309448,
"learning_rate": 5.956694212974292e-05,
"loss": 0.0334,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21600
},
{
"epoch": 4.959922106559373,
"grad_norm": 2.058473587036133,
"learning_rate": 5.937063678032093e-05,
"loss": 0.0335,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21650
},
{
"epoch": 4.971377024298745,
"grad_norm": 1.5394372940063477,
"learning_rate": 5.9174181616083066e-05,
"loss": 0.0337,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21700
},
{
"epoch": 4.982831942038116,
"grad_norm": 2.087599992752075,
"learning_rate": 5.89775797778932e-05,
"loss": 0.0341,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21750
},
{
"epoch": 4.994286859777488,
"grad_norm": 1.8887306451797485,
"learning_rate": 5.878083440896015e-05,
"loss": 0.0327,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21800
},
{
"epoch": 5.005727458869686,
"grad_norm": 1.0491043329238892,
"learning_rate": 5.858394865478745e-05,
"loss": 0.0263,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21850
},
{
"epoch": 5.017182376609058,
"grad_norm": 1.3754074573516846,
"learning_rate": 5.8386925663123104e-05,
"loss": 0.0157,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21900
},
{
"epoch": 5.02863729434843,
"grad_norm": 0.8756074905395508,
"learning_rate": 5.818976858390918e-05,
"loss": 0.0184,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 21950
},
{
"epoch": 5.040092212087802,
"grad_norm": 0.9743272066116333,
"learning_rate": 5.7992480569231514e-05,
"loss": 0.018,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22000
},
{
"epoch": 5.051547129827174,
"grad_norm": 1.1601800918579102,
"learning_rate": 5.779506477326933e-05,
"loss": 0.0177,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22050
},
{
"epoch": 5.063002047566546,
"grad_norm": 1.3135687112808228,
"learning_rate": 5.7597524352244734e-05,
"loss": 0.0191,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22100
},
{
"epoch": 5.0744569653059175,
"grad_norm": 1.3936012983322144,
"learning_rate": 5.7399862464372324e-05,
"loss": 0.0184,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22150
},
{
"epoch": 5.08591188304529,
"grad_norm": 0.9932096600532532,
"learning_rate": 5.720208226980864e-05,
"loss": 0.0186,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22200
},
{
"epoch": 5.097366800784662,
"grad_norm": 1.0546112060546875,
"learning_rate": 5.700418693060173e-05,
"loss": 0.0194,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22250
},
{
"epoch": 5.108821718524034,
"grad_norm": 0.8949224948883057,
"learning_rate": 5.6806179610640486e-05,
"loss": 0.0187,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22300
},
{
"epoch": 5.120276636263406,
"grad_norm": 0.9786812663078308,
"learning_rate": 5.660806347560416e-05,
"loss": 0.0176,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22350
},
{
"epoch": 5.131731554002778,
"grad_norm": 0.9927299618721008,
"learning_rate": 5.6409841692911625e-05,
"loss": 0.0195,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22400
},
{
"epoch": 5.143186471742149,
"grad_norm": 1.1186367273330688,
"learning_rate": 5.621151743167091e-05,
"loss": 0.0189,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22450
},
{
"epoch": 5.1546413894815215,
"grad_norm": 1.2558495998382568,
"learning_rate": 5.60130938626284e-05,
"loss": 0.0195,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22500
},
{
"epoch": 5.166096307220894,
"grad_norm": 1.2264378070831299,
"learning_rate": 5.581457415811815e-05,
"loss": 0.0198,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22550
},
{
"epoch": 5.177551224960266,
"grad_norm": 1.243323564529419,
"learning_rate": 5.561596149201127e-05,
"loss": 0.0187,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22600
},
{
"epoch": 5.189006142699638,
"grad_norm": 0.641426682472229,
"learning_rate": 5.541725903966504e-05,
"loss": 0.0183,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22650
},
{
"epoch": 5.20046106043901,
"grad_norm": 0.5407239198684692,
"learning_rate": 5.521846997787223e-05,
"loss": 0.019,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22700
},
{
"epoch": 5.211915978178382,
"grad_norm": 1.1588449478149414,
"learning_rate": 5.501959748481035e-05,
"loss": 0.0203,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22750
},
{
"epoch": 5.223370895917753,
"grad_norm": 1.5353953838348389,
"learning_rate": 5.482064473999071e-05,
"loss": 0.0197,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22800
},
{
"epoch": 5.2348258136571255,
"grad_norm": 1.5715053081512451,
"learning_rate": 5.462161492420772e-05,
"loss": 0.0205,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22850
},
{
"epoch": 5.246280731396498,
"grad_norm": 0.8576170206069946,
"learning_rate": 5.442251121948793e-05,
"loss": 0.0198,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22900
},
{
"epoch": 5.25773564913587,
"grad_norm": 1.722284197807312,
"learning_rate": 5.422333680903921e-05,
"loss": 0.0194,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 22950
},
{
"epoch": 5.269190566875242,
"grad_norm": 1.3785938024520874,
"learning_rate": 5.4024094877199884e-05,
"loss": 0.0204,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23000
},
{
"epoch": 5.280645484614614,
"grad_norm": 0.8565208911895752,
"learning_rate": 5.382478860938776e-05,
"loss": 0.0187,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23050
},
{
"epoch": 5.292100402353985,
"grad_norm": 1.519986629486084,
"learning_rate": 5.362542119204924e-05,
"loss": 0.0204,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23100
},
{
"epoch": 5.303555320093357,
"grad_norm": 0.8362240791320801,
"learning_rate": 5.3425995812608355e-05,
"loss": 0.0188,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23150
},
{
"epoch": 5.3150102378327295,
"grad_norm": 1.1630821228027344,
"learning_rate": 5.3226515659415824e-05,
"loss": 0.0193,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23200
},
{
"epoch": 5.326465155572102,
"grad_norm": 0.8801319599151611,
"learning_rate": 5.302698392169806e-05,
"loss": 0.0179,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23250
},
{
"epoch": 5.337920073311474,
"grad_norm": 1.0547822713851929,
"learning_rate": 5.2827403789506234e-05,
"loss": 0.0203,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23300
},
{
"epoch": 5.349374991050846,
"grad_norm": 1.2050660848617554,
"learning_rate": 5.262777845366515e-05,
"loss": 0.0189,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23350
},
{
"epoch": 5.360829908790217,
"grad_norm": 1.1393359899520874,
"learning_rate": 5.242811110572242e-05,
"loss": 0.0199,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23400
},
{
"epoch": 5.372284826529589,
"grad_norm": 0.7587368488311768,
"learning_rate": 5.2228404937897235e-05,
"loss": 0.0182,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23450
},
{
"epoch": 5.383739744268961,
"grad_norm": 1.5062503814697266,
"learning_rate": 5.20286631430295e-05,
"loss": 0.0202,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23500
},
{
"epoch": 5.3951946620083335,
"grad_norm": 0.8290795087814331,
"learning_rate": 5.1828888914528674e-05,
"loss": 0.0197,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23550
},
{
"epoch": 5.406649579747706,
"grad_norm": 1.096450924873352,
"learning_rate": 5.162908544632274e-05,
"loss": 0.0194,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23600
},
{
"epoch": 5.418104497487078,
"grad_norm": 0.8339506387710571,
"learning_rate": 5.142925593280722e-05,
"loss": 0.0206,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23650
},
{
"epoch": 5.429559415226449,
"grad_norm": 1.8221694231033325,
"learning_rate": 5.1229403568793963e-05,
"loss": 0.02,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23700
},
{
"epoch": 5.441014332965821,
"grad_norm": 2.2157158851623535,
"learning_rate": 5.1029531549460205e-05,
"loss": 0.0208,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23750
},
{
"epoch": 5.452469250705193,
"grad_norm": 1.0183664560317993,
"learning_rate": 5.0829643070297415e-05,
"loss": 0.0192,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23800
},
{
"epoch": 5.463924168444565,
"grad_norm": 0.8894338011741638,
"learning_rate": 5.062974132706016e-05,
"loss": 0.0188,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23850
},
{
"epoch": 5.4753790861839375,
"grad_norm": 1.4340115785598755,
"learning_rate": 5.042982951571515e-05,
"loss": 0.0188,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23900
},
{
"epoch": 5.48683400392331,
"grad_norm": 1.343416690826416,
"learning_rate": 5.022991083239002e-05,
"loss": 0.0204,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 23950
},
{
"epoch": 5.498288921662681,
"grad_norm": 1.0022575855255127,
"learning_rate": 5.0029988473322256e-05,
"loss": 0.0196,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24000
},
{
"epoch": 5.509743839402053,
"grad_norm": 1.0293878316879272,
"learning_rate": 4.9830065634808144e-05,
"loss": 0.0185,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24050
},
{
"epoch": 5.521198757141425,
"grad_norm": 1.5076055526733398,
"learning_rate": 4.963014551315163e-05,
"loss": 0.018,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24100
},
{
"epoch": 5.532653674880797,
"grad_norm": 1.333103895187378,
"learning_rate": 4.943023130461317e-05,
"loss": 0.0189,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24150
},
{
"epoch": 5.544108592620169,
"grad_norm": 1.848940134048462,
"learning_rate": 4.9230326205358794e-05,
"loss": 0.0191,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24200
},
{
"epoch": 5.5555635103595415,
"grad_norm": 1.3931152820587158,
"learning_rate": 4.903043341140879e-05,
"loss": 0.0199,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24250
},
{
"epoch": 5.567018428098914,
"grad_norm": 1.2779439687728882,
"learning_rate": 4.883055611858676e-05,
"loss": 0.0181,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24300
},
{
"epoch": 5.578473345838285,
"grad_norm": 1.221993088722229,
"learning_rate": 4.8630697522468455e-05,
"loss": 0.0201,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24350
},
{
"epoch": 5.589928263577657,
"grad_norm": 1.5948313474655151,
"learning_rate": 4.8430860818330756e-05,
"loss": 0.0192,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24400
},
{
"epoch": 5.601383181317029,
"grad_norm": 1.4784343242645264,
"learning_rate": 4.823104920110049e-05,
"loss": 0.0195,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24450
},
{
"epoch": 5.612838099056401,
"grad_norm": 1.4098366498947144,
"learning_rate": 4.8031265865303434e-05,
"loss": 0.0201,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24500
},
{
"epoch": 5.624293016795773,
"grad_norm": 1.5338062047958374,
"learning_rate": 4.783151400501319e-05,
"loss": 0.0196,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24550
},
{
"epoch": 5.635747934535145,
"grad_norm": 1.1229875087738037,
"learning_rate": 4.763179681380016e-05,
"loss": 0.0188,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24600
},
{
"epoch": 5.647202852274517,
"grad_norm": 1.63667893409729,
"learning_rate": 4.7432117484680434e-05,
"loss": 0.02,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24650
},
{
"epoch": 5.658657770013889,
"grad_norm": 0.6071897745132446,
"learning_rate": 4.723247921006483e-05,
"loss": 0.0202,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24700
},
{
"epoch": 5.670112687753261,
"grad_norm": 1.3240978717803955,
"learning_rate": 4.703288518170774e-05,
"loss": 0.019,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24750
},
{
"epoch": 5.681567605492633,
"grad_norm": 0.9332528114318848,
"learning_rate": 4.683333859065621e-05,
"loss": 0.0189,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24800
},
{
"epoch": 5.693022523232005,
"grad_norm": 1.2496166229248047,
"learning_rate": 4.663384262719881e-05,
"loss": 0.0183,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24850
},
{
"epoch": 5.704477440971377,
"grad_norm": 1.2651309967041016,
"learning_rate": 4.643440048081478e-05,
"loss": 0.0201,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24900
},
{
"epoch": 5.715932358710749,
"grad_norm": 1.1634583473205566,
"learning_rate": 4.623501534012287e-05,
"loss": 0.02,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 24950
},
{
"epoch": 5.727387276450121,
"grad_norm": 1.3410481214523315,
"learning_rate": 4.60356903928305e-05,
"loss": 0.0187,
"memory/device_mem_reserved(gib)": 53.25,
"memory/max_mem_active(gib)": 48.21,
"memory/max_mem_allocated(gib)": 48.21,
"step": 25000
}
],
"logging_steps": 50,
"max_steps": 43649,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.430940996497441e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}