Files
nemotron-terminal-data_quer…/trainer_state.json
ModelHub XC 0267e54605 初始化项目,由ModelHub XC社区提供模型
Model: laion/nemotron-terminal-data_querying__Qwen3-8B
Source: Original Platform
2026-04-23 16:38:11 +08:00

1584 lines
44 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.050335570469798654,
"grad_norm": 14.067043242829744,
"learning_rate": 2.285714285714286e-06,
"loss": 0.9272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26700228452682495,
"step": 5,
"valid_targets_mean": 6933.7,
"valid_targets_min": 2457
},
{
"epoch": 0.10067114093959731,
"grad_norm": 6.914800443529387,
"learning_rate": 5.142857142857142e-06,
"loss": 0.8868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29330819845199585,
"step": 10,
"valid_targets_mean": 8950.2,
"valid_targets_min": 3280
},
{
"epoch": 0.15100671140939598,
"grad_norm": 2.313179660547222,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2518497109413147,
"step": 15,
"valid_targets_mean": 8844.1,
"valid_targets_min": 2299
},
{
"epoch": 0.20134228187919462,
"grad_norm": 1.5075390661307202,
"learning_rate": 1.0857142857142858e-05,
"loss": 0.7382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22578322887420654,
"step": 20,
"valid_targets_mean": 8364.3,
"valid_targets_min": 3708
},
{
"epoch": 0.2516778523489933,
"grad_norm": 1.1501359371585838,
"learning_rate": 1.3714285714285716e-05,
"loss": 0.7031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2374565750360489,
"step": 25,
"valid_targets_mean": 9124.7,
"valid_targets_min": 3015
},
{
"epoch": 0.30201342281879195,
"grad_norm": 0.8105710331742373,
"learning_rate": 1.6571428571428574e-05,
"loss": 0.6708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2113051414489746,
"step": 30,
"valid_targets_mean": 8698.6,
"valid_targets_min": 1259
},
{
"epoch": 0.3523489932885906,
"grad_norm": 0.6231544559752229,
"learning_rate": 1.942857142857143e-05,
"loss": 0.6344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19961217045783997,
"step": 35,
"valid_targets_mean": 8497.4,
"valid_targets_min": 3348
},
{
"epoch": 0.40268456375838924,
"grad_norm": 0.4897983592186633,
"learning_rate": 2.2285714285714287e-05,
"loss": 0.6074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20259523391723633,
"step": 40,
"valid_targets_mean": 8546.9,
"valid_targets_min": 2063
},
{
"epoch": 0.45302013422818793,
"grad_norm": 0.43138833987068276,
"learning_rate": 2.5142857142857143e-05,
"loss": 0.5747,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2051178216934204,
"step": 45,
"valid_targets_mean": 8575.3,
"valid_targets_min": 2840
},
{
"epoch": 0.5033557046979866,
"grad_norm": 0.3841996600572438,
"learning_rate": 2.8e-05,
"loss": 0.5489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19739781320095062,
"step": 50,
"valid_targets_mean": 8994.1,
"valid_targets_min": 3748
},
{
"epoch": 0.5536912751677853,
"grad_norm": 0.3384894884883755,
"learning_rate": 3.085714285714286e-05,
"loss": 0.5397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17745928466320038,
"step": 55,
"valid_targets_mean": 8368.2,
"valid_targets_min": 4038
},
{
"epoch": 0.6040268456375839,
"grad_norm": 0.30063971006378454,
"learning_rate": 3.3714285714285716e-05,
"loss": 0.5154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17042505741119385,
"step": 60,
"valid_targets_mean": 8646.4,
"valid_targets_min": 3884
},
{
"epoch": 0.6543624161073825,
"grad_norm": 0.27761325302634815,
"learning_rate": 3.6571428571428576e-05,
"loss": 0.5001,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1616188883781433,
"step": 65,
"valid_targets_mean": 8152.5,
"valid_targets_min": 2735
},
{
"epoch": 0.7046979865771812,
"grad_norm": 0.26215808993297834,
"learning_rate": 3.9428571428571435e-05,
"loss": 0.4901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14332228899002075,
"step": 70,
"valid_targets_mean": 7592.4,
"valid_targets_min": 2951
},
{
"epoch": 0.7550335570469798,
"grad_norm": 0.28480265821738415,
"learning_rate": 3.9996021455410475e-05,
"loss": 0.4879,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1745082437992096,
"step": 75,
"valid_targets_mean": 8807.3,
"valid_targets_min": 4207
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.28063407810285146,
"learning_rate": 3.9979861330826295e-05,
"loss": 0.4644,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14077091217041016,
"step": 80,
"valid_targets_mean": 7697.2,
"valid_targets_min": 3204
},
{
"epoch": 0.8557046979865772,
"grad_norm": 0.2685419255892034,
"learning_rate": 3.9951281005196486e-05,
"loss": 0.4603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15987437963485718,
"step": 85,
"valid_targets_mean": 9192.3,
"valid_targets_min": 3074
},
{
"epoch": 0.9060402684563759,
"grad_norm": 0.25518469304078806,
"learning_rate": 3.99102982450803e-05,
"loss": 0.4573,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16007889807224274,
"step": 90,
"valid_targets_mean": 9071.2,
"valid_targets_min": 4077
},
{
"epoch": 0.9563758389261745,
"grad_norm": 0.292125810481402,
"learning_rate": 3.985693852683675e-05,
"loss": 0.4411,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15639987587928772,
"step": 95,
"valid_targets_mean": 9211.0,
"valid_targets_min": 3703
},
{
"epoch": 1.0,
"grad_norm": 0.3544831597811695,
"learning_rate": 3.9791235020787546e-05,
"loss": 0.4358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41546738147735596,
"step": 100,
"valid_targets_mean": 8125.5,
"valid_targets_min": 3221
},
{
"epoch": 1.0503355704697988,
"grad_norm": 0.3048951044180603,
"learning_rate": 3.971322857059726e-05,
"loss": 0.4463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13855554163455963,
"step": 105,
"valid_targets_mean": 8141.4,
"valid_targets_min": 3502
},
{
"epoch": 1.1006711409395973,
"grad_norm": 0.27845721026650505,
"learning_rate": 3.962296766788345e-05,
"loss": 0.4345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14278040826320648,
"step": 110,
"valid_targets_mean": 8561.2,
"valid_targets_min": 3965
},
{
"epoch": 1.151006711409396,
"grad_norm": 0.3061482398268534,
"learning_rate": 3.952050842207249e-05,
"loss": 0.4304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14196383953094482,
"step": 115,
"valid_targets_mean": 8437.8,
"valid_targets_min": 2414
},
{
"epoch": 1.2013422818791946,
"grad_norm": 0.2626030786077286,
"learning_rate": 3.940591452551993e-05,
"loss": 0.4331,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13812202215194702,
"step": 120,
"valid_targets_mean": 8286.2,
"valid_targets_min": 2490
},
{
"epoch": 1.2516778523489933,
"grad_norm": 0.26798574842262085,
"learning_rate": 3.927925721391707e-05,
"loss": 0.4195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13096113502979279,
"step": 125,
"valid_targets_mean": 8221.9,
"valid_targets_min": 3465
},
{
"epoch": 1.302013422818792,
"grad_norm": 0.305138300632288,
"learning_rate": 3.914061522200825e-05,
"loss": 0.4256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11967723816633224,
"step": 130,
"valid_targets_mean": 7786.5,
"valid_targets_min": 2421
},
{
"epoch": 1.3523489932885906,
"grad_norm": 0.2862842034727045,
"learning_rate": 3.899007473464653e-05,
"loss": 0.4201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1479918360710144,
"step": 135,
"valid_targets_mean": 9349.9,
"valid_targets_min": 2832
},
{
"epoch": 1.4026845637583891,
"grad_norm": 0.2637388574350784,
"learning_rate": 3.882772933321807e-05,
"loss": 0.4214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14767968654632568,
"step": 140,
"valid_targets_mean": 9506.8,
"valid_targets_min": 4115
},
{
"epoch": 1.4530201342281879,
"grad_norm": 0.3015172646055234,
"learning_rate": 3.8653679937468556e-05,
"loss": 0.4147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13590680062770844,
"step": 145,
"valid_targets_mean": 8029.0,
"valid_targets_min": 2691
},
{
"epoch": 1.5033557046979866,
"grad_norm": 0.2570832110359043,
"learning_rate": 3.846803474276789e-05,
"loss": 0.4086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14070050418376923,
"step": 150,
"valid_targets_mean": 9124.1,
"valid_targets_min": 3411
},
{
"epoch": 1.5536912751677852,
"grad_norm": 0.2798040054064293,
"learning_rate": 3.827090915285202e-05,
"loss": 0.4146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1273345947265625,
"step": 155,
"valid_targets_mean": 7573.3,
"valid_targets_min": 2830
},
{
"epoch": 1.604026845637584,
"grad_norm": 0.2685619362726408,
"learning_rate": 3.806242570808384e-05,
"loss": 0.4118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12897896766662598,
"step": 160,
"valid_targets_mean": 8312.7,
"valid_targets_min": 2632
},
{
"epoch": 1.6543624161073827,
"grad_norm": 0.2657429865288553,
"learning_rate": 3.7842714009277675e-05,
"loss": 0.4131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12389227747917175,
"step": 165,
"valid_targets_mean": 8151.8,
"valid_targets_min": 2946
},
{
"epoch": 1.7046979865771812,
"grad_norm": 0.28840825658933456,
"learning_rate": 3.761191063713476e-05,
"loss": 0.4071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.136656254529953,
"step": 170,
"valid_targets_mean": 8608.5,
"valid_targets_min": 4102
},
{
"epoch": 1.7550335570469797,
"grad_norm": 0.2596821078603819,
"learning_rate": 3.737015906733978e-05,
"loss": 0.4106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11933685839176178,
"step": 175,
"valid_targets_mean": 8410.7,
"valid_targets_min": 3206
},
{
"epoch": 1.8053691275167785,
"grad_norm": 0.26139104771576044,
"learning_rate": 3.711760958137118e-05,
"loss": 0.4056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1329861581325531,
"step": 180,
"valid_targets_mean": 8316.8,
"valid_targets_min": 2862
},
{
"epoch": 1.8557046979865772,
"grad_norm": 0.305203527481438,
"learning_rate": 3.6854419173080784e-05,
"loss": 0.4115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15162314474582672,
"step": 185,
"valid_targets_mean": 8494.5,
"valid_targets_min": 3703
},
{
"epoch": 1.9060402684563758,
"grad_norm": 0.2924940281192017,
"learning_rate": 3.658075145110083e-05,
"loss": 0.4071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12577557563781738,
"step": 190,
"valid_targets_mean": 8278.9,
"valid_targets_min": 3343
},
{
"epoch": 1.9563758389261745,
"grad_norm": 0.26987694233570014,
"learning_rate": 3.6296776537138905e-05,
"loss": 0.4078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.140755757689476,
"step": 195,
"valid_targets_mean": 8258.5,
"valid_targets_min": 2520
},
{
"epoch": 2.0,
"grad_norm": 0.37739893150325504,
"learning_rate": 3.600267096022413e-05,
"loss": 0.4128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4272458553314209,
"step": 200,
"valid_targets_mean": 9129.7,
"valid_targets_min": 2470
},
{
"epoch": 2.0503355704697985,
"grad_norm": 0.3049953483433764,
"learning_rate": 3.569861754697045e-05,
"loss": 0.3993,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11460161954164505,
"step": 205,
"valid_targets_mean": 7970.5,
"valid_targets_min": 3494
},
{
"epoch": 2.1006711409395975,
"grad_norm": 0.2999304719531947,
"learning_rate": 3.538480530792498e-05,
"loss": 0.3982,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12947125732898712,
"step": 210,
"valid_targets_mean": 8025.0,
"valid_targets_min": 2579
},
{
"epoch": 2.151006711409396,
"grad_norm": 0.3353112866997524,
"learning_rate": 3.5061429320072225e-05,
"loss": 0.3894,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12318155169487,
"step": 215,
"valid_targets_mean": 8213.8,
"valid_targets_min": 3285
},
{
"epoch": 2.2013422818791946,
"grad_norm": 0.30258943318520176,
"learning_rate": 3.472869060556724e-05,
"loss": 0.3953,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13107575476169586,
"step": 220,
"valid_targets_mean": 9016.2,
"valid_targets_min": 3927
},
{
"epoch": 2.251677852348993,
"grad_norm": 0.2766581007748473,
"learning_rate": 3.438679600677303e-05,
"loss": 0.3862,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12715698778629303,
"step": 225,
"valid_targets_mean": 8472.8,
"valid_targets_min": 3374
},
{
"epoch": 2.302013422818792,
"grad_norm": 0.27363285439160917,
"learning_rate": 3.4035958057679836e-05,
"loss": 0.3993,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13462711870670319,
"step": 230,
"valid_targets_mean": 8907.7,
"valid_targets_min": 3054
},
{
"epoch": 2.3523489932885906,
"grad_norm": 0.3031249283095144,
"learning_rate": 3.36763948517864e-05,
"loss": 0.3919,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1310027539730072,
"step": 235,
"valid_targets_mean": 8485.2,
"valid_targets_min": 2790
},
{
"epoch": 2.402684563758389,
"grad_norm": 0.29023372434273265,
"learning_rate": 3.330832990652523e-05,
"loss": 0.3958,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14061188697814941,
"step": 240,
"valid_targets_mean": 8742.7,
"valid_targets_min": 2578
},
{
"epoch": 2.453020134228188,
"grad_norm": 0.23540024236787796,
"learning_rate": 3.293199202431599e-05,
"loss": 0.3947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12830784916877747,
"step": 245,
"valid_targets_mean": 8962.8,
"valid_targets_min": 2643
},
{
"epoch": 2.5033557046979866,
"grad_norm": 0.25906215046606,
"learning_rate": 3.2547615150333855e-05,
"loss": 0.3863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12897969782352448,
"step": 250,
"valid_targets_mean": 7864.5,
"valid_targets_min": 3231
},
{
"epoch": 2.553691275167785,
"grad_norm": 0.2641585608461065,
"learning_rate": 3.2155438227080607e-05,
"loss": 0.3934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12830249965190887,
"step": 255,
"valid_targets_mean": 9147.8,
"valid_targets_min": 2996
},
{
"epoch": 2.604026845637584,
"grad_norm": 0.28192077058039483,
"learning_rate": 3.1755705045849465e-05,
"loss": 0.3912,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14721933007240295,
"step": 260,
"valid_targets_mean": 8236.8,
"valid_targets_min": 2500
},
{
"epoch": 2.6543624161073827,
"grad_norm": 0.2630346553683364,
"learning_rate": 3.134866409517564e-05,
"loss": 0.3956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1302284449338913,
"step": 265,
"valid_targets_mean": 8351.7,
"valid_targets_min": 2983
},
{
"epoch": 2.704697986577181,
"grad_norm": 0.27865692462524355,
"learning_rate": 3.0934568406366875e-05,
"loss": 0.3917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12824495136737823,
"step": 270,
"valid_targets_mean": 7805.7,
"valid_targets_min": 3209
},
{
"epoch": 2.7550335570469797,
"grad_norm": 0.2577331217894111,
"learning_rate": 3.0513675396210094e-05,
"loss": 0.3851,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13080325722694397,
"step": 275,
"valid_targets_mean": 8240.6,
"valid_targets_min": 3069
},
{
"epoch": 2.8053691275167782,
"grad_norm": 0.2635426750558006,
"learning_rate": 3.0086246706951888e-05,
"loss": 0.3859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12472230941057205,
"step": 280,
"valid_targets_mean": 8878.5,
"valid_targets_min": 2369
},
{
"epoch": 2.8557046979865772,
"grad_norm": 0.2789257907218796,
"learning_rate": 2.965254804365222e-05,
"loss": 0.3828,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1399698108434677,
"step": 285,
"valid_targets_mean": 8741.5,
"valid_targets_min": 4390
},
{
"epoch": 2.9060402684563758,
"grad_norm": 0.28293835669184936,
"learning_rate": 2.921284900901265e-05,
"loss": 0.3876,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12492035329341888,
"step": 290,
"valid_targets_mean": 7754.4,
"valid_targets_min": 1560
},
{
"epoch": 2.9563758389261743,
"grad_norm": 0.2481634552274392,
"learning_rate": 2.876742293578155e-05,
"loss": 0.3867,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12382625788450241,
"step": 295,
"valid_targets_mean": 8065.1,
"valid_targets_min": 2619
},
{
"epoch": 3.0,
"grad_norm": 0.38393308540389254,
"learning_rate": 2.831654671684066e-05,
"loss": 0.3839,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3663029372692108,
"step": 300,
"valid_targets_mean": 8224.7,
"valid_targets_min": 4535
},
{
"epoch": 3.0503355704697985,
"grad_norm": 0.30636912119481896,
"learning_rate": 2.7860500633078475e-05,
"loss": 0.3796,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13847512006759644,
"step": 305,
"valid_targets_mean": 9187.4,
"valid_targets_min": 1685
},
{
"epoch": 3.1006711409395975,
"grad_norm": 0.2600938627349429,
"learning_rate": 2.7399568179157582e-05,
"loss": 0.3766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1181330680847168,
"step": 310,
"valid_targets_mean": 7997.2,
"valid_targets_min": 3698
},
{
"epoch": 3.151006711409396,
"grad_norm": 0.2584179619503242,
"learning_rate": 2.693403588728415e-05,
"loss": 0.3801,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12773603200912476,
"step": 315,
"valid_targets_mean": 8189.1,
"valid_targets_min": 3971
},
{
"epoch": 3.2013422818791946,
"grad_norm": 0.2774796872269688,
"learning_rate": 2.6464193149089204e-05,
"loss": 0.3812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10868334770202637,
"step": 320,
"valid_targets_mean": 7379.7,
"valid_targets_min": 3032
},
{
"epoch": 3.251677852348993,
"grad_norm": 0.3133065386988344,
"learning_rate": 2.5990332035732388e-05,
"loss": 0.3745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11322431266307831,
"step": 325,
"valid_targets_mean": 7262.5,
"valid_targets_min": 3063
},
{
"epoch": 3.302013422818792,
"grad_norm": 0.2758530310070607,
"learning_rate": 2.5512747116339985e-05,
"loss": 0.3774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12534014880657196,
"step": 330,
"valid_targets_mean": 8078.2,
"valid_targets_min": 2959
},
{
"epoch": 3.3523489932885906,
"grad_norm": 0.2609966069352176,
"learning_rate": 2.5031735274890176e-05,
"loss": 0.3814,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12462468445301056,
"step": 335,
"valid_targets_mean": 8127.2,
"valid_targets_min": 3820
},
{
"epoch": 3.402684563758389,
"grad_norm": 0.254083690107489,
"learning_rate": 2.454759552565923e-05,
"loss": 0.376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11217983812093735,
"step": 340,
"valid_targets_mean": 7862.0,
"valid_targets_min": 2752
},
{
"epoch": 3.453020134228188,
"grad_norm": 0.3236658923875938,
"learning_rate": 2.4060628827343525e-05,
"loss": 0.3756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13022278249263763,
"step": 345,
"valid_targets_mean": 8530.5,
"valid_targets_min": 3280
},
{
"epoch": 3.5033557046979866,
"grad_norm": 0.239075753126642,
"learning_rate": 2.3571137895972735e-05,
"loss": 0.3786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13119235634803772,
"step": 350,
"valid_targets_mean": 9104.2,
"valid_targets_min": 3026
},
{
"epoch": 3.553691275167785,
"grad_norm": 0.2376608883417584,
"learning_rate": 2.307942701673067e-05,
"loss": 0.3791,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11539403349161148,
"step": 355,
"valid_targets_mean": 7391.9,
"valid_targets_min": 3058
},
{
"epoch": 3.604026845637584,
"grad_norm": 0.23963182948426243,
"learning_rate": 2.258580185480067e-05,
"loss": 0.3864,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12229941040277481,
"step": 360,
"valid_targets_mean": 8436.8,
"valid_targets_min": 4365
},
{
"epoch": 3.6543624161073827,
"grad_norm": 0.25828526074715563,
"learning_rate": 2.209056926535307e-05,
"loss": 0.3783,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12994864583015442,
"step": 365,
"valid_targets_mean": 9589.9,
"valid_targets_min": 2527
},
{
"epoch": 3.704697986577181,
"grad_norm": 0.2600007439024421,
"learning_rate": 2.1594037102793054e-05,
"loss": 0.3763,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13144755363464355,
"step": 370,
"valid_targets_mean": 8440.9,
"valid_targets_min": 3689
},
{
"epoch": 3.7550335570469797,
"grad_norm": 0.2826423814550457,
"learning_rate": 2.1096514029387204e-05,
"loss": 0.3747,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11268685758113861,
"step": 375,
"valid_targets_mean": 8373.9,
"valid_targets_min": 2523
},
{
"epoch": 3.8053691275167782,
"grad_norm": 0.23106532956293616,
"learning_rate": 2.0598309323387974e-05,
"loss": 0.3776,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11632819473743439,
"step": 380,
"valid_targets_mean": 7967.5,
"valid_targets_min": 2448
},
{
"epoch": 3.8557046979865772,
"grad_norm": 0.2793229698264241,
"learning_rate": 2.0099732686775165e-05,
"loss": 0.3745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12431603670120239,
"step": 385,
"valid_targets_mean": 8111.6,
"valid_targets_min": 3320
},
{
"epoch": 3.9060402684563758,
"grad_norm": 0.25004553466508994,
"learning_rate": 1.9601094052734043e-05,
"loss": 0.3767,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11581622064113617,
"step": 390,
"valid_targets_mean": 7781.6,
"valid_targets_min": 2793
},
{
"epoch": 3.9563758389261743,
"grad_norm": 0.23606564352865256,
"learning_rate": 1.910270339298971e-05,
"loss": 0.374,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12360651046037674,
"step": 395,
"valid_targets_mean": 9079.7,
"valid_targets_min": 2256
},
{
"epoch": 4.0,
"grad_norm": 0.35482595437396325,
"learning_rate": 1.8604870525117496e-05,
"loss": 0.3722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34915870428085327,
"step": 400,
"valid_targets_mean": 8397.6,
"valid_targets_min": 2217
},
{
"epoch": 4.050335570469799,
"grad_norm": 0.2342122173238615,
"learning_rate": 1.810790491994926e-05,
"loss": 0.3755,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12274056673049927,
"step": 405,
"valid_targets_mean": 7699.5,
"valid_targets_min": 3326
},
{
"epoch": 4.100671140939597,
"grad_norm": 0.23963948392191442,
"learning_rate": 1.7612115509195118e-05,
"loss": 0.3774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1295328140258789,
"step": 410,
"valid_targets_mean": 8832.6,
"valid_targets_min": 3074
},
{
"epoch": 4.151006711409396,
"grad_norm": 0.243907924996217,
"learning_rate": 1.7117810493400403e-05,
"loss": 0.3698,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13174119591712952,
"step": 415,
"valid_targets_mean": 8698.4,
"valid_targets_min": 2273
},
{
"epoch": 4.201342281879195,
"grad_norm": 0.26480175820703833,
"learning_rate": 1.6625297150357103e-05,
"loss": 0.3661,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1256350874900818,
"step": 420,
"valid_targets_mean": 8510.8,
"valid_targets_min": 2007
},
{
"epoch": 4.251677852348993,
"grad_norm": 0.23403758629417248,
"learning_rate": 1.613488164408894e-05,
"loss": 0.3726,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13531510531902313,
"step": 425,
"valid_targets_mean": 9202.9,
"valid_targets_min": 4057
},
{
"epoch": 4.302013422818792,
"grad_norm": 0.24821740250948118,
"learning_rate": 1.5646868834528756e-05,
"loss": 0.3754,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12739360332489014,
"step": 430,
"valid_targets_mean": 8427.7,
"valid_targets_min": 2689
},
{
"epoch": 4.35234899328859,
"grad_norm": 0.26401888784599165,
"learning_rate": 1.5161562088006649e-05,
"loss": 0.3683,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12882062792778015,
"step": 435,
"valid_targets_mean": 8841.7,
"valid_targets_min": 3116
},
{
"epoch": 4.402684563758389,
"grad_norm": 0.24650023655502046,
"learning_rate": 1.46792630886665e-05,
"loss": 0.3727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11792377382516861,
"step": 440,
"valid_targets_mean": 8282.2,
"valid_targets_min": 3285
},
{
"epoch": 4.453020134228188,
"grad_norm": 0.21989371193578802,
"learning_rate": 1.4200271650928277e-05,
"loss": 0.3691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12472942471504211,
"step": 445,
"valid_targets_mean": 8738.6,
"valid_targets_min": 3032
},
{
"epoch": 4.503355704697986,
"grad_norm": 0.25636623550713955,
"learning_rate": 1.3724885533112595e-05,
"loss": 0.3713,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12675310671329498,
"step": 450,
"valid_targets_mean": 9018.7,
"valid_targets_min": 3726
},
{
"epoch": 4.553691275167785,
"grad_norm": 0.21737284709139357,
"learning_rate": 1.3253400252343403e-05,
"loss": 0.3666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13098780810832977,
"step": 455,
"valid_targets_mean": 8928.5,
"valid_targets_min": 1627
},
{
"epoch": 4.604026845637584,
"grad_norm": 0.28414631511633537,
"learning_rate": 1.2786108900843927e-05,
"loss": 0.3675,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1366838663816452,
"step": 460,
"valid_targets_mean": 8725.3,
"valid_targets_min": 2298
},
{
"epoch": 4.654362416107382,
"grad_norm": 0.23351941571371157,
"learning_rate": 1.2323301963739995e-05,
"loss": 0.3664,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10459558665752411,
"step": 465,
"valid_targets_mean": 7133.8,
"valid_targets_min": 2295
},
{
"epoch": 4.704697986577181,
"grad_norm": 0.24312028409352227,
"learning_rate": 1.1865267138484e-05,
"loss": 0.367,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12919902801513672,
"step": 470,
"valid_targets_mean": 8653.2,
"valid_targets_min": 3442
},
{
"epoch": 4.75503355704698,
"grad_norm": 0.2413133969520935,
"learning_rate": 1.1412289156011816e-05,
"loss": 0.3686,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12121167778968811,
"step": 475,
"valid_targets_mean": 9176.2,
"valid_targets_min": 3055
},
{
"epoch": 4.805369127516778,
"grad_norm": 0.24338773318065152,
"learning_rate": 1.0964649603743837e-05,
"loss": 0.3681,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1103016585111618,
"step": 480,
"valid_targets_mean": 7677.8,
"valid_targets_min": 913
},
{
"epoch": 4.855704697986577,
"grad_norm": 0.2090170817247366,
"learning_rate": 1.0522626750540029e-05,
"loss": 0.3656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11717471480369568,
"step": 485,
"valid_targets_mean": 9180.3,
"valid_targets_min": 3708
},
{
"epoch": 4.906040268456376,
"grad_norm": 0.22269853809647422,
"learning_rate": 1.0086495373718048e-05,
"loss": 0.3677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13055413961410522,
"step": 490,
"valid_targets_mean": 8114.8,
"valid_targets_min": 3226
},
{
"epoch": 4.956375838926174,
"grad_norm": 0.22882550873144628,
"learning_rate": 9.656526588241745e-06,
"loss": 0.3679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11334412544965744,
"step": 495,
"valid_targets_mean": 8118.7,
"valid_targets_min": 2793
},
{
"epoch": 5.0,
"grad_norm": 0.3733228007434408,
"learning_rate": 9.232987678186357e-06,
"loss": 0.3708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3762795329093933,
"step": 500,
"valid_targets_mean": 7505.5,
"valid_targets_min": 3065
},
{
"epoch": 5.050335570469799,
"grad_norm": 0.23829621547620775,
"learning_rate": 8.816141930585067e-06,
"loss": 0.3629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1350947916507721,
"step": 505,
"valid_targets_mean": 9287.1,
"valid_targets_min": 2062
},
{
"epoch": 5.100671140939597,
"grad_norm": 0.22395514111013695,
"learning_rate": 8.406248471760357e-06,
"loss": 0.3678,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12289828807115555,
"step": 510,
"valid_targets_mean": 8912.8,
"valid_targets_min": 3025
},
{
"epoch": 5.151006711409396,
"grad_norm": 0.2122241497882817,
"learning_rate": 8.003562106241727e-06,
"loss": 0.3623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12498777359724045,
"step": 515,
"valid_targets_mean": 8863.8,
"valid_targets_min": 2840
},
{
"epoch": 5.201342281879195,
"grad_norm": 0.28891708884786693,
"learning_rate": 7.608333158370036e-06,
"loss": 0.3611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11199884861707687,
"step": 520,
"valid_targets_mean": 7870.0,
"valid_targets_min": 2620
},
{
"epoch": 5.251677852348993,
"grad_norm": 0.23394607717295618,
"learning_rate": 7.220807316686886e-06,
"loss": 0.3691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12199863791465759,
"step": 525,
"valid_targets_mean": 8564.5,
"valid_targets_min": 4298
},
{
"epoch": 5.302013422818792,
"grad_norm": 0.21294918655504008,
"learning_rate": 6.841225481205749e-06,
"loss": 0.3635,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12100854516029358,
"step": 530,
"valid_targets_mean": 9039.7,
"valid_targets_min": 1560
},
{
"epoch": 5.35234899328859,
"grad_norm": 0.21494480970028404,
"learning_rate": 6.469823613659896e-06,
"loss": 0.3651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11410963535308838,
"step": 535,
"valid_targets_mean": 8951.2,
"valid_targets_min": 3544
},
{
"epoch": 5.402684563758389,
"grad_norm": 0.21580063331767288,
"learning_rate": 6.106832590820053e-06,
"loss": 0.365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11225862056016922,
"step": 540,
"valid_targets_mean": 7322.1,
"valid_targets_min": 3287
},
{
"epoch": 5.453020134228188,
"grad_norm": 0.22123701317383823,
"learning_rate": 5.752478060973108e-06,
"loss": 0.3669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12023737281560898,
"step": 545,
"valid_targets_mean": 8372.3,
"valid_targets_min": 3655
},
{
"epoch": 5.503355704697986,
"grad_norm": 0.2143304441931162,
"learning_rate": 5.406980303650984e-06,
"loss": 0.3632,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1282443404197693,
"step": 550,
"valid_targets_mean": 9076.7,
"valid_targets_min": 3224
},
{
"epoch": 5.553691275167785,
"grad_norm": 0.2059409325448744,
"learning_rate": 5.070554092696997e-06,
"loss": 0.3658,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1095615029335022,
"step": 555,
"valid_targets_mean": 7624.6,
"valid_targets_min": 3724
},
{
"epoch": 5.604026845637584,
"grad_norm": 0.20837192180926864,
"learning_rate": 4.74340856275467e-06,
"loss": 0.3649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11278112977743149,
"step": 560,
"valid_targets_mean": 8129.7,
"valid_targets_min": 2778
},
{
"epoch": 5.654362416107382,
"grad_norm": 0.21402988791875252,
"learning_rate": 4.425747079262121e-06,
"loss": 0.3674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12279454618692398,
"step": 565,
"valid_targets_mean": 8821.7,
"valid_targets_min": 2820
},
{
"epoch": 5.704697986577181,
"grad_norm": 0.2074510845148661,
"learning_rate": 4.11776711203278e-06,
"loss": 0.3626,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1227295845746994,
"step": 570,
"valid_targets_mean": 8974.7,
"valid_targets_min": 2500
},
{
"epoch": 5.75503355704698,
"grad_norm": 0.2306440244667225,
"learning_rate": 3.819660112501053e-06,
"loss": 0.3604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11513447761535645,
"step": 575,
"valid_targets_mean": 7983.1,
"valid_targets_min": 3126
},
{
"epoch": 5.805369127516778,
"grad_norm": 0.2172176590730982,
"learning_rate": 3.531611394709216e-06,
"loss": 0.3684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12855035066604614,
"step": 580,
"valid_targets_mean": 8633.9,
"valid_targets_min": 2741
},
{
"epoch": 5.855704697986577,
"grad_norm": 0.21562611800655393,
"learning_rate": 3.2538000201095363e-06,
"loss": 0.3663,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12427804619073868,
"step": 585,
"valid_targets_mean": 9145.5,
"valid_targets_min": 3920
},
{
"epoch": 5.906040268456376,
"grad_norm": 0.2190237582438193,
"learning_rate": 2.986398686253211e-06,
"loss": 0.3701,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13051660358905792,
"step": 590,
"valid_targets_mean": 8620.1,
"valid_targets_min": 3357
},
{
"epoch": 5.956375838926174,
"grad_norm": 0.2080744597422396,
"learning_rate": 2.729573619435384e-06,
"loss": 0.363,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1293225884437561,
"step": 595,
"valid_targets_mean": 8719.9,
"valid_targets_min": 3853
},
{
"epoch": 6.0,
"grad_norm": 0.3674873001472298,
"learning_rate": 2.483484471362869e-06,
"loss": 0.362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3444654941558838,
"step": 600,
"valid_targets_mean": 7666.7,
"valid_targets_min": 2472
},
{
"epoch": 6.050335570469799,
"grad_norm": 0.21298796074381432,
"learning_rate": 2.248284219908918e-06,
"loss": 0.3644,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09744159877300262,
"step": 605,
"valid_targets_mean": 7061.6,
"valid_targets_min": 913
},
{
"epoch": 6.100671140939597,
"grad_norm": 0.20603793728596564,
"learning_rate": 2.024119074016664e-06,
"loss": 0.3661,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12634268403053284,
"step": 610,
"valid_targets_mean": 8669.8,
"valid_targets_min": 4100
},
{
"epoch": 6.151006711409396,
"grad_norm": 0.19281788398410155,
"learning_rate": 1.8111283828103566e-06,
"loss": 0.3666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12144550681114197,
"step": 615,
"valid_targets_mean": 8999.8,
"valid_targets_min": 2519
},
{
"epoch": 6.201342281879195,
"grad_norm": 0.22141968540557966,
"learning_rate": 1.6094445489709886e-06,
"loss": 0.3646,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12076833099126816,
"step": 620,
"valid_targets_mean": 9312.1,
"valid_targets_min": 2778
},
{
"epoch": 6.251677852348993,
"grad_norm": 0.1984445594470365,
"learning_rate": 1.4191929464299481e-06,
"loss": 0.3607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12098690867424011,
"step": 625,
"valid_targets_mean": 9203.8,
"valid_targets_min": 3448
},
{
"epoch": 6.302013422818792,
"grad_norm": 0.19771555014644152,
"learning_rate": 1.2404918424321277e-06,
"loss": 0.3607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10897114872932434,
"step": 630,
"valid_targets_mean": 7500.3,
"valid_targets_min": 991
},
{
"epoch": 6.35234899328859,
"grad_norm": 0.20058217433838568,
"learning_rate": 1.073452324016715e-06,
"loss": 0.3614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1343023031949997,
"step": 635,
"valid_targets_mean": 9441.8,
"valid_targets_min": 3744
},
{
"epoch": 6.402684563758389,
"grad_norm": 0.2162603197139562,
"learning_rate": 9.181782289615149e-07,
"loss": 0.3569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11951828002929688,
"step": 640,
"valid_targets_mean": 8311.2,
"valid_targets_min": 3340
},
{
"epoch": 6.453020134228188,
"grad_norm": 0.21759350387739748,
"learning_rate": 7.747660812336221e-07,
"loss": 0.3612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12520036101341248,
"step": 645,
"valid_targets_mean": 8280.2,
"valid_targets_min": 1940
},
{
"epoch": 6.503355704697986,
"grad_norm": 0.21007505903652565,
"learning_rate": 6.433050309866717e-07,
"loss": 0.3636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1255347728729248,
"step": 650,
"valid_targets_mean": 7931.5,
"valid_targets_min": 3055
},
{
"epoch": 6.553691275167785,
"grad_norm": 0.222415922560622,
"learning_rate": 5.238767991418737e-07,
"loss": 0.3608,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10943958163261414,
"step": 655,
"valid_targets_mean": 7532.7,
"valid_targets_min": 2637
},
{
"epoch": 6.604026845637584,
"grad_norm": 0.20158116982395177,
"learning_rate": 4.165556265873716e-07,
"loss": 0.3589,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12891879677772522,
"step": 660,
"valid_targets_mean": 8930.4,
"valid_targets_min": 3999
},
{
"epoch": 6.654362416107382,
"grad_norm": 0.20822765851254327,
"learning_rate": 3.214082280274067e-07,
"loss": 0.3652,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12707442045211792,
"step": 665,
"valid_targets_mean": 8714.0,
"valid_targets_min": 2643
},
{
"epoch": 6.704697986577181,
"grad_norm": 0.20171448776496836,
"learning_rate": 2.384937505100804e-07,
"loss": 0.368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12111932784318924,
"step": 670,
"valid_targets_mean": 8530.3,
"valid_targets_min": 2996
},
{
"epoch": 6.75503355704698,
"grad_norm": 0.2091122427985211,
"learning_rate": 1.6786373665939492e-07,
"loss": 0.3692,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1105751022696495,
"step": 675,
"valid_targets_mean": 7289.7,
"valid_targets_min": 2951
},
{
"epoch": 6.805369127516778,
"grad_norm": 0.19123999178770715,
"learning_rate": 1.0956209263453421e-07,
"loss": 0.3633,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11396750807762146,
"step": 680,
"valid_targets_mean": 8709.4,
"valid_targets_min": 481
},
{
"epoch": 6.855704697986577,
"grad_norm": 0.20611680687469927,
"learning_rate": 6.362506083618103e-08,
"loss": 0.3628,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12498224526643753,
"step": 685,
"valid_targets_mean": 8953.2,
"valid_targets_min": 3802
},
{
"epoch": 6.906040268456376,
"grad_norm": 0.1966589947991026,
"learning_rate": 3.0081197376965465e-08,
"loss": 0.3574,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12874117493629456,
"step": 690,
"valid_targets_mean": 8914.9,
"valid_targets_min": 3898
},
{
"epoch": 6.956375838926174,
"grad_norm": 0.212158333151148,
"learning_rate": 8.951354329933548e-09,
"loss": 0.3671,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12312234938144684,
"step": 695,
"valid_targets_mean": 8675.7,
"valid_targets_min": 2308
},
{
"epoch": 7.0,
"grad_norm": 0.34474378658754257,
"learning_rate": 2.486667661627529e-10,
"loss": 0.3647,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3694714307785034,
"step": 700,
"valid_targets_mean": 8521.5,
"valid_targets_min": 2998
},
{
"epoch": 7.0,
"step": 700,
"total_flos": 2.638568480974045e+18,
"train_loss": 0.0,
"train_runtime": 0.8295,
"train_samples_per_second": 80459.329,
"train_steps_per_second": 843.831
}
],
"logging_steps": 5,
"max_steps": 700,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.638568480974045e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}