1862 lines
45 KiB
JSON
1862 lines
45 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9998683627921449,
|
|
"eval_steps": 500,
|
|
"global_step": 2611,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0038294460466953076,
|
|
"grad_norm": 5.426074325401742,
|
|
"learning_rate": 5.69620253164557e-07,
|
|
"loss": 0.596,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.007658892093390615,
|
|
"grad_norm": 1.7300049140411797,
|
|
"learning_rate": 1.2025316455696204e-06,
|
|
"loss": 0.4935,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.011488338140085923,
|
|
"grad_norm": 1.0566614384820672,
|
|
"learning_rate": 1.8354430379746838e-06,
|
|
"loss": 0.4179,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.01531778418678123,
|
|
"grad_norm": 1.1239284474445808,
|
|
"learning_rate": 2.4683544303797473e-06,
|
|
"loss": 0.3786,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.019147230233476538,
|
|
"grad_norm": 0.8236458977275282,
|
|
"learning_rate": 3.10126582278481e-06,
|
|
"loss": 0.3645,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.022976676280171845,
|
|
"grad_norm": 0.939010895376297,
|
|
"learning_rate": 3.7341772151898737e-06,
|
|
"loss": 0.3472,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.026806122326867153,
|
|
"grad_norm": 0.7558771041067948,
|
|
"learning_rate": 4.367088607594937e-06,
|
|
"loss": 0.3234,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.03063556837356246,
|
|
"grad_norm": 0.8507599742006083,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3273,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.03446501442025777,
|
|
"grad_norm": 0.9322867483149297,
|
|
"learning_rate": 4.999807568225742e-06,
|
|
"loss": 0.3269,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.038294460466953076,
|
|
"grad_norm": 0.8060180679646447,
|
|
"learning_rate": 4.999230302526956e-06,
|
|
"loss": 0.338,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.04212390651364838,
|
|
"grad_norm": 0.8178741447804784,
|
|
"learning_rate": 4.998268291771053e-06,
|
|
"loss": 0.3232,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.04595335256034369,
|
|
"grad_norm": 0.8264908445738317,
|
|
"learning_rate": 4.9969216840551815e-06,
|
|
"loss": 0.3239,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.049782798607039,
|
|
"grad_norm": 0.8185737508740193,
|
|
"learning_rate": 4.995190686683432e-06,
|
|
"loss": 0.3164,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.053612244653734306,
|
|
"grad_norm": 0.8016123974519153,
|
|
"learning_rate": 4.9930755661349215e-06,
|
|
"loss": 0.3227,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.057441690700429614,
|
|
"grad_norm": 0.8304543418674754,
|
|
"learning_rate": 4.990576648022768e-06,
|
|
"loss": 0.3136,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.06127113674712492,
|
|
"grad_norm": 0.7555621229293721,
|
|
"learning_rate": 4.98769431704397e-06,
|
|
"loss": 0.3033,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.06510058279382024,
|
|
"grad_norm": 0.8637646507692361,
|
|
"learning_rate": 4.984429016920178e-06,
|
|
"loss": 0.3231,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.06893002884051554,
|
|
"grad_norm": 0.7476252312804286,
|
|
"learning_rate": 4.980781250329389e-06,
|
|
"loss": 0.309,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.07275947488721085,
|
|
"grad_norm": 0.8589580099130042,
|
|
"learning_rate": 4.976751578828562e-06,
|
|
"loss": 0.3122,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.07658892093390615,
|
|
"grad_norm": 0.8355945240964501,
|
|
"learning_rate": 4.9723406227671645e-06,
|
|
"loss": 0.3109,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.08041836698060147,
|
|
"grad_norm": 0.7902874463038752,
|
|
"learning_rate": 4.967549061191679e-06,
|
|
"loss": 0.3118,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.08424781302729677,
|
|
"grad_norm": 0.8876153745394134,
|
|
"learning_rate": 4.962377631741061e-06,
|
|
"loss": 0.306,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.08807725907399208,
|
|
"grad_norm": 0.866679126237035,
|
|
"learning_rate": 4.956827130533185e-06,
|
|
"loss": 0.3135,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.09190670512068738,
|
|
"grad_norm": 0.8629644307536233,
|
|
"learning_rate": 4.95089841204229e-06,
|
|
"loss": 0.302,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.0957361511673827,
|
|
"grad_norm": 0.7338565663547403,
|
|
"learning_rate": 4.9445923889674285e-06,
|
|
"loss": 0.303,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.099565597214078,
|
|
"grad_norm": 0.7305791719581167,
|
|
"learning_rate": 4.937910032091968e-06,
|
|
"loss": 0.3009,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.10339504326077331,
|
|
"grad_norm": 0.6695669065750437,
|
|
"learning_rate": 4.9308523701341415e-06,
|
|
"loss": 0.305,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.10722448930746861,
|
|
"grad_norm": 0.7251397159069465,
|
|
"learning_rate": 4.923420489588677e-06,
|
|
"loss": 0.3005,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.11105393535416393,
|
|
"grad_norm": 0.7899848638296046,
|
|
"learning_rate": 4.915615534559545e-06,
|
|
"loss": 0.3036,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.11488338140085923,
|
|
"grad_norm": 0.8159662777762657,
|
|
"learning_rate": 4.907438706583818e-06,
|
|
"loss": 0.2997,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.11871282744755454,
|
|
"grad_norm": 0.7496451104541622,
|
|
"learning_rate": 4.898891264446709e-06,
|
|
"loss": 0.2984,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.12254227349424984,
|
|
"grad_norm": 0.7090938182919049,
|
|
"learning_rate": 4.889974523987784e-06,
|
|
"loss": 0.3037,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.12637171954094514,
|
|
"grad_norm": 0.7534479964128541,
|
|
"learning_rate": 4.880689857898392e-06,
|
|
"loss": 0.2907,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.13020116558764047,
|
|
"grad_norm": 0.7967215720335111,
|
|
"learning_rate": 4.871038695510347e-06,
|
|
"loss": 0.3035,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.13403061163433577,
|
|
"grad_norm": 0.8589787634591548,
|
|
"learning_rate": 4.861022522575892e-06,
|
|
"loss": 0.2917,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.13786005768103107,
|
|
"grad_norm": 0.77962150007173,
|
|
"learning_rate": 4.850642881038969e-06,
|
|
"loss": 0.3019,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.14168950372772637,
|
|
"grad_norm": 0.6913568705438217,
|
|
"learning_rate": 4.839901368797849e-06,
|
|
"loss": 0.2987,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.1455189497744217,
|
|
"grad_norm": 0.7245824514430528,
|
|
"learning_rate": 4.828799639459139e-06,
|
|
"loss": 0.2996,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.149348395821117,
|
|
"grad_norm": 0.7331355223021702,
|
|
"learning_rate": 4.817339402083217e-06,
|
|
"loss": 0.2958,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.1531778418678123,
|
|
"grad_norm": 0.684066179750134,
|
|
"learning_rate": 4.805522420921132e-06,
|
|
"loss": 0.2923,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.1570072879145076,
|
|
"grad_norm": 0.7273451737034062,
|
|
"learning_rate": 4.793350515143007e-06,
|
|
"loss": 0.2955,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.16083673396120293,
|
|
"grad_norm": 0.6922184014896344,
|
|
"learning_rate": 4.780825558557981e-06,
|
|
"loss": 0.3021,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.16466618000789823,
|
|
"grad_norm": 0.7901617539245519,
|
|
"learning_rate": 4.767949479325749e-06,
|
|
"loss": 0.3004,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.16849562605459353,
|
|
"grad_norm": 0.7269661753924206,
|
|
"learning_rate": 4.754724259659727e-06,
|
|
"loss": 0.2966,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.17232507210128883,
|
|
"grad_norm": 0.7893201123959417,
|
|
"learning_rate": 4.741151935521906e-06,
|
|
"loss": 0.2985,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.17615451814798416,
|
|
"grad_norm": 0.6946007030633581,
|
|
"learning_rate": 4.727234596309417e-06,
|
|
"loss": 0.3036,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.17998396419467946,
|
|
"grad_norm": 0.7322226198234377,
|
|
"learning_rate": 4.71297438453288e-06,
|
|
"loss": 0.3001,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.18381341024137476,
|
|
"grad_norm": 0.6823377208042828,
|
|
"learning_rate": 4.69837349548658e-06,
|
|
"loss": 0.2925,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.1876428562880701,
|
|
"grad_norm": 0.7135771256714908,
|
|
"learning_rate": 4.683434176910503e-06,
|
|
"loss": 0.2939,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.1914723023347654,
|
|
"grad_norm": 0.691020843307318,
|
|
"learning_rate": 4.668158728644315e-06,
|
|
"loss": 0.2804,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.1953017483814607,
|
|
"grad_norm": 0.6612550303473294,
|
|
"learning_rate": 4.652549502273305e-06,
|
|
"loss": 0.2922,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.199131194428156,
|
|
"grad_norm": 0.7364276724142655,
|
|
"learning_rate": 4.636608900766372e-06,
|
|
"loss": 0.2891,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.20296064047485132,
|
|
"grad_norm": 0.6721354413833053,
|
|
"learning_rate": 4.620339378106103e-06,
|
|
"loss": 0.2809,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.20679008652154662,
|
|
"grad_norm": 0.6653652380595567,
|
|
"learning_rate": 4.6037434389109855e-06,
|
|
"loss": 0.2983,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.21061953256824192,
|
|
"grad_norm": 0.7449501156668481,
|
|
"learning_rate": 4.586823638049841e-06,
|
|
"loss": 0.2903,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.21444897861493722,
|
|
"grad_norm": 0.715508013931221,
|
|
"learning_rate": 4.569582580248509e-06,
|
|
"loss": 0.2923,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.21827842466163255,
|
|
"grad_norm": 0.6641963861933619,
|
|
"learning_rate": 4.552022919688861e-06,
|
|
"loss": 0.2924,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.22210787070832785,
|
|
"grad_norm": 0.7113518461330477,
|
|
"learning_rate": 4.534147359600211e-06,
|
|
"loss": 0.2819,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.22593731675502315,
|
|
"grad_norm": 0.7089839605701392,
|
|
"learning_rate": 4.515958651843151e-06,
|
|
"loss": 0.2939,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.22976676280171845,
|
|
"grad_norm": 0.677078858734863,
|
|
"learning_rate": 4.497459596485924e-06,
|
|
"loss": 0.2835,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.23359620884841378,
|
|
"grad_norm": 0.7703623199956556,
|
|
"learning_rate": 4.478653041373371e-06,
|
|
"loss": 0.2854,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.23742565489510908,
|
|
"grad_norm": 0.6407221872558565,
|
|
"learning_rate": 4.459541881688501e-06,
|
|
"loss": 0.2872,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.24125510094180438,
|
|
"grad_norm": 0.7625245912179776,
|
|
"learning_rate": 4.440129059506808e-06,
|
|
"loss": 0.2852,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.24508454698849969,
|
|
"grad_norm": 0.6533872161632752,
|
|
"learning_rate": 4.420417563343347e-06,
|
|
"loss": 0.2883,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.248913993035195,
|
|
"grad_norm": 0.6347508565680315,
|
|
"learning_rate": 4.40041042769266e-06,
|
|
"loss": 0.2818,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.2527434390818903,
|
|
"grad_norm": 0.6956608959261769,
|
|
"learning_rate": 4.380110732561636e-06,
|
|
"loss": 0.2858,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.2565728851285856,
|
|
"grad_norm": 0.7174236888577228,
|
|
"learning_rate": 4.3595216029953575e-06,
|
|
"loss": 0.2948,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.26040233117528094,
|
|
"grad_norm": 0.6538209955447881,
|
|
"learning_rate": 4.338646208596009e-06,
|
|
"loss": 0.2901,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.2642317772219762,
|
|
"grad_norm": 0.6777945072272051,
|
|
"learning_rate": 4.317487763034936e-06,
|
|
"loss": 0.2848,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.26806122326867154,
|
|
"grad_norm": 0.6915180680337352,
|
|
"learning_rate": 4.296049523557917e-06,
|
|
"loss": 0.294,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.2718906693153669,
|
|
"grad_norm": 0.6811198761407046,
|
|
"learning_rate": 4.274334790483718e-06,
|
|
"loss": 0.2925,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.27572011536206215,
|
|
"grad_norm": 0.6682149146681646,
|
|
"learning_rate": 4.2523469066960295e-06,
|
|
"loss": 0.2832,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.2795495614087575,
|
|
"grad_norm": 0.6343382096231662,
|
|
"learning_rate": 4.230089257128842e-06,
|
|
"loss": 0.2865,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.28337900745545275,
|
|
"grad_norm": 0.7142478024296977,
|
|
"learning_rate": 4.207565268245356e-06,
|
|
"loss": 0.2852,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.2872084535021481,
|
|
"grad_norm": 0.6678411720839094,
|
|
"learning_rate": 4.184778407510484e-06,
|
|
"loss": 0.2924,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.2910378995488434,
|
|
"grad_norm": 0.7689563352484293,
|
|
"learning_rate": 4.16173218285706e-06,
|
|
"loss": 0.2901,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.2948673455955387,
|
|
"grad_norm": 0.749864633773232,
|
|
"learning_rate": 4.138430142145805e-06,
|
|
"loss": 0.2839,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.298696791642234,
|
|
"grad_norm": 0.6699595192464245,
|
|
"learning_rate": 4.114875872619147e-06,
|
|
"loss": 0.2951,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.30252623768892933,
|
|
"grad_norm": 0.691214596956005,
|
|
"learning_rate": 4.091073000348989e-06,
|
|
"loss": 0.2874,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.3063556837356246,
|
|
"grad_norm": 0.6554956219244137,
|
|
"learning_rate": 4.067025189678485e-06,
|
|
"loss": 0.286,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.31018512978231993,
|
|
"grad_norm": 0.6954151666813602,
|
|
"learning_rate": 4.042736142657936e-06,
|
|
"loss": 0.2834,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.3140145758290152,
|
|
"grad_norm": 0.7196715435903528,
|
|
"learning_rate": 4.018209598474869e-06,
|
|
"loss": 0.284,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.31784402187571054,
|
|
"grad_norm": 0.7723328668264622,
|
|
"learning_rate": 3.9934493328784185e-06,
|
|
"loss": 0.2777,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.32167346792240586,
|
|
"grad_norm": 0.6919247319317805,
|
|
"learning_rate": 3.9684591575980546e-06,
|
|
"loss": 0.2893,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.32550291396910114,
|
|
"grad_norm": 0.612288507127871,
|
|
"learning_rate": 3.943242919756792e-06,
|
|
"loss": 0.2891,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.32933236001579647,
|
|
"grad_norm": 0.7304106009933916,
|
|
"learning_rate": 3.917804501278942e-06,
|
|
"loss": 0.2838,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.3331618060624918,
|
|
"grad_norm": 0.6961425637816675,
|
|
"learning_rate": 3.892147818292505e-06,
|
|
"loss": 0.2818,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.33699125210918707,
|
|
"grad_norm": 0.6415090586497654,
|
|
"learning_rate": 3.866276820526305e-06,
|
|
"loss": 0.2826,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.3408206981558824,
|
|
"grad_norm": 0.7319097656364029,
|
|
"learning_rate": 3.840195490701943e-06,
|
|
"loss": 0.2797,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.34465014420257767,
|
|
"grad_norm": 0.6643906637801983,
|
|
"learning_rate": 3.8139078439206755e-06,
|
|
"loss": 0.2823,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.348479590249273,
|
|
"grad_norm": 0.651442028320178,
|
|
"learning_rate": 3.787417927045315e-06,
|
|
"loss": 0.2845,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.3523090362959683,
|
|
"grad_norm": 0.703399820267242,
|
|
"learning_rate": 3.760729818077224e-06,
|
|
"loss": 0.2782,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.3561384823426636,
|
|
"grad_norm": 0.6339374061657803,
|
|
"learning_rate": 3.7338476255285295e-06,
|
|
"loss": 0.2809,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.3599679283893589,
|
|
"grad_norm": 0.6650804278367294,
|
|
"learning_rate": 3.7067754877896388e-06,
|
|
"loss": 0.288,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.36379737443605425,
|
|
"grad_norm": 0.6645625939049019,
|
|
"learning_rate": 3.6795175724921506e-06,
|
|
"loss": 0.2821,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.36762682048274953,
|
|
"grad_norm": 0.6819651400048093,
|
|
"learning_rate": 3.652078075867267e-06,
|
|
"loss": 0.2759,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.37145626652944486,
|
|
"grad_norm": 0.6767426872168217,
|
|
"learning_rate": 3.624461222099804e-06,
|
|
"loss": 0.28,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.3752857125761402,
|
|
"grad_norm": 0.7310278431962209,
|
|
"learning_rate": 3.596671262677898e-06,
|
|
"loss": 0.2883,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.37911515862283546,
|
|
"grad_norm": 0.6791506792289325,
|
|
"learning_rate": 3.5687124757385084e-06,
|
|
"loss": 0.2885,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.3829446046695308,
|
|
"grad_norm": 0.7086896739862765,
|
|
"learning_rate": 3.5405891654088154e-06,
|
|
"loss": 0.2815,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.38677405071622606,
|
|
"grad_norm": 0.6280354477468107,
|
|
"learning_rate": 3.5123056611436224e-06,
|
|
"loss": 0.2807,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.3906034967629214,
|
|
"grad_norm": 0.6538192887744636,
|
|
"learning_rate": 3.4838663170588573e-06,
|
|
"loss": 0.2723,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.3944329428096167,
|
|
"grad_norm": 0.6999366929435851,
|
|
"learning_rate": 3.455275511261272e-06,
|
|
"loss": 0.2804,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.398262388856312,
|
|
"grad_norm": 0.6406720215436563,
|
|
"learning_rate": 3.4265376451744564e-06,
|
|
"loss": 0.2776,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.4020918349030073,
|
|
"grad_norm": 0.6809060571142534,
|
|
"learning_rate": 3.3976571428612583e-06,
|
|
"loss": 0.2823,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.40592128094970265,
|
|
"grad_norm": 0.6506142391402524,
|
|
"learning_rate": 3.3686384503427177e-06,
|
|
"loss": 0.2785,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.4097507269963979,
|
|
"grad_norm": 0.6623408933951855,
|
|
"learning_rate": 3.339486034913627e-06,
|
|
"loss": 0.2781,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.41358017304309325,
|
|
"grad_norm": 0.675859964827601,
|
|
"learning_rate": 3.310204384454805e-06,
|
|
"loss": 0.2776,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.4174096190897885,
|
|
"grad_norm": 0.7354171342985883,
|
|
"learning_rate": 3.280798006742213e-06,
|
|
"loss": 0.2929,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.42123906513648385,
|
|
"grad_norm": 0.6457062459707484,
|
|
"learning_rate": 3.2512714287530007e-06,
|
|
"loss": 0.2743,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.4250685111831792,
|
|
"grad_norm": 0.641329871481826,
|
|
"learning_rate": 3.2216291959686007e-06,
|
|
"loss": 0.2737,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.42889795722987445,
|
|
"grad_norm": 0.6566421664911357,
|
|
"learning_rate": 3.191875871674971e-06,
|
|
"loss": 0.2838,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.4327274032765698,
|
|
"grad_norm": 0.6027701960697498,
|
|
"learning_rate": 3.162016036260098e-06,
|
|
"loss": 0.2752,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.4365568493232651,
|
|
"grad_norm": 0.648907215118306,
|
|
"learning_rate": 3.1320542865088695e-06,
|
|
"loss": 0.2667,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.4403862953699604,
|
|
"grad_norm": 0.6077801768682932,
|
|
"learning_rate": 3.1019952348954163e-06,
|
|
"loss": 0.2747,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.4442157414166557,
|
|
"grad_norm": 0.6576708995713357,
|
|
"learning_rate": 3.071843508873046e-06,
|
|
"loss": 0.2836,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.448045187463351,
|
|
"grad_norm": 0.695646974082748,
|
|
"learning_rate": 3.0416037501618676e-06,
|
|
"loss": 0.2732,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.4518746335100463,
|
|
"grad_norm": 0.616879255204133,
|
|
"learning_rate": 3.0112806140342176e-06,
|
|
"loss": 0.2759,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.45570407955674164,
|
|
"grad_norm": 0.6008817125153927,
|
|
"learning_rate": 2.9808787685980054e-06,
|
|
"loss": 0.2769,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.4595335256034369,
|
|
"grad_norm": 0.5972779982979275,
|
|
"learning_rate": 2.9504028940780777e-06,
|
|
"loss": 0.2836,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.46336297165013224,
|
|
"grad_norm": 0.6541064777899037,
|
|
"learning_rate": 2.9198576820957188e-06,
|
|
"loss": 0.2678,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.46719241769682757,
|
|
"grad_norm": 0.7461980100750918,
|
|
"learning_rate": 2.8892478349463987e-06,
|
|
"loss": 0.279,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.47102186374352284,
|
|
"grad_norm": 0.697749095404112,
|
|
"learning_rate": 2.8585780648758745e-06,
|
|
"loss": 0.2774,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.47485130979021817,
|
|
"grad_norm": 0.6901917627087478,
|
|
"learning_rate": 2.827853093354763e-06,
|
|
"loss": 0.2731,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.47868075583691344,
|
|
"grad_norm": 0.6167921413072504,
|
|
"learning_rate": 2.79707765035169e-06,
|
|
"loss": 0.2781,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.48251020188360877,
|
|
"grad_norm": 0.6877679962661163,
|
|
"learning_rate": 2.7662564736051378e-06,
|
|
"loss": 0.2779,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.4863396479303041,
|
|
"grad_norm": 0.6529732798686552,
|
|
"learning_rate": 2.7353943078940876e-06,
|
|
"loss": 0.2755,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.49016909397699937,
|
|
"grad_norm": 0.573925656257561,
|
|
"learning_rate": 2.7044959043075815e-06,
|
|
"loss": 0.2781,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.4939985400236947,
|
|
"grad_norm": 0.7060010201146872,
|
|
"learning_rate": 2.67356601951332e-06,
|
|
"loss": 0.2885,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.49782798607039,
|
|
"grad_norm": 0.6706570115076154,
|
|
"learning_rate": 2.64260941502539e-06,
|
|
"loss": 0.2823,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.5016574321170854,
|
|
"grad_norm": 0.6816750561081921,
|
|
"learning_rate": 2.611630856471252e-06,
|
|
"loss": 0.2734,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.5054868781637806,
|
|
"grad_norm": 0.6503962781113031,
|
|
"learning_rate": 2.5806351128580963e-06,
|
|
"loss": 0.2775,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.5093163242104759,
|
|
"grad_norm": 0.6770280496930409,
|
|
"learning_rate": 2.549626955838673e-06,
|
|
"loss": 0.2805,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.5131457702571712,
|
|
"grad_norm": 0.6025144898257462,
|
|
"learning_rate": 2.5186111589767187e-06,
|
|
"loss": 0.2715,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.5169752163038666,
|
|
"grad_norm": 0.6707574673231309,
|
|
"learning_rate": 2.487592497012089e-06,
|
|
"loss": 0.2763,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.5208046623505619,
|
|
"grad_norm": 0.6063466305966893,
|
|
"learning_rate": 2.456575745125713e-06,
|
|
"loss": 0.2845,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.5246341083972571,
|
|
"grad_norm": 0.6060316435488056,
|
|
"learning_rate": 2.4255656782044644e-06,
|
|
"loss": 0.2772,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.5284635544439524,
|
|
"grad_norm": 0.6023582378786108,
|
|
"learning_rate": 2.3945670701061033e-06,
|
|
"loss": 0.267,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.5322930004906478,
|
|
"grad_norm": 0.672852399998615,
|
|
"learning_rate": 2.3635846929243536e-06,
|
|
"loss": 0.2757,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.5361224465373431,
|
|
"grad_norm": 0.671673917828738,
|
|
"learning_rate": 2.3326233162542655e-06,
|
|
"loss": 0.2772,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.5399518925840384,
|
|
"grad_norm": 0.6908191182674716,
|
|
"learning_rate": 2.3016877064579564e-06,
|
|
"loss": 0.2752,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.5437813386307337,
|
|
"grad_norm": 0.6483112345732687,
|
|
"learning_rate": 2.2707826259308493e-06,
|
|
"loss": 0.2773,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.547610784677429,
|
|
"grad_norm": 0.6527686650275873,
|
|
"learning_rate": 2.2399128323685287e-06,
|
|
"loss": 0.2711,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.5514402307241243,
|
|
"grad_norm": 0.6402708780992856,
|
|
"learning_rate": 2.2090830780343116e-06,
|
|
"loss": 0.2774,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.5552696767708196,
|
|
"grad_norm": 0.7121133583105477,
|
|
"learning_rate": 2.178298109027659e-06,
|
|
"loss": 0.2789,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.559099122817515,
|
|
"grad_norm": 0.6936411537357103,
|
|
"learning_rate": 2.147562664553537e-06,
|
|
"loss": 0.2744,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.5629285688642103,
|
|
"grad_norm": 0.6126488504445933,
|
|
"learning_rate": 2.116881476192834e-06,
|
|
"loss": 0.2698,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.5667580149109055,
|
|
"grad_norm": 0.6045796996159061,
|
|
"learning_rate": 2.086259267173961e-06,
|
|
"loss": 0.2756,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.5705874609576008,
|
|
"grad_norm": 0.6331045988234285,
|
|
"learning_rate": 2.0557007516457287e-06,
|
|
"loss": 0.2813,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.5744169070042962,
|
|
"grad_norm": 0.6250063298087053,
|
|
"learning_rate": 2.025210633951627e-06,
|
|
"loss": 0.2659,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.5782463530509915,
|
|
"grad_norm": 0.6244702166810576,
|
|
"learning_rate": 1.9947936079056118e-06,
|
|
"loss": 0.2691,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.5820757990976868,
|
|
"grad_norm": 0.6645304037867747,
|
|
"learning_rate": 1.964454356069514e-06,
|
|
"loss": 0.2653,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.5859052451443821,
|
|
"grad_norm": 0.6422648719414517,
|
|
"learning_rate": 1.934197549032183e-06,
|
|
"loss": 0.2753,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.5897346911910774,
|
|
"grad_norm": 0.6796801504650317,
|
|
"learning_rate": 1.904027844690468e-06,
|
|
"loss": 0.2756,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.5935641372377727,
|
|
"grad_norm": 0.612092392514174,
|
|
"learning_rate": 1.8739498875321563e-06,
|
|
"loss": 0.2781,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.597393583284468,
|
|
"grad_norm": 0.6072776324810985,
|
|
"learning_rate": 1.8439683079209789e-06,
|
|
"loss": 0.2762,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.6012230293311633,
|
|
"grad_norm": 0.6756861517531914,
|
|
"learning_rate": 1.8140877213837823e-06,
|
|
"loss": 0.2671,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.6050524753778587,
|
|
"grad_norm": 0.6297494292950692,
|
|
"learning_rate": 1.7843127278999944e-06,
|
|
"loss": 0.2656,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.6088819214245539,
|
|
"grad_norm": 0.6384980188983074,
|
|
"learning_rate": 1.7546479111934733e-06,
|
|
"loss": 0.2742,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.6127113674712492,
|
|
"grad_norm": 0.6354445299662702,
|
|
"learning_rate": 1.7250978380268696e-06,
|
|
"loss": 0.2703,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.6165408135179445,
|
|
"grad_norm": 0.6499360638842633,
|
|
"learning_rate": 1.6956670574985909e-06,
|
|
"loss": 0.2778,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.6203702595646399,
|
|
"grad_norm": 0.6611350047172592,
|
|
"learning_rate": 1.6663601003424884e-06,
|
|
"loss": 0.2751,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.6241997056113352,
|
|
"grad_norm": 0.6676095929381155,
|
|
"learning_rate": 1.6371814782303723e-06,
|
|
"loss": 0.2697,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.6280291516580304,
|
|
"grad_norm": 0.6627616428191541,
|
|
"learning_rate": 1.6081356830774625e-06,
|
|
"loss": 0.2728,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.6318585977047257,
|
|
"grad_norm": 0.6297343768461555,
|
|
"learning_rate": 1.5792271863508751e-06,
|
|
"loss": 0.2725,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.6356880437514211,
|
|
"grad_norm": 0.6164109978910287,
|
|
"learning_rate": 1.5504604383812646e-06,
|
|
"loss": 0.2665,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.6395174897981164,
|
|
"grad_norm": 0.6163778395985405,
|
|
"learning_rate": 1.5218398676777103e-06,
|
|
"loss": 0.2676,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.6433469358448117,
|
|
"grad_norm": 0.7111914438547673,
|
|
"learning_rate": 1.493369880245973e-06,
|
|
"loss": 0.2682,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.6471763818915071,
|
|
"grad_norm": 0.5951988021270335,
|
|
"learning_rate": 1.4650548589102092e-06,
|
|
"loss": 0.2725,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.6510058279382023,
|
|
"grad_norm": 0.615884409391351,
|
|
"learning_rate": 1.436899162638255e-06,
|
|
"loss": 0.2693,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.6548352739848976,
|
|
"grad_norm": 0.5615244588845161,
|
|
"learning_rate": 1.4089071258705782e-06,
|
|
"loss": 0.2717,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.6586647200315929,
|
|
"grad_norm": 0.613522242640938,
|
|
"learning_rate": 1.3810830578530226e-06,
|
|
"loss": 0.2645,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.6624941660782883,
|
|
"grad_norm": 0.6255188764708021,
|
|
"learning_rate": 1.3534312419734066e-06,
|
|
"loss": 0.2619,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.6663236121249836,
|
|
"grad_norm": 0.6219089714484455,
|
|
"learning_rate": 1.3259559351021249e-06,
|
|
"loss": 0.2706,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.6701530581716788,
|
|
"grad_norm": 0.7068243904113033,
|
|
"learning_rate": 1.2986613669368159e-06,
|
|
"loss": 0.2724,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.6739825042183741,
|
|
"grad_norm": 0.6239904301630281,
|
|
"learning_rate": 1.2715517393512239e-06,
|
|
"loss": 0.2699,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.6778119502650695,
|
|
"grad_norm": 0.636573560785032,
|
|
"learning_rate": 1.2446312257483358e-06,
|
|
"loss": 0.2606,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.6816413963117648,
|
|
"grad_norm": 0.5822319215351535,
|
|
"learning_rate": 1.2179039704179119e-06,
|
|
"loss": 0.2671,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.6854708423584601,
|
|
"grad_norm": 0.6414100583030806,
|
|
"learning_rate": 1.1913740878984818e-06,
|
|
"loss": 0.2728,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.6893002884051553,
|
|
"grad_norm": 0.6019132802768392,
|
|
"learning_rate": 1.1650456623439368e-06,
|
|
"loss": 0.2684,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.6931297344518507,
|
|
"grad_norm": 0.6936171991583808,
|
|
"learning_rate": 1.1389227468947905e-06,
|
|
"loss": 0.271,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.696959180498546,
|
|
"grad_norm": 0.621045757650744,
|
|
"learning_rate": 1.11300936305422e-06,
|
|
"loss": 0.2657,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.7007886265452413,
|
|
"grad_norm": 0.6554333537644303,
|
|
"learning_rate": 1.0873095000689676e-06,
|
|
"loss": 0.2666,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.7046180725919367,
|
|
"grad_norm": 0.6286228416556564,
|
|
"learning_rate": 1.0618271143152185e-06,
|
|
"loss": 0.2714,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.708447518638632,
|
|
"grad_norm": 0.6221920366356362,
|
|
"learning_rate": 1.0365661286895364e-06,
|
|
"loss": 0.2672,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.7122769646853272,
|
|
"grad_norm": 0.6038564143991549,
|
|
"learning_rate": 1.011530432004948e-06,
|
|
"loss": 0.2639,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.7161064107320225,
|
|
"grad_norm": 0.6754360798564671,
|
|
"learning_rate": 9.86723878392279e-07,
|
|
"loss": 0.2675,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.7199358567787179,
|
|
"grad_norm": 0.5875424854805807,
|
|
"learning_rate": 9.621502867068286e-07,
|
|
"loss": 0.2592,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.7237653028254132,
|
|
"grad_norm": 0.6032756771398841,
|
|
"learning_rate": 9.378134399404768e-07,
|
|
"loss": 0.2676,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.7275947488721085,
|
|
"grad_norm": 0.597275111662435,
|
|
"learning_rate": 9.137170846393054e-07,
|
|
"loss": 0.268,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.7314241949188037,
|
|
"grad_norm": 0.610745344226226,
|
|
"learning_rate": 8.898649303268373e-07,
|
|
"loss": 0.2752,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.7352536409654991,
|
|
"grad_norm": 0.7020431210410311,
|
|
"learning_rate": 8.662606489329712e-07,
|
|
"loss": 0.2793,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.7390830870121944,
|
|
"grad_norm": 0.6300673226887155,
|
|
"learning_rate": 8.429078742287072e-07,
|
|
"loss": 0.2673,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.7429125330588897,
|
|
"grad_norm": 0.6440907203937188,
|
|
"learning_rate": 8.198102012667409e-07,
|
|
"loss": 0.2662,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.746741979105585,
|
|
"grad_norm": 0.6258159414377766,
|
|
"learning_rate": 7.969711858280251e-07,
|
|
"loss": 0.2712,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.7505714251522804,
|
|
"grad_norm": 0.5989574886855157,
|
|
"learning_rate": 7.743943438743676e-07,
|
|
"loss": 0.2634,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.7544008711989756,
|
|
"grad_norm": 0.7834580590666472,
|
|
"learning_rate": 7.520831510071744e-07,
|
|
"loss": 0.2632,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.7582303172456709,
|
|
"grad_norm": 0.6728771166977499,
|
|
"learning_rate": 7.30041041932387e-07,
|
|
"loss": 0.2756,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.7620597632923662,
|
|
"grad_norm": 0.5993928197071544,
|
|
"learning_rate": 7.082714099317334e-07,
|
|
"loss": 0.2664,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.7658892093390616,
|
|
"grad_norm": 0.5848853677834179,
|
|
"learning_rate": 6.867776063403411e-07,
|
|
"loss": 0.2628,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.7697186553857569,
|
|
"grad_norm": 0.629330394414076,
|
|
"learning_rate": 6.655629400308191e-07,
|
|
"loss": 0.2658,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.7735481014324521,
|
|
"grad_norm": 0.5924525230054234,
|
|
"learning_rate": 6.44630676903869e-07,
|
|
"loss": 0.2669,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.7773775474791474,
|
|
"grad_norm": 0.6760063055600575,
|
|
"learning_rate": 6.239840393855185e-07,
|
|
"loss": 0.2692,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.7812069935258428,
|
|
"grad_norm": 0.6602651610212074,
|
|
"learning_rate": 6.036262059310383e-07,
|
|
"loss": 0.2629,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.7850364395725381,
|
|
"grad_norm": 0.6327130212823728,
|
|
"learning_rate": 5.835603105356396e-07,
|
|
"loss": 0.2678,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.7888658856192334,
|
|
"grad_norm": 0.68617187633267,
|
|
"learning_rate": 5.637894422520027e-07,
|
|
"loss": 0.268,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.7926953316659286,
|
|
"grad_norm": 0.6854582170736043,
|
|
"learning_rate": 5.443166447147392e-07,
|
|
"loss": 0.2652,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.796524777712624,
|
|
"grad_norm": 0.6454661983770266,
|
|
"learning_rate": 5.251449156718313e-07,
|
|
"loss": 0.2616,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.8003542237593193,
|
|
"grad_norm": 0.6204018065147047,
|
|
"learning_rate": 5.062772065231492e-07,
|
|
"loss": 0.2664,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.8041836698060146,
|
|
"grad_norm": 0.6500266139560574,
|
|
"learning_rate": 4.877164218660901e-07,
|
|
"loss": 0.2656,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.80801311585271,
|
|
"grad_norm": 0.6419744527723766,
|
|
"learning_rate": 4.694654190484327e-07,
|
|
"loss": 0.2612,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.8118425618994053,
|
|
"grad_norm": 0.6238307107139875,
|
|
"learning_rate": 4.5152700772845947e-07,
|
|
"loss": 0.2676,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.8156720079461005,
|
|
"grad_norm": 0.6514460031994411,
|
|
"learning_rate": 4.339039494424263e-07,
|
|
"loss": 0.2755,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.8195014539927958,
|
|
"grad_norm": 0.5974777353730542,
|
|
"learning_rate": 4.16598957179431e-07,
|
|
"loss": 0.2597,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.8233309000394912,
|
|
"grad_norm": 0.6009135110985171,
|
|
"learning_rate": 3.9961469496376584e-07,
|
|
"loss": 0.2592,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.8271603460861865,
|
|
"grad_norm": 0.6454195389454723,
|
|
"learning_rate": 3.829537774448e-07,
|
|
"loss": 0.2714,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.8309897921328818,
|
|
"grad_norm": 0.5913764986372555,
|
|
"learning_rate": 3.6661876949447006e-07,
|
|
"loss": 0.2637,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.834819238179577,
|
|
"grad_norm": 0.6714610867965414,
|
|
"learning_rate": 3.506121858124253e-07,
|
|
"loss": 0.2652,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.8386486842262724,
|
|
"grad_norm": 0.6260094170090917,
|
|
"learning_rate": 3.3493649053890325e-07,
|
|
"loss": 0.2642,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.8424781302729677,
|
|
"grad_norm": 0.5797985857131752,
|
|
"learning_rate": 3.1959409687538854e-07,
|
|
"loss": 0.2632,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.846307576319663,
|
|
"grad_norm": 0.6189077791942977,
|
|
"learning_rate": 3.04587366713108e-07,
|
|
"loss": 0.2648,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.8501370223663584,
|
|
"grad_norm": 0.6122894183484023,
|
|
"learning_rate": 2.8991861026943015e-07,
|
|
"loss": 0.2741,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.8539664684130536,
|
|
"grad_norm": 0.6661899719747609,
|
|
"learning_rate": 2.755900857322172e-07,
|
|
"loss": 0.2645,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.8577959144597489,
|
|
"grad_norm": 0.5568358628059588,
|
|
"learning_rate": 2.616039989121899e-07,
|
|
"loss": 0.2546,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.8616253605064442,
|
|
"grad_norm": 0.6975565699445695,
|
|
"learning_rate": 2.479625029033489e-07,
|
|
"loss": 0.2774,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.8654548065531396,
|
|
"grad_norm": 0.6137153739809891,
|
|
"learning_rate": 2.3466769775151887e-07,
|
|
"loss": 0.266,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.8692842525998349,
|
|
"grad_norm": 0.6065270965865103,
|
|
"learning_rate": 2.21721630131054e-07,
|
|
"loss": 0.2717,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.8731136986465302,
|
|
"grad_norm": 0.6194228612620867,
|
|
"learning_rate": 2.0912629302976494e-07,
|
|
"loss": 0.2656,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.8769431446932254,
|
|
"grad_norm": 0.6721041148274991,
|
|
"learning_rate": 1.968836254421036e-07,
|
|
"loss": 0.2653,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.8807725907399208,
|
|
"grad_norm": 0.664610505857682,
|
|
"learning_rate": 1.849955120706673e-07,
|
|
"loss": 0.2677,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.8846020367866161,
|
|
"grad_norm": 0.6321996875856193,
|
|
"learning_rate": 1.734637830360536e-07,
|
|
"loss": 0.2645,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.8884314828333114,
|
|
"grad_norm": 0.6365093137379149,
|
|
"learning_rate": 1.6229021359512626e-07,
|
|
"loss": 0.2658,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.8922609288800067,
|
|
"grad_norm": 0.6539770975459238,
|
|
"learning_rate": 1.514765238677185e-07,
|
|
"loss": 0.259,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.896090374926702,
|
|
"grad_norm": 0.5990361778489132,
|
|
"learning_rate": 1.4102437857183155e-07,
|
|
"loss": 0.265,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.8999198209733973,
|
|
"grad_norm": 0.6484099138497742,
|
|
"learning_rate": 1.30935386767356e-07,
|
|
"loss": 0.2667,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.9037492670200926,
|
|
"grad_norm": 0.5867117908179056,
|
|
"learning_rate": 1.2121110160836697e-07,
|
|
"loss": 0.2634,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.907578713066788,
|
|
"grad_norm": 0.5796597988618597,
|
|
"learning_rate": 1.1185302010402105e-07,
|
|
"loss": 0.2719,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.9114081591134833,
|
|
"grad_norm": 0.6237999983224305,
|
|
"learning_rate": 1.0286258288810108e-07,
|
|
"loss": 0.2627,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.9152376051601786,
|
|
"grad_norm": 0.603524669140342,
|
|
"learning_rate": 9.424117399723432e-08,
|
|
"loss": 0.262,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.9190670512068738,
|
|
"grad_norm": 0.620014776939978,
|
|
"learning_rate": 8.599012065782924e-08,
|
|
"loss": 0.271,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.9228964972535691,
|
|
"grad_norm": 0.6276616849770017,
|
|
"learning_rate": 7.811069308175156e-08,
|
|
"loss": 0.2692,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.9267259433002645,
|
|
"grad_norm": 0.6160811115395115,
|
|
"learning_rate": 7.060410427078473e-08,
|
|
"loss": 0.2674,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.9305553893469598,
|
|
"grad_norm": 0.5848887503471439,
|
|
"learning_rate": 6.347150982989159e-08,
|
|
"loss": 0.2625,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.9343848353936551,
|
|
"grad_norm": 0.5883046146508241,
|
|
"learning_rate": 5.6714007789314686e-08,
|
|
"loss": 0.2621,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.9382142814403504,
|
|
"grad_norm": 0.6940261485803617,
|
|
"learning_rate": 5.033263843554015e-08,
|
|
"loss": 0.2646,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.9420437274870457,
|
|
"grad_norm": 0.6682532368250712,
|
|
"learning_rate": 4.4328384151149094e-08,
|
|
"loss": 0.2667,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.945873173533741,
|
|
"grad_norm": 0.6267836020326658,
|
|
"learning_rate": 3.870216926358555e-08,
|
|
"loss": 0.2643,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.9497026195804363,
|
|
"grad_norm": 0.5575161118815479,
|
|
"learning_rate": 3.3454859902860295e-08,
|
|
"loss": 0.2641,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.9535320656271317,
|
|
"grad_norm": 0.5714423960482532,
|
|
"learning_rate": 2.858726386821359e-08,
|
|
"loss": 0.2707,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.9573615116738269,
|
|
"grad_norm": 0.6111083062299648,
|
|
"learning_rate": 2.410013050375859e-08,
|
|
"loss": 0.2709,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.9611909577205222,
|
|
"grad_norm": 0.642938384488523,
|
|
"learning_rate": 1.999415058312276e-08,
|
|
"loss": 0.271,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.9650204037672175,
|
|
"grad_norm": 0.6112837503032897,
|
|
"learning_rate": 1.6269956203107117e-08,
|
|
"loss": 0.2512,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.9688498498139129,
|
|
"grad_norm": 0.5889833725999716,
|
|
"learning_rate": 1.2928120686377388e-08,
|
|
"loss": 0.2661,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.9726792958606082,
|
|
"grad_norm": 0.5773801445867635,
|
|
"learning_rate": 9.969158493204067e-09,
|
|
"loss": 0.2653,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.9765087419073035,
|
|
"grad_norm": 0.6062759480472552,
|
|
"learning_rate": 7.393525142262992e-09,
|
|
"loss": 0.2691,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.9803381879539987,
|
|
"grad_norm": 0.5770795227743729,
|
|
"learning_rate": 5.201617140510318e-09,
|
|
"loss": 0.2694,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.9841676340006941,
|
|
"grad_norm": 0.6054174692665726,
|
|
"learning_rate": 3.3937719221427413e-09,
|
|
"loss": 0.2592,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.9879970800473894,
|
|
"grad_norm": 0.5960623948954498,
|
|
"learning_rate": 1.9702677966507157e-09,
|
|
"loss": 0.2641,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.9918265260940847,
|
|
"grad_norm": 0.6702560926553222,
|
|
"learning_rate": 9.31323905974113e-10,
|
|
"loss": 0.2621,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.99565597214078,
|
|
"grad_norm": 0.623171089038422,
|
|
"learning_rate": 2.7710019076532257e-10,
|
|
"loss": 0.2672,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.9994854181874753,
|
|
"grad_norm": 0.6269492106617217,
|
|
"learning_rate": 7.697365768943865e-12,
|
|
"loss": 0.2736,
|
|
"step": 2610
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 2611,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 600,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 624569162006528.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|