2625 lines
64 KiB
JSON
2625 lines
64 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 3708,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.008090614886731391,
|
|
"grad_norm": 10.671297597284719,
|
|
"learning_rate": 1.2129380053908356e-06,
|
|
"loss": 2.601,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.016181229773462782,
|
|
"grad_norm": 6.311089864073009,
|
|
"learning_rate": 2.560646900269542e-06,
|
|
"loss": 2.3954,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.024271844660194174,
|
|
"grad_norm": 5.215362734590839,
|
|
"learning_rate": 3.908355795148248e-06,
|
|
"loss": 1.7763,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.032362459546925564,
|
|
"grad_norm": 2.0676790587143836,
|
|
"learning_rate": 5.256064690026954e-06,
|
|
"loss": 1.2086,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.040453074433656956,
|
|
"grad_norm": 1.4206424846718397,
|
|
"learning_rate": 6.60377358490566e-06,
|
|
"loss": 0.8436,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.04854368932038835,
|
|
"grad_norm": 1.5867176039284459,
|
|
"learning_rate": 7.951482479784367e-06,
|
|
"loss": 0.6706,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.05663430420711974,
|
|
"grad_norm": 1.2138979784248856,
|
|
"learning_rate": 9.299191374663074e-06,
|
|
"loss": 0.5858,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.06472491909385113,
|
|
"grad_norm": 0.7702442127235006,
|
|
"learning_rate": 1.0646900269541779e-05,
|
|
"loss": 0.5384,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.07281553398058252,
|
|
"grad_norm": 1.0114209152057974,
|
|
"learning_rate": 1.1994609164420486e-05,
|
|
"loss": 0.4977,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.08090614886731391,
|
|
"grad_norm": 0.5794843568633474,
|
|
"learning_rate": 1.3342318059299191e-05,
|
|
"loss": 0.4639,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.0889967637540453,
|
|
"grad_norm": 0.696228528125689,
|
|
"learning_rate": 1.4690026954177898e-05,
|
|
"loss": 0.4589,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.0970873786407767,
|
|
"grad_norm": 0.7033818455552552,
|
|
"learning_rate": 1.6037735849056604e-05,
|
|
"loss": 0.4511,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.10517799352750809,
|
|
"grad_norm": 0.6918266955210028,
|
|
"learning_rate": 1.738544474393531e-05,
|
|
"loss": 0.4259,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.11326860841423948,
|
|
"grad_norm": 0.48391302376719975,
|
|
"learning_rate": 1.8733153638814018e-05,
|
|
"loss": 0.4271,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.12135922330097088,
|
|
"grad_norm": 0.6600203232524687,
|
|
"learning_rate": 2.0080862533692725e-05,
|
|
"loss": 0.415,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.12944983818770225,
|
|
"grad_norm": 0.6491506270021533,
|
|
"learning_rate": 2.1428571428571428e-05,
|
|
"loss": 0.4047,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.13754045307443366,
|
|
"grad_norm": 1.108358955768956,
|
|
"learning_rate": 2.2776280323450135e-05,
|
|
"loss": 0.3977,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.14563106796116504,
|
|
"grad_norm": 1.0220828997842137,
|
|
"learning_rate": 2.4123989218328842e-05,
|
|
"loss": 0.3972,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.15372168284789645,
|
|
"grad_norm": 0.7866428488585541,
|
|
"learning_rate": 2.547169811320755e-05,
|
|
"loss": 0.3886,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.16181229773462782,
|
|
"grad_norm": 0.7956757749575036,
|
|
"learning_rate": 2.6819407008086256e-05,
|
|
"loss": 0.3881,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.16990291262135923,
|
|
"grad_norm": 0.7154298194551921,
|
|
"learning_rate": 2.8167115902964963e-05,
|
|
"loss": 0.3915,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.1779935275080906,
|
|
"grad_norm": 0.7172238259413939,
|
|
"learning_rate": 2.9514824797843667e-05,
|
|
"loss": 0.384,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.18608414239482202,
|
|
"grad_norm": 0.7995655410192202,
|
|
"learning_rate": 3.086253369272237e-05,
|
|
"loss": 0.3748,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.1941747572815534,
|
|
"grad_norm": 13.800265685156136,
|
|
"learning_rate": 3.221024258760108e-05,
|
|
"loss": 0.3732,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.2022653721682848,
|
|
"grad_norm": 0.9742105484202757,
|
|
"learning_rate": 3.355795148247979e-05,
|
|
"loss": 0.3778,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.21035598705501618,
|
|
"grad_norm": 0.7304881990540768,
|
|
"learning_rate": 3.490566037735849e-05,
|
|
"loss": 0.3779,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.21844660194174756,
|
|
"grad_norm": 0.6020503863674406,
|
|
"learning_rate": 3.62533692722372e-05,
|
|
"loss": 0.3659,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.22653721682847897,
|
|
"grad_norm": 0.6249460279802084,
|
|
"learning_rate": 3.76010781671159e-05,
|
|
"loss": 0.3678,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.23462783171521034,
|
|
"grad_norm": 0.7546333918045006,
|
|
"learning_rate": 3.894878706199461e-05,
|
|
"loss": 0.3608,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.24271844660194175,
|
|
"grad_norm": 1.0594379228439452,
|
|
"learning_rate": 4.0296495956873316e-05,
|
|
"loss": 0.3623,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.25080906148867316,
|
|
"grad_norm": 0.8968179435463678,
|
|
"learning_rate": 4.164420485175202e-05,
|
|
"loss": 0.361,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.2588996763754045,
|
|
"grad_norm": 0.6785334732318974,
|
|
"learning_rate": 4.299191374663073e-05,
|
|
"loss": 0.3587,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.2669902912621359,
|
|
"grad_norm": 0.7864105607470325,
|
|
"learning_rate": 4.433962264150944e-05,
|
|
"loss": 0.3641,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.2750809061488673,
|
|
"grad_norm": 0.7520885489880877,
|
|
"learning_rate": 4.5687331536388144e-05,
|
|
"loss": 0.3571,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.28317152103559873,
|
|
"grad_norm": 0.7154271960483066,
|
|
"learning_rate": 4.703504043126685e-05,
|
|
"loss": 0.3564,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.2912621359223301,
|
|
"grad_norm": 0.5540540831665184,
|
|
"learning_rate": 4.838274932614555e-05,
|
|
"loss": 0.3499,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.2993527508090615,
|
|
"grad_norm": 0.8972157863979777,
|
|
"learning_rate": 4.973045822102426e-05,
|
|
"loss": 0.3546,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.3074433656957929,
|
|
"grad_norm": 0.5977035180668759,
|
|
"learning_rate": 4.9999290952604396e-05,
|
|
"loss": 0.358,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.3155339805825243,
|
|
"grad_norm": 0.6794119464101197,
|
|
"learning_rate": 4.9996410516491115e-05,
|
|
"loss": 0.3442,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.32362459546925565,
|
|
"grad_norm": 0.6445475083450791,
|
|
"learning_rate": 4.9991314631296585e-05,
|
|
"loss": 0.3504,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.33171521035598706,
|
|
"grad_norm": 0.6254469656785541,
|
|
"learning_rate": 4.9984003748672604e-05,
|
|
"loss": 0.3451,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.33980582524271846,
|
|
"grad_norm": 0.6124809359745962,
|
|
"learning_rate": 4.997447851658774e-05,
|
|
"loss": 0.3373,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.3478964401294498,
|
|
"grad_norm": 0.8314626543045377,
|
|
"learning_rate": 4.9962739779269887e-05,
|
|
"loss": 0.3427,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.3559870550161812,
|
|
"grad_norm": 0.7096404483846562,
|
|
"learning_rate": 4.9948788577131414e-05,
|
|
"loss": 0.3402,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.3640776699029126,
|
|
"grad_norm": 0.5010872233529703,
|
|
"learning_rate": 4.993262614667696e-05,
|
|
"loss": 0.3404,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.37216828478964403,
|
|
"grad_norm": 0.7096873040187438,
|
|
"learning_rate": 4.9914253920393884e-05,
|
|
"loss": 0.3374,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.3802588996763754,
|
|
"grad_norm": 0.6463889915075135,
|
|
"learning_rate": 4.9893673526625265e-05,
|
|
"loss": 0.3374,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.3883495145631068,
|
|
"grad_norm": 0.698463838389325,
|
|
"learning_rate": 4.987088678942555e-05,
|
|
"loss": 0.338,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.3964401294498382,
|
|
"grad_norm": 0.6798285617884782,
|
|
"learning_rate": 4.984589572839897e-05,
|
|
"loss": 0.3335,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.4045307443365696,
|
|
"grad_norm": 0.4599792513564179,
|
|
"learning_rate": 4.9818702558520485e-05,
|
|
"loss": 0.3299,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.41262135922330095,
|
|
"grad_norm": 0.49728635683969147,
|
|
"learning_rate": 4.978930968993946e-05,
|
|
"loss": 0.3388,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.42071197411003236,
|
|
"grad_norm": 0.4217113735156639,
|
|
"learning_rate": 4.9757719727766085e-05,
|
|
"loss": 0.3319,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.42880258899676377,
|
|
"grad_norm": 0.48369122768285383,
|
|
"learning_rate": 4.972393547184046e-05,
|
|
"loss": 0.33,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.4368932038834951,
|
|
"grad_norm": 0.43182058716411714,
|
|
"learning_rate": 4.968795991648446e-05,
|
|
"loss": 0.3252,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.4449838187702265,
|
|
"grad_norm": 0.5876386529955178,
|
|
"learning_rate": 4.9649796250236344e-05,
|
|
"loss": 0.324,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.45307443365695793,
|
|
"grad_norm": 0.5486057928498816,
|
|
"learning_rate": 4.960944785556814e-05,
|
|
"loss": 0.3254,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.46116504854368934,
|
|
"grad_norm": 0.4867958049194759,
|
|
"learning_rate": 4.956691830858585e-05,
|
|
"loss": 0.3216,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.4692556634304207,
|
|
"grad_norm": 0.4571043410958847,
|
|
"learning_rate": 4.952221137871252e-05,
|
|
"loss": 0.3206,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.4773462783171521,
|
|
"grad_norm": 0.5063667000215466,
|
|
"learning_rate": 4.947533102835413e-05,
|
|
"loss": 0.322,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.4854368932038835,
|
|
"grad_norm": 0.4522556669690265,
|
|
"learning_rate": 4.942628141254843e-05,
|
|
"loss": 0.3257,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.4935275080906149,
|
|
"grad_norm": 0.4321047527271199,
|
|
"learning_rate": 4.937506687859666e-05,
|
|
"loss": 0.3197,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.5016181229773463,
|
|
"grad_norm": 0.393401694292632,
|
|
"learning_rate": 4.932169196567824e-05,
|
|
"loss": 0.3209,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.5097087378640777,
|
|
"grad_norm": 0.39900307971080384,
|
|
"learning_rate": 4.9266161404448454e-05,
|
|
"loss": 0.3102,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.517799352750809,
|
|
"grad_norm": 0.4716849647042864,
|
|
"learning_rate": 4.920848011661919e-05,
|
|
"loss": 0.3195,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.5258899676375405,
|
|
"grad_norm": 0.460323217460504,
|
|
"learning_rate": 4.914865321452274e-05,
|
|
"loss": 0.3151,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.5339805825242718,
|
|
"grad_norm": 0.45925505988077353,
|
|
"learning_rate": 4.908668600065862e-05,
|
|
"loss": 0.3151,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.5420711974110033,
|
|
"grad_norm": 0.5830461135145422,
|
|
"learning_rate": 4.90225839672237e-05,
|
|
"loss": 0.3216,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.5501618122977346,
|
|
"grad_norm": 0.4701717654524378,
|
|
"learning_rate": 4.8956352795625325e-05,
|
|
"loss": 0.3168,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.558252427184466,
|
|
"grad_norm": 0.337481033517792,
|
|
"learning_rate": 4.8887998355977886e-05,
|
|
"loss": 0.313,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.5663430420711975,
|
|
"grad_norm": 0.6269962483834282,
|
|
"learning_rate": 4.881752670658244e-05,
|
|
"loss": 0.3132,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.5744336569579288,
|
|
"grad_norm": 0.4662171259348263,
|
|
"learning_rate": 4.87449440933898e-05,
|
|
"loss": 0.314,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.5825242718446602,
|
|
"grad_norm": 0.5290025725406212,
|
|
"learning_rate": 4.867025694944698e-05,
|
|
"loss": 0.3109,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.5906148867313916,
|
|
"grad_norm": 0.43779075196037404,
|
|
"learning_rate": 4.859347189432699e-05,
|
|
"loss": 0.3188,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.598705501618123,
|
|
"grad_norm": 0.5368922851512631,
|
|
"learning_rate": 4.8514595733542144e-05,
|
|
"loss": 0.3053,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.6067961165048543,
|
|
"grad_norm": 0.3739446436447005,
|
|
"learning_rate": 4.8433635457940915e-05,
|
|
"loss": 0.3095,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.6148867313915858,
|
|
"grad_norm": 0.36805536772795056,
|
|
"learning_rate": 4.8350598243088283e-05,
|
|
"loss": 0.3081,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.6229773462783171,
|
|
"grad_norm": 0.6081454922011427,
|
|
"learning_rate": 4.8265491448629804e-05,
|
|
"loss": 0.3096,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.6310679611650486,
|
|
"grad_norm": 0.606989309832587,
|
|
"learning_rate": 4.817832261763928e-05,
|
|
"loss": 0.3064,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.63915857605178,
|
|
"grad_norm": 0.374478457061797,
|
|
"learning_rate": 4.8089099475950257e-05,
|
|
"loss": 0.3075,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.6472491909385113,
|
|
"grad_norm": 0.4495226360651449,
|
|
"learning_rate": 4.7997829931471225e-05,
|
|
"loss": 0.3055,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.6553398058252428,
|
|
"grad_norm": 0.48869687342847196,
|
|
"learning_rate": 4.7904522073484786e-05,
|
|
"loss": 0.3056,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.6634304207119741,
|
|
"grad_norm": 0.4244338878096003,
|
|
"learning_rate": 4.780918417193065e-05,
|
|
"loss": 0.3068,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.6715210355987055,
|
|
"grad_norm": 0.8568600210635637,
|
|
"learning_rate": 4.7711824676672726e-05,
|
|
"loss": 0.311,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.6796116504854369,
|
|
"grad_norm": 0.4818809073072544,
|
|
"learning_rate": 4.76124522167501e-05,
|
|
"loss": 0.3139,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.6877022653721683,
|
|
"grad_norm": 0.5673965426592162,
|
|
"learning_rate": 4.751107559961238e-05,
|
|
"loss": 0.3085,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.6957928802588996,
|
|
"grad_norm": 0.42296772741332755,
|
|
"learning_rate": 4.740770381033894e-05,
|
|
"loss": 0.3129,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.7038834951456311,
|
|
"grad_norm": 0.4358296754132214,
|
|
"learning_rate": 4.730234601084268e-05,
|
|
"loss": 0.3058,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.7119741100323624,
|
|
"grad_norm": 0.4373965191621123,
|
|
"learning_rate": 4.719501153905793e-05,
|
|
"loss": 0.3025,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.7200647249190939,
|
|
"grad_norm": 0.3787352736875979,
|
|
"learning_rate": 4.7085709908112866e-05,
|
|
"loss": 0.3034,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.7281553398058253,
|
|
"grad_norm": 0.469843837143792,
|
|
"learning_rate": 4.6974450805486305e-05,
|
|
"loss": 0.303,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.7362459546925566,
|
|
"grad_norm": 0.4394456386334091,
|
|
"learning_rate": 4.686124409214917e-05,
|
|
"loss": 0.2975,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.7443365695792881,
|
|
"grad_norm": 0.37340718943669743,
|
|
"learning_rate": 4.674609980169042e-05,
|
|
"loss": 0.3002,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.7524271844660194,
|
|
"grad_norm": 0.47566576341803307,
|
|
"learning_rate": 4.662902813942784e-05,
|
|
"loss": 0.3051,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.7605177993527508,
|
|
"grad_norm": 0.4410622981522137,
|
|
"learning_rate": 4.651003948150349e-05,
|
|
"loss": 0.2962,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.7686084142394822,
|
|
"grad_norm": 0.41727057267800893,
|
|
"learning_rate": 4.638914437396408e-05,
|
|
"loss": 0.2959,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.7766990291262136,
|
|
"grad_norm": 0.3766577031650596,
|
|
"learning_rate": 4.626635353182626e-05,
|
|
"loss": 0.2951,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.7847896440129449,
|
|
"grad_norm": 0.36286446424405566,
|
|
"learning_rate": 4.614167783812694e-05,
|
|
"loss": 0.294,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.7928802588996764,
|
|
"grad_norm": 0.4051751512540439,
|
|
"learning_rate": 4.601512834295874e-05,
|
|
"loss": 0.2944,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.8009708737864077,
|
|
"grad_norm": 0.37027627803204927,
|
|
"learning_rate": 4.588671626249057e-05,
|
|
"loss": 0.2952,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.8090614886731392,
|
|
"grad_norm": 0.464872048858869,
|
|
"learning_rate": 4.5756452977973585e-05,
|
|
"loss": 0.292,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.8171521035598706,
|
|
"grad_norm": 0.4408350560704109,
|
|
"learning_rate": 4.56243500347324e-05,
|
|
"loss": 0.3009,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.8252427184466019,
|
|
"grad_norm": 0.49398039939609895,
|
|
"learning_rate": 4.549041914114188e-05,
|
|
"loss": 0.2969,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 0.5592891850787267,
|
|
"learning_rate": 4.535467216758936e-05,
|
|
"loss": 0.2981,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.8414239482200647,
|
|
"grad_norm": 0.4230495195351826,
|
|
"learning_rate": 4.5217121145422616e-05,
|
|
"loss": 0.2883,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.8495145631067961,
|
|
"grad_norm": 0.33133186454457314,
|
|
"learning_rate": 4.5077778265883477e-05,
|
|
"loss": 0.2927,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.8576051779935275,
|
|
"grad_norm": 0.3936509630212508,
|
|
"learning_rate": 4.4936655879027336e-05,
|
|
"loss": 0.2948,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.8656957928802589,
|
|
"grad_norm": 0.4500916019115437,
|
|
"learning_rate": 4.479376649262855e-05,
|
|
"loss": 0.2943,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.8737864077669902,
|
|
"grad_norm": 0.3668748277040676,
|
|
"learning_rate": 4.464912277107185e-05,
|
|
"loss": 0.2928,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.8818770226537217,
|
|
"grad_norm": 0.320494366403153,
|
|
"learning_rate": 4.450273753422992e-05,
|
|
"loss": 0.2909,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.889967637540453,
|
|
"grad_norm": 0.37987923612254676,
|
|
"learning_rate": 4.435462375632711e-05,
|
|
"loss": 0.2964,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.8980582524271845,
|
|
"grad_norm": 0.39981221876873035,
|
|
"learning_rate": 4.420479456478957e-05,
|
|
"loss": 0.2994,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.9061488673139159,
|
|
"grad_norm": 0.3091126654651453,
|
|
"learning_rate": 4.405326323908178e-05,
|
|
"loss": 0.2851,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.9142394822006472,
|
|
"grad_norm": 0.47407567538309503,
|
|
"learning_rate": 4.390004320952947e-05,
|
|
"loss": 0.2914,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.9223300970873787,
|
|
"grad_norm": 0.4473436088105401,
|
|
"learning_rate": 4.374514805612942e-05,
|
|
"loss": 0.2958,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.93042071197411,
|
|
"grad_norm": 0.3335572127939302,
|
|
"learning_rate": 4.358859150734576e-05,
|
|
"loss": 0.2915,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.9385113268608414,
|
|
"grad_norm": 0.4410700993823537,
|
|
"learning_rate": 4.343038743889324e-05,
|
|
"loss": 0.2936,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.9466019417475728,
|
|
"grad_norm": 0.4566532222343424,
|
|
"learning_rate": 4.3270549872507415e-05,
|
|
"loss": 0.2921,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.9546925566343042,
|
|
"grad_norm": 0.34831612960636166,
|
|
"learning_rate": 4.3109092974701895e-05,
|
|
"loss": 0.2961,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.9627831715210357,
|
|
"grad_norm": 0.33970324826345877,
|
|
"learning_rate": 4.2946031055512733e-05,
|
|
"loss": 0.2887,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.970873786407767,
|
|
"grad_norm": 0.46192580192198573,
|
|
"learning_rate": 4.2781378567230145e-05,
|
|
"loss": 0.29,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.9789644012944984,
|
|
"grad_norm": 0.4615119904861507,
|
|
"learning_rate": 4.2615150103117576e-05,
|
|
"loss": 0.2886,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.9870550161812298,
|
|
"grad_norm": 0.2981731488057052,
|
|
"learning_rate": 4.24473603961183e-05,
|
|
"loss": 0.2908,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.9951456310679612,
|
|
"grad_norm": 0.3632890670918485,
|
|
"learning_rate": 4.227802431754961e-05,
|
|
"loss": 0.2924,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 1.0032362459546926,
|
|
"grad_norm": 0.4150403194043393,
|
|
"learning_rate": 4.210715687578481e-05,
|
|
"loss": 0.2727,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.0113268608414239,
|
|
"grad_norm": 0.34869821901507675,
|
|
"learning_rate": 4.193477321492293e-05,
|
|
"loss": 0.2453,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 1.0194174757281553,
|
|
"grad_norm": 0.3601010037088075,
|
|
"learning_rate": 4.176088861344657e-05,
|
|
"loss": 0.2414,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.0275080906148868,
|
|
"grad_norm": 0.32325115826296424,
|
|
"learning_rate": 4.158551848286773e-05,
|
|
"loss": 0.2425,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 1.035598705501618,
|
|
"grad_norm": 0.38691862676257593,
|
|
"learning_rate": 4.140867836636189e-05,
|
|
"loss": 0.247,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.0436893203883495,
|
|
"grad_norm": 0.34882276619380004,
|
|
"learning_rate": 4.1230383937390374e-05,
|
|
"loss": 0.239,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 1.051779935275081,
|
|
"grad_norm": 0.3951880114595209,
|
|
"learning_rate": 4.1050650998311215e-05,
|
|
"loss": 0.2375,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.0598705501618122,
|
|
"grad_norm": 0.3736593550545292,
|
|
"learning_rate": 4.086949547897862e-05,
|
|
"loss": 0.2479,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 1.0679611650485437,
|
|
"grad_norm": 0.34774896387912796,
|
|
"learning_rate": 4.068693343533103e-05,
|
|
"loss": 0.2372,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.0760517799352751,
|
|
"grad_norm": 0.3330973525666551,
|
|
"learning_rate": 4.050298104796812e-05,
|
|
"loss": 0.2413,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 1.0841423948220066,
|
|
"grad_norm": 0.27924340181820345,
|
|
"learning_rate": 4.0317654620716704e-05,
|
|
"loss": 0.2454,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.0922330097087378,
|
|
"grad_norm": 0.3807786754448861,
|
|
"learning_rate": 4.013097057918566e-05,
|
|
"loss": 0.2457,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 1.1003236245954693,
|
|
"grad_norm": 0.34402611290470836,
|
|
"learning_rate": 3.9942945469310175e-05,
|
|
"loss": 0.2406,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.1084142394822007,
|
|
"grad_norm": 0.3087094208360471,
|
|
"learning_rate": 3.9753595955885264e-05,
|
|
"loss": 0.2403,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 1.116504854368932,
|
|
"grad_norm": 0.334687554837771,
|
|
"learning_rate": 3.9562938821088705e-05,
|
|
"loss": 0.2443,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.1245954692556634,
|
|
"grad_norm": 0.34101610586326964,
|
|
"learning_rate": 3.9370990962993695e-05,
|
|
"loss": 0.2455,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 1.132686084142395,
|
|
"grad_norm": 0.36304196735659044,
|
|
"learning_rate": 3.9177769394071086e-05,
|
|
"loss": 0.2423,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.1407766990291262,
|
|
"grad_norm": 0.32455755430896477,
|
|
"learning_rate": 3.898329123968163e-05,
|
|
"loss": 0.2424,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 1.1488673139158576,
|
|
"grad_norm": 0.29393617658060145,
|
|
"learning_rate": 3.87875737365581e-05,
|
|
"loss": 0.2383,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.156957928802589,
|
|
"grad_norm": 0.40951987232018516,
|
|
"learning_rate": 3.8590634231277626e-05,
|
|
"loss": 0.2605,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 1.1650485436893203,
|
|
"grad_norm": 0.3441977909919604,
|
|
"learning_rate": 3.8392490178724184e-05,
|
|
"loss": 0.2455,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.1731391585760518,
|
|
"grad_norm": 0.3112960711261136,
|
|
"learning_rate": 3.8193159140541645e-05,
|
|
"loss": 0.2419,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 1.1812297734627832,
|
|
"grad_norm": 0.3032973624770194,
|
|
"learning_rate": 3.7992658783577215e-05,
|
|
"loss": 0.2422,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.1893203883495145,
|
|
"grad_norm": 0.29109047481785005,
|
|
"learning_rate": 3.779100687831563e-05,
|
|
"loss": 0.2462,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 1.197411003236246,
|
|
"grad_norm": 0.2948022665787199,
|
|
"learning_rate": 3.758822129730415e-05,
|
|
"loss": 0.245,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.2055016181229774,
|
|
"grad_norm": 0.31258495126302516,
|
|
"learning_rate": 3.738432001356851e-05,
|
|
"loss": 0.2455,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 1.2135922330097086,
|
|
"grad_norm": 0.27328969134267245,
|
|
"learning_rate": 3.7179321099019916e-05,
|
|
"loss": 0.2369,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.22168284789644,
|
|
"grad_norm": 0.2847184801501813,
|
|
"learning_rate": 3.6973242722853365e-05,
|
|
"loss": 0.2402,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 1.2297734627831716,
|
|
"grad_norm": 0.29812191017952583,
|
|
"learning_rate": 3.6766103149937295e-05,
|
|
"loss": 0.2427,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.237864077669903,
|
|
"grad_norm": 0.3017615376479855,
|
|
"learning_rate": 3.655792073919471e-05,
|
|
"loss": 0.2363,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 1.2459546925566343,
|
|
"grad_norm": 0.27567364203177047,
|
|
"learning_rate": 3.634871394197607e-05,
|
|
"loss": 0.2388,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.2540453074433657,
|
|
"grad_norm": 0.2685513406212581,
|
|
"learning_rate": 3.6138501300423934e-05,
|
|
"loss": 0.2378,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 1.262135922330097,
|
|
"grad_norm": 0.25546275734682755,
|
|
"learning_rate": 3.592730144582948e-05,
|
|
"loss": 0.2341,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.2702265372168284,
|
|
"grad_norm": 0.2841532225060496,
|
|
"learning_rate": 3.571513309698131e-05,
|
|
"loss": 0.2366,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 1.27831715210356,
|
|
"grad_norm": 0.30541995269870376,
|
|
"learning_rate": 3.5502015058506335e-05,
|
|
"loss": 0.2375,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.2864077669902914,
|
|
"grad_norm": 0.34844869264740597,
|
|
"learning_rate": 3.528796621920307e-05,
|
|
"loss": 0.239,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 1.2944983818770226,
|
|
"grad_norm": 0.30708349077357805,
|
|
"learning_rate": 3.50730055503676e-05,
|
|
"loss": 0.2356,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.302588996763754,
|
|
"grad_norm": 0.3175696506236882,
|
|
"learning_rate": 3.485715210411204e-05,
|
|
"loss": 0.2358,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 1.3106796116504853,
|
|
"grad_norm": 0.29197347345983926,
|
|
"learning_rate": 3.4640425011676034e-05,
|
|
"loss": 0.2408,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.3187702265372168,
|
|
"grad_norm": 0.28169129894596506,
|
|
"learning_rate": 3.442284348173106e-05,
|
|
"loss": 0.2395,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 1.3268608414239482,
|
|
"grad_norm": 0.3992186155056125,
|
|
"learning_rate": 3.420442679867796e-05,
|
|
"loss": 0.2391,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 1.3349514563106797,
|
|
"grad_norm": 0.35636813012626095,
|
|
"learning_rate": 3.398519432093782e-05,
|
|
"loss": 0.2374,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 1.343042071197411,
|
|
"grad_norm": 0.27742962322683806,
|
|
"learning_rate": 3.376516547923614e-05,
|
|
"loss": 0.2336,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 1.3511326860841424,
|
|
"grad_norm": 0.28660763778902115,
|
|
"learning_rate": 3.3544359774880714e-05,
|
|
"loss": 0.2371,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 1.3592233009708738,
|
|
"grad_norm": 0.40153445123463827,
|
|
"learning_rate": 3.3322796778033204e-05,
|
|
"loss": 0.2376,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 1.367313915857605,
|
|
"grad_norm": 0.3035749815074104,
|
|
"learning_rate": 3.3100496125974624e-05,
|
|
"loss": 0.2377,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 1.3754045307443366,
|
|
"grad_norm": 0.342174636479568,
|
|
"learning_rate": 3.2877477521364895e-05,
|
|
"loss": 0.2347,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.383495145631068,
|
|
"grad_norm": 0.3199103842497454,
|
|
"learning_rate": 3.2653760730496555e-05,
|
|
"loss": 0.2297,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 1.3915857605177995,
|
|
"grad_norm": 0.27362978201436644,
|
|
"learning_rate": 3.242936558154285e-05,
|
|
"loss": 0.2356,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 1.3996763754045307,
|
|
"grad_norm": 0.27057113114600895,
|
|
"learning_rate": 3.2204311962800426e-05,
|
|
"loss": 0.2322,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 1.4077669902912622,
|
|
"grad_norm": 0.26643871030324334,
|
|
"learning_rate": 3.197861982092651e-05,
|
|
"loss": 0.2384,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 1.4158576051779934,
|
|
"grad_norm": 0.31497718220761595,
|
|
"learning_rate": 3.175230915917108e-05,
|
|
"loss": 0.2427,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 1.4239482200647249,
|
|
"grad_norm": 0.3267383691103009,
|
|
"learning_rate": 3.152540003560398e-05,
|
|
"loss": 0.2417,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 1.4320388349514563,
|
|
"grad_norm": 0.29663197558136717,
|
|
"learning_rate": 3.129791256133712e-05,
|
|
"loss": 0.2288,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 1.4401294498381878,
|
|
"grad_norm": 0.32445565003875587,
|
|
"learning_rate": 3.106986689874204e-05,
|
|
"loss": 0.2347,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 1.448220064724919,
|
|
"grad_norm": 0.29535887592241783,
|
|
"learning_rate": 3.0841283259662875e-05,
|
|
"loss": 0.2385,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 1.4563106796116505,
|
|
"grad_norm": 0.2733621998571008,
|
|
"learning_rate": 3.0612181903625014e-05,
|
|
"loss": 0.2359,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 1.4644012944983817,
|
|
"grad_norm": 0.2698891118245767,
|
|
"learning_rate": 3.0382583136039444e-05,
|
|
"loss": 0.2339,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 1.4724919093851132,
|
|
"grad_norm": 0.30735748022487547,
|
|
"learning_rate": 3.015250730640308e-05,
|
|
"loss": 0.237,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 1.4805825242718447,
|
|
"grad_norm": 0.35024703112347744,
|
|
"learning_rate": 2.9921974806495178e-05,
|
|
"loss": 0.2301,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 1.4886731391585761,
|
|
"grad_norm": 0.27111655738392937,
|
|
"learning_rate": 2.969100606856998e-05,
|
|
"loss": 0.2339,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 1.4967637540453074,
|
|
"grad_norm": 0.2787522117740128,
|
|
"learning_rate": 2.9459621563545825e-05,
|
|
"loss": 0.2385,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 1.5048543689320388,
|
|
"grad_norm": 0.2969133211792456,
|
|
"learning_rate": 2.9227841799190775e-05,
|
|
"loss": 0.2305,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 1.51294498381877,
|
|
"grad_norm": 0.2613286640167969,
|
|
"learning_rate": 2.8995687318304975e-05,
|
|
"loss": 0.2328,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 1.5210355987055015,
|
|
"grad_norm": 0.27216163489187184,
|
|
"learning_rate": 2.8763178696899995e-05,
|
|
"loss": 0.2373,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 1.529126213592233,
|
|
"grad_norm": 0.2964461145886047,
|
|
"learning_rate": 2.853033654237507e-05,
|
|
"loss": 0.2289,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 1.5372168284789645,
|
|
"grad_norm": 0.32949202305010017,
|
|
"learning_rate": 2.8297181491690756e-05,
|
|
"loss": 0.23,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 1.545307443365696,
|
|
"grad_norm": 0.3071880700004742,
|
|
"learning_rate": 2.8063734209539773e-05,
|
|
"loss": 0.2367,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 1.5533980582524272,
|
|
"grad_norm": 0.2529468819780906,
|
|
"learning_rate": 2.783001538651554e-05,
|
|
"loss": 0.2292,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 1.5614886731391586,
|
|
"grad_norm": 0.2773840151703638,
|
|
"learning_rate": 2.7596045737278336e-05,
|
|
"loss": 0.2398,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 1.5695792880258899,
|
|
"grad_norm": 0.295185746563609,
|
|
"learning_rate": 2.7361845998719315e-05,
|
|
"loss": 0.2325,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 1.5776699029126213,
|
|
"grad_norm": 0.25593073129115584,
|
|
"learning_rate": 2.7127436928122612e-05,
|
|
"loss": 0.2367,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 1.5857605177993528,
|
|
"grad_norm": 0.3516341035154705,
|
|
"learning_rate": 2.6892839301325623e-05,
|
|
"loss": 0.239,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 1.5938511326860842,
|
|
"grad_norm": 0.27153437539635045,
|
|
"learning_rate": 2.6658073910877603e-05,
|
|
"loss": 0.2288,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 1.6019417475728155,
|
|
"grad_norm": 0.26490849237503283,
|
|
"learning_rate": 2.6423161564196803e-05,
|
|
"loss": 0.231,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 1.610032362459547,
|
|
"grad_norm": 0.25415769405974487,
|
|
"learning_rate": 2.6188123081726306e-05,
|
|
"loss": 0.2341,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 1.6181229773462782,
|
|
"grad_norm": 0.2566774871023634,
|
|
"learning_rate": 2.5952979295088714e-05,
|
|
"loss": 0.2303,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 1.6262135922330097,
|
|
"grad_norm": 0.28720215998836357,
|
|
"learning_rate": 2.57177510452398e-05,
|
|
"loss": 0.2297,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 1.6343042071197411,
|
|
"grad_norm": 0.26535154825569424,
|
|
"learning_rate": 2.5482459180621377e-05,
|
|
"loss": 0.2336,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 1.6423948220064726,
|
|
"grad_norm": 0.2348030131858575,
|
|
"learning_rate": 2.524712455531347e-05,
|
|
"loss": 0.2283,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 1.650485436893204,
|
|
"grad_norm": 0.2981340903697484,
|
|
"learning_rate": 2.501176802718599e-05,
|
|
"loss": 0.2367,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 1.6585760517799353,
|
|
"grad_norm": 0.2508609786813777,
|
|
"learning_rate": 2.4776410456050165e-05,
|
|
"loss": 0.232,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.2760175375931361,
|
|
"learning_rate": 2.4541072701809624e-05,
|
|
"loss": 0.2348,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 1.674757281553398,
|
|
"grad_norm": 0.2681609470474934,
|
|
"learning_rate": 2.4305775622611627e-05,
|
|
"loss": 0.2285,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 1.6828478964401294,
|
|
"grad_norm": 0.3323878551187027,
|
|
"learning_rate": 2.4070540072998372e-05,
|
|
"loss": 0.2272,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 1.690938511326861,
|
|
"grad_norm": 0.24794381758629244,
|
|
"learning_rate": 2.3835386902058637e-05,
|
|
"loss": 0.2292,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 1.6990291262135924,
|
|
"grad_norm": 0.26234035679605566,
|
|
"learning_rate": 2.360033695157995e-05,
|
|
"loss": 0.2337,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 1.7071197411003236,
|
|
"grad_norm": 0.3095409200594481,
|
|
"learning_rate": 2.3365411054201315e-05,
|
|
"loss": 0.2265,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 1.715210355987055,
|
|
"grad_norm": 0.24814239367844307,
|
|
"learning_rate": 2.3130630031566818e-05,
|
|
"loss": 0.2269,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 1.7233009708737863,
|
|
"grad_norm": 0.2503027719931105,
|
|
"learning_rate": 2.2896014692480226e-05,
|
|
"loss": 0.231,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 1.7313915857605178,
|
|
"grad_norm": 0.24658598678553392,
|
|
"learning_rate": 2.266158583106063e-05,
|
|
"loss": 0.228,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 1.7394822006472492,
|
|
"grad_norm": 0.23999491418212948,
|
|
"learning_rate": 2.2427364224899502e-05,
|
|
"loss": 0.2289,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 1.7475728155339807,
|
|
"grad_norm": 0.24294562795428556,
|
|
"learning_rate": 2.2193370633219115e-05,
|
|
"loss": 0.2295,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 1.755663430420712,
|
|
"grad_norm": 0.27195306057038027,
|
|
"learning_rate": 2.1959625795032664e-05,
|
|
"loss": 0.2312,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 1.7637540453074434,
|
|
"grad_norm": 0.31612469238407165,
|
|
"learning_rate": 2.1726150427306182e-05,
|
|
"loss": 0.2286,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 1.7718446601941746,
|
|
"grad_norm": 0.29752263873224494,
|
|
"learning_rate": 2.1492965223122305e-05,
|
|
"loss": 0.2267,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 1.779935275080906,
|
|
"grad_norm": 0.2603667246009949,
|
|
"learning_rate": 2.126009084984629e-05,
|
|
"loss": 0.2251,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 1.7880258899676376,
|
|
"grad_norm": 0.26289071039247,
|
|
"learning_rate": 2.102754794729426e-05,
|
|
"loss": 0.2246,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 1.796116504854369,
|
|
"grad_norm": 0.263903471162293,
|
|
"learning_rate": 2.079535712590382e-05,
|
|
"loss": 0.2282,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 1.8042071197411005,
|
|
"grad_norm": 0.27035700806263924,
|
|
"learning_rate": 2.056353896490742e-05,
|
|
"loss": 0.2231,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 1.8122977346278317,
|
|
"grad_norm": 0.2654192689718002,
|
|
"learning_rate": 2.0332114010508334e-05,
|
|
"loss": 0.2268,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 1.820388349514563,
|
|
"grad_norm": 0.23959916600696418,
|
|
"learning_rate": 2.010110277405966e-05,
|
|
"loss": 0.2274,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 1.8284789644012944,
|
|
"grad_norm": 0.24872872826123787,
|
|
"learning_rate": 1.9870525730246424e-05,
|
|
"loss": 0.2293,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 1.8365695792880259,
|
|
"grad_norm": 0.2645954361424342,
|
|
"learning_rate": 1.9640403315270824e-05,
|
|
"loss": 0.2286,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 1.8446601941747574,
|
|
"grad_norm": 0.24420673184598343,
|
|
"learning_rate": 1.9410755925041006e-05,
|
|
"loss": 0.2257,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 1.8527508090614888,
|
|
"grad_norm": 0.24705849748195569,
|
|
"learning_rate": 1.918160391336335e-05,
|
|
"loss": 0.2259,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 1.86084142394822,
|
|
"grad_norm": 0.2951271763878442,
|
|
"learning_rate": 1.8952967590138472e-05,
|
|
"loss": 0.2248,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 1.8689320388349513,
|
|
"grad_norm": 0.2520418480110052,
|
|
"learning_rate": 1.8724867219561203e-05,
|
|
"loss": 0.2287,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 1.8770226537216828,
|
|
"grad_norm": 0.294901863474176,
|
|
"learning_rate": 1.8497323018324476e-05,
|
|
"loss": 0.2252,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 1.8851132686084142,
|
|
"grad_norm": 0.2620675111989084,
|
|
"learning_rate": 1.8270355153827598e-05,
|
|
"loss": 0.2243,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 1.8932038834951457,
|
|
"grad_norm": 0.21729342901905718,
|
|
"learning_rate": 1.804398374238872e-05,
|
|
"loss": 0.2231,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 1.9012944983818771,
|
|
"grad_norm": 0.26438351131145754,
|
|
"learning_rate": 1.781822884746196e-05,
|
|
"loss": 0.2203,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 1.9093851132686084,
|
|
"grad_norm": 0.22358504292564055,
|
|
"learning_rate": 1.7593110477859153e-05,
|
|
"loss": 0.223,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 1.9174757281553398,
|
|
"grad_norm": 0.24990353950246044,
|
|
"learning_rate": 1.736864858597645e-05,
|
|
"loss": 0.2233,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 1.925566343042071,
|
|
"grad_norm": 0.24537211238465853,
|
|
"learning_rate": 1.7144863066025955e-05,
|
|
"loss": 0.2193,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 1.9336569579288025,
|
|
"grad_norm": 0.2394304662544921,
|
|
"learning_rate": 1.692177375227242e-05,
|
|
"loss": 0.2251,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 1.941747572815534,
|
|
"grad_norm": 0.2785110439501238,
|
|
"learning_rate": 1.669940041727538e-05,
|
|
"loss": 0.218,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 1.9498381877022655,
|
|
"grad_norm": 0.23030205395765063,
|
|
"learning_rate": 1.6477762770136707e-05,
|
|
"loss": 0.225,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 1.9579288025889967,
|
|
"grad_norm": 0.24458486846119534,
|
|
"learning_rate": 1.625688045475371e-05,
|
|
"loss": 0.2208,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 1.9660194174757282,
|
|
"grad_norm": 0.223552780447482,
|
|
"learning_rate": 1.603677304807815e-05,
|
|
"loss": 0.2223,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 1.9741100323624594,
|
|
"grad_norm": 0.23262918496301524,
|
|
"learning_rate": 1.5817460058381088e-05,
|
|
"loss": 0.2274,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 1.9822006472491909,
|
|
"grad_norm": 0.2210251232882193,
|
|
"learning_rate": 1.5598960923523842e-05,
|
|
"loss": 0.2248,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 1.9902912621359223,
|
|
"grad_norm": 0.2543751678692885,
|
|
"learning_rate": 1.5381295009235262e-05,
|
|
"loss": 0.2277,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 1.9983818770226538,
|
|
"grad_norm": 0.2767394174239906,
|
|
"learning_rate": 1.5164481607395238e-05,
|
|
"loss": 0.2243,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 2.0064724919093853,
|
|
"grad_norm": 0.2831027468460383,
|
|
"learning_rate": 1.4948539934324923e-05,
|
|
"loss": 0.1818,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 2.0145631067961167,
|
|
"grad_norm": 0.23514037047255174,
|
|
"learning_rate": 1.4733489129083534e-05,
|
|
"loss": 0.1718,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 2.0226537216828477,
|
|
"grad_norm": 0.23543211368476746,
|
|
"learning_rate": 1.4519348251772058e-05,
|
|
"loss": 0.1696,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.030744336569579,
|
|
"grad_norm": 0.24256191701199467,
|
|
"learning_rate": 1.4306136281843962e-05,
|
|
"loss": 0.1691,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 2.0388349514563107,
|
|
"grad_norm": 0.22234069623742508,
|
|
"learning_rate": 1.4093872116422979e-05,
|
|
"loss": 0.169,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 2.046925566343042,
|
|
"grad_norm": 0.23132835680535235,
|
|
"learning_rate": 1.3882574568628315e-05,
|
|
"loss": 0.168,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 2.0550161812297736,
|
|
"grad_norm": 0.23134456189741553,
|
|
"learning_rate": 1.3672262365907163e-05,
|
|
"loss": 0.1684,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 2.063106796116505,
|
|
"grad_norm": 0.22760921973646828,
|
|
"learning_rate": 1.3462954148374899e-05,
|
|
"loss": 0.1661,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 2.071197411003236,
|
|
"grad_norm": 0.21228629003960797,
|
|
"learning_rate": 1.3254668467163029e-05,
|
|
"loss": 0.1696,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 2.0792880258899675,
|
|
"grad_norm": 0.2182291343034792,
|
|
"learning_rate": 1.3047423782774937e-05,
|
|
"loss": 0.1649,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 2.087378640776699,
|
|
"grad_norm": 0.21718735716811796,
|
|
"learning_rate": 1.2841238463449743e-05,
|
|
"loss": 0.1719,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 2.0954692556634305,
|
|
"grad_norm": 0.22472401035201942,
|
|
"learning_rate": 1.2636130783534319e-05,
|
|
"loss": 0.1703,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 2.103559870550162,
|
|
"grad_norm": 0.21069125465698005,
|
|
"learning_rate": 1.2432118921863604e-05,
|
|
"loss": 0.168,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 2.1116504854368934,
|
|
"grad_norm": 0.21446866638864276,
|
|
"learning_rate": 1.2229220960149431e-05,
|
|
"loss": 0.1695,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 2.1197411003236244,
|
|
"grad_norm": 0.21738468937149366,
|
|
"learning_rate": 1.2027454881377889e-05,
|
|
"loss": 0.1675,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 2.127831715210356,
|
|
"grad_norm": 0.22154845083846222,
|
|
"learning_rate": 1.1826838568215526e-05,
|
|
"loss": 0.1685,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 2.1359223300970873,
|
|
"grad_norm": 0.20713221800601425,
|
|
"learning_rate": 1.1627389801424351e-05,
|
|
"loss": 0.1706,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 2.144012944983819,
|
|
"grad_norm": 0.20802282917927592,
|
|
"learning_rate": 1.1429126258285946e-05,
|
|
"loss": 0.1661,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 2.1521035598705502,
|
|
"grad_norm": 0.20646189593102915,
|
|
"learning_rate": 1.1232065511034696e-05,
|
|
"loss": 0.1663,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 2.1601941747572817,
|
|
"grad_norm": 0.21341477579858295,
|
|
"learning_rate": 1.1036225025300357e-05,
|
|
"loss": 0.1687,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 2.168284789644013,
|
|
"grad_norm": 0.21356709357188863,
|
|
"learning_rate": 1.0841622158560085e-05,
|
|
"loss": 0.1668,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 2.176375404530744,
|
|
"grad_norm": 0.20685337347705235,
|
|
"learning_rate": 1.0648274158599994e-05,
|
|
"loss": 0.1698,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 2.1844660194174756,
|
|
"grad_norm": 0.22156407557595678,
|
|
"learning_rate": 1.0456198161986489e-05,
|
|
"loss": 0.1753,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 2.192556634304207,
|
|
"grad_norm": 0.2035462284179136,
|
|
"learning_rate": 1.0265411192547462e-05,
|
|
"loss": 0.1718,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 2.2006472491909386,
|
|
"grad_norm": 0.20189924705262216,
|
|
"learning_rate": 1.0075930159863416e-05,
|
|
"loss": 0.1661,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 2.20873786407767,
|
|
"grad_norm": 0.20956549431298424,
|
|
"learning_rate": 9.887771857768796e-06,
|
|
"loss": 0.1683,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 2.2168284789644015,
|
|
"grad_norm": 0.2140205114087363,
|
|
"learning_rate": 9.700952962863513e-06,
|
|
"loss": 0.1701,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 2.2249190938511325,
|
|
"grad_norm": 0.20398172720239158,
|
|
"learning_rate": 9.515490033034893e-06,
|
|
"loss": 0.1667,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 2.233009708737864,
|
|
"grad_norm": 0.20165594228560696,
|
|
"learning_rate": 9.331399505990168e-06,
|
|
"loss": 0.1655,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 2.2411003236245954,
|
|
"grad_norm": 0.20182970094436348,
|
|
"learning_rate": 9.148697697799533e-06,
|
|
"loss": 0.1694,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 2.249190938511327,
|
|
"grad_norm": 0.2291167648037075,
|
|
"learning_rate": 8.967400801450105e-06,
|
|
"loss": 0.1685,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 2.2572815533980584,
|
|
"grad_norm": 0.20375204743694134,
|
|
"learning_rate": 8.787524885410678e-06,
|
|
"loss": 0.1638,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 2.26537216828479,
|
|
"grad_norm": 0.214007858233087,
|
|
"learning_rate": 8.60908589220758e-06,
|
|
"loss": 0.1676,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 2.273462783171521,
|
|
"grad_norm": 0.2025950468246571,
|
|
"learning_rate": 8.432099637011693e-06,
|
|
"loss": 0.1657,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 2.2815533980582523,
|
|
"grad_norm": 0.21480260045098348,
|
|
"learning_rate": 8.256581806236704e-06,
|
|
"loss": 0.1649,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 2.2896440129449838,
|
|
"grad_norm": 0.2052262400110326,
|
|
"learning_rate": 8.082547956148873e-06,
|
|
"loss": 0.1663,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 2.2977346278317152,
|
|
"grad_norm": 0.20019987270861134,
|
|
"learning_rate": 7.91001351148819e-06,
|
|
"loss": 0.1653,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 2.3058252427184467,
|
|
"grad_norm": 0.207551520801799,
|
|
"learning_rate": 7.738993764101324e-06,
|
|
"loss": 0.1677,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 2.313915857605178,
|
|
"grad_norm": 0.19280865204847908,
|
|
"learning_rate": 7.569503871586292e-06,
|
|
"loss": 0.1674,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 2.3220064724919096,
|
|
"grad_norm": 0.20122645680053192,
|
|
"learning_rate": 7.401558855949004e-06,
|
|
"loss": 0.1678,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 2.3300970873786406,
|
|
"grad_norm": 0.1945489837836613,
|
|
"learning_rate": 7.235173602271875e-06,
|
|
"loss": 0.162,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 2.338187702265372,
|
|
"grad_norm": 0.2051511298220981,
|
|
"learning_rate": 7.070362857394538e-06,
|
|
"loss": 0.1712,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 2.3462783171521036,
|
|
"grad_norm": 0.2038615765888219,
|
|
"learning_rate": 6.907141228606831e-06,
|
|
"loss": 0.1664,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 2.354368932038835,
|
|
"grad_norm": 0.19824356024446557,
|
|
"learning_rate": 6.745523182354147e-06,
|
|
"loss": 0.1634,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 2.3624595469255665,
|
|
"grad_norm": 0.1897897800458373,
|
|
"learning_rate": 6.585523042955233e-06,
|
|
"loss": 0.1633,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 2.3705501618122975,
|
|
"grad_norm": 0.19361276633294466,
|
|
"learning_rate": 6.427154991332665e-06,
|
|
"loss": 0.1665,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 2.378640776699029,
|
|
"grad_norm": 0.19269698264895235,
|
|
"learning_rate": 6.2704330637559315e-06,
|
|
"loss": 0.1632,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 2.3867313915857604,
|
|
"grad_norm": 0.19487091156668288,
|
|
"learning_rate": 6.115371150597413e-06,
|
|
"loss": 0.1657,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 2.394822006472492,
|
|
"grad_norm": 0.17726783434092147,
|
|
"learning_rate": 5.961982995101301e-06,
|
|
"loss": 0.1652,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 2.4029126213592233,
|
|
"grad_norm": 0.19174116985719267,
|
|
"learning_rate": 5.810282192165442e-06,
|
|
"loss": 0.1674,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 2.411003236245955,
|
|
"grad_norm": 0.20196362222659836,
|
|
"learning_rate": 5.660282187136507e-06,
|
|
"loss": 0.1648,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 2.4190938511326863,
|
|
"grad_norm": 0.2037293127188244,
|
|
"learning_rate": 5.511996274618253e-06,
|
|
"loss": 0.1673,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 2.4271844660194173,
|
|
"grad_norm": 0.2075997573199353,
|
|
"learning_rate": 5.365437597293238e-06,
|
|
"loss": 0.1681,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 2.4352750809061487,
|
|
"grad_norm": 0.19485346480204271,
|
|
"learning_rate": 5.220619144757996e-06,
|
|
"loss": 0.168,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 2.44336569579288,
|
|
"grad_norm": 0.20032468931249656,
|
|
"learning_rate": 5.077553752371708e-06,
|
|
"loss": 0.1695,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 2.4514563106796117,
|
|
"grad_norm": 0.18780814063723766,
|
|
"learning_rate": 4.936254100118656e-06,
|
|
"loss": 0.1626,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 2.459546925566343,
|
|
"grad_norm": 0.20597954746449326,
|
|
"learning_rate": 4.796732711484342e-06,
|
|
"loss": 0.1688,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 2.4676375404530746,
|
|
"grad_norm": 0.1884986064298571,
|
|
"learning_rate": 4.659001952345538e-06,
|
|
"loss": 0.1618,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 2.475728155339806,
|
|
"grad_norm": 0.19330264761303187,
|
|
"learning_rate": 4.523074029874291e-06,
|
|
"loss": 0.1611,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 2.483818770226537,
|
|
"grad_norm": 0.1899995190829889,
|
|
"learning_rate": 4.388960991455998e-06,
|
|
"loss": 0.1642,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 2.4919093851132685,
|
|
"grad_norm": 0.20352479818055758,
|
|
"learning_rate": 4.256674723621621e-06,
|
|
"loss": 0.165,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.19705480301688083,
|
|
"learning_rate": 4.126226950994211e-06,
|
|
"loss": 0.1633,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 2.5080906148867315,
|
|
"grad_norm": 0.19973033430929824,
|
|
"learning_rate": 3.997629235249692e-06,
|
|
"loss": 0.1645,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 2.516181229773463,
|
|
"grad_norm": 0.19227297342228458,
|
|
"learning_rate": 3.870892974092197e-06,
|
|
"loss": 0.1657,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 2.524271844660194,
|
|
"grad_norm": 0.19226389494086693,
|
|
"learning_rate": 3.7460294002438444e-06,
|
|
"loss": 0.1649,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 2.5323624595469254,
|
|
"grad_norm": 0.23947157434034713,
|
|
"learning_rate": 3.6230495804491864e-06,
|
|
"loss": 0.1638,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 2.540453074433657,
|
|
"grad_norm": 0.18606261818175104,
|
|
"learning_rate": 3.5019644144943576e-06,
|
|
"loss": 0.1657,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 2.5485436893203883,
|
|
"grad_norm": 0.18558275603040017,
|
|
"learning_rate": 3.382784634241015e-06,
|
|
"loss": 0.1626,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 2.55663430420712,
|
|
"grad_norm": 0.1838874793051254,
|
|
"learning_rate": 3.2655208026751816e-06,
|
|
"loss": 0.1645,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 2.5647249190938513,
|
|
"grad_norm": 0.18161462708162493,
|
|
"learning_rate": 3.150183312971014e-06,
|
|
"loss": 0.1625,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 2.5728155339805827,
|
|
"grad_norm": 0.18098413098000662,
|
|
"learning_rate": 3.036782387569659e-06,
|
|
"loss": 0.1623,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 2.5809061488673137,
|
|
"grad_norm": 0.19242866333812558,
|
|
"learning_rate": 2.9253280772732595e-06,
|
|
"loss": 0.1627,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 2.588996763754045,
|
|
"grad_norm": 0.17889937009144094,
|
|
"learning_rate": 2.8158302603540965e-06,
|
|
"loss": 0.1609,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 2.5970873786407767,
|
|
"grad_norm": 0.1879805259242369,
|
|
"learning_rate": 2.708298641679105e-06,
|
|
"loss": 0.1642,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 2.605177993527508,
|
|
"grad_norm": 0.18048882079511047,
|
|
"learning_rate": 2.6027427518497153e-06,
|
|
"loss": 0.1634,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 2.6132686084142396,
|
|
"grad_norm": 0.18903574393083356,
|
|
"learning_rate": 2.49917194635714e-06,
|
|
"loss": 0.1607,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 2.6213592233009706,
|
|
"grad_norm": 0.18631019977805036,
|
|
"learning_rate": 2.397595404753225e-06,
|
|
"loss": 0.1589,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 2.6294498381877025,
|
|
"grad_norm": 0.18583873327326217,
|
|
"learning_rate": 2.2980221298367995e-06,
|
|
"loss": 0.1679,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 2.6375404530744335,
|
|
"grad_norm": 0.1872827952915852,
|
|
"learning_rate": 2.2004609468558175e-06,
|
|
"loss": 0.1648,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 2.645631067961165,
|
|
"grad_norm": 0.18646198991723772,
|
|
"learning_rate": 2.1049205027251216e-06,
|
|
"loss": 0.1648,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 2.6537216828478964,
|
|
"grad_norm": 0.18510117286188082,
|
|
"learning_rate": 2.0114092652600806e-06,
|
|
"loss": 0.1669,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 2.661812297734628,
|
|
"grad_norm": 0.1813478635510933,
|
|
"learning_rate": 1.919935522426081e-06,
|
|
"loss": 0.1601,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 2.6699029126213594,
|
|
"grad_norm": 0.18995739003728715,
|
|
"learning_rate": 1.8305073816039492e-06,
|
|
"loss": 0.165,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 2.6779935275080904,
|
|
"grad_norm": 0.18850255636775962,
|
|
"learning_rate": 1.7431327688714139e-06,
|
|
"loss": 0.1686,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 2.686084142394822,
|
|
"grad_norm": 0.19405450386616901,
|
|
"learning_rate": 1.6578194283005804e-06,
|
|
"loss": 0.1586,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 2.6941747572815533,
|
|
"grad_norm": 0.18168641096398497,
|
|
"learning_rate": 1.5745749212715794e-06,
|
|
"loss": 0.1669,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 2.7022653721682848,
|
|
"grad_norm": 0.18390467428127966,
|
|
"learning_rate": 1.4934066258024182e-06,
|
|
"loss": 0.1681,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 2.7103559870550162,
|
|
"grad_norm": 0.18623903954773796,
|
|
"learning_rate": 1.4143217358950217e-06,
|
|
"loss": 0.1648,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 2.7184466019417477,
|
|
"grad_norm": 0.18708444577479785,
|
|
"learning_rate": 1.3373272608976668e-06,
|
|
"loss": 0.1635,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 2.726537216828479,
|
|
"grad_norm": 0.1850383082145996,
|
|
"learning_rate": 1.2624300248836928e-06,
|
|
"loss": 0.1607,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 2.73462783171521,
|
|
"grad_norm": 0.19358386622393567,
|
|
"learning_rate": 1.1896366660467173e-06,
|
|
"loss": 0.1645,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 2.7427184466019416,
|
|
"grad_norm": 0.17960690916881664,
|
|
"learning_rate": 1.1189536361122799e-06,
|
|
"loss": 0.1628,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 2.750809061488673,
|
|
"grad_norm": 0.20127134763525154,
|
|
"learning_rate": 1.0503871997660036e-06,
|
|
"loss": 0.168,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 2.7588996763754046,
|
|
"grad_norm": 0.19658936043581707,
|
|
"learning_rate": 9.83943434098372e-07,
|
|
"loss": 0.1633,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 2.766990291262136,
|
|
"grad_norm": 0.18650679653778446,
|
|
"learning_rate": 9.196282280661023e-07,
|
|
"loss": 0.1673,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 2.775080906148867,
|
|
"grad_norm": 0.1733323921140541,
|
|
"learning_rate": 8.574472819702029e-07,
|
|
"loss": 0.1652,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 2.783171521035599,
|
|
"grad_norm": 0.17972872694953243,
|
|
"learning_rate": 7.974061069507571e-07,
|
|
"loss": 0.1636,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 2.79126213592233,
|
|
"grad_norm": 0.1777461753836034,
|
|
"learning_rate": 7.395100244984604e-07,
|
|
"loss": 0.1634,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 2.7993527508090614,
|
|
"grad_norm": 0.18989630729322288,
|
|
"learning_rate": 6.837641659829807e-07,
|
|
"loss": 0.1661,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 2.807443365695793,
|
|
"grad_norm": 0.17625775327505652,
|
|
"learning_rate": 6.301734721981533e-07,
|
|
"loss": 0.1643,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 2.8155339805825244,
|
|
"grad_norm": 0.1862820489099266,
|
|
"learning_rate": 5.787426929240808e-07,
|
|
"loss": 0.1643,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 2.823624595469256,
|
|
"grad_norm": 0.1790622651864573,
|
|
"learning_rate": 5.294763865061558e-07,
|
|
"loss": 0.1626,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 2.831715210355987,
|
|
"grad_norm": 0.173713637483726,
|
|
"learning_rate": 4.823789194510514e-07,
|
|
"loss": 0.1639,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 2.8398058252427183,
|
|
"grad_norm": 0.1866652309901256,
|
|
"learning_rate": 4.3745446603971064e-07,
|
|
"loss": 0.1602,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 2.8478964401294498,
|
|
"grad_norm": 0.17910113776598907,
|
|
"learning_rate": 3.947070079573872e-07,
|
|
"loss": 0.1642,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 2.855987055016181,
|
|
"grad_norm": 0.18065751924055337,
|
|
"learning_rate": 3.541403339407279e-07,
|
|
"loss": 0.1656,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 2.8640776699029127,
|
|
"grad_norm": 0.18007167117331419,
|
|
"learning_rate": 3.1575803944199624e-07,
|
|
"loss": 0.1651,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 2.872168284789644,
|
|
"grad_norm": 0.18247762658055755,
|
|
"learning_rate": 2.7956352631038906e-07,
|
|
"loss": 0.1623,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 2.8802588996763756,
|
|
"grad_norm": 0.17849381652959184,
|
|
"learning_rate": 2.4556000249054133e-07,
|
|
"loss": 0.1653,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 2.8883495145631066,
|
|
"grad_norm": 0.1846373828579316,
|
|
"learning_rate": 2.1375048173818412e-07,
|
|
"loss": 0.1691,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 2.896440129449838,
|
|
"grad_norm": 0.17151731227871805,
|
|
"learning_rate": 1.8413778335305e-07,
|
|
"loss": 0.1606,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 2.9045307443365695,
|
|
"grad_norm": 0.22901848451176904,
|
|
"learning_rate": 1.567245319290006e-07,
|
|
"loss": 0.1607,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 2.912621359223301,
|
|
"grad_norm": 0.1841472999781577,
|
|
"learning_rate": 1.315131571213879e-07,
|
|
"loss": 0.1659,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 2.9207119741100325,
|
|
"grad_norm": 0.17620735263373793,
|
|
"learning_rate": 1.0850589343172624e-07,
|
|
"loss": 0.163,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 2.9288025889967635,
|
|
"grad_norm": 0.1732366383013906,
|
|
"learning_rate": 8.770478000964532e-08,
|
|
"loss": 0.1603,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 2.9368932038834954,
|
|
"grad_norm": 0.18106989776343552,
|
|
"learning_rate": 6.911166047215145e-08,
|
|
"loss": 0.1639,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 2.9449838187702264,
|
|
"grad_norm": 0.18480994602766684,
|
|
"learning_rate": 5.272818274023872e-08,
|
|
"loss": 0.163,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 2.953074433656958,
|
|
"grad_norm": 0.17298187343963853,
|
|
"learning_rate": 3.855579889282257e-08,
|
|
"loss": 0.1607,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 2.9611650485436893,
|
|
"grad_norm": 0.1826761908488716,
|
|
"learning_rate": 2.6595765038045507e-08,
|
|
"loss": 0.1598,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 2.969255663430421,
|
|
"grad_norm": 0.18104206371371273,
|
|
"learning_rate": 1.6849141201946693e-08,
|
|
"loss": 0.1656,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 2.9773462783171523,
|
|
"grad_norm": 0.17420913687043998,
|
|
"learning_rate": 9.316791234506572e-09,
|
|
"loss": 0.1589,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 2.9854368932038833,
|
|
"grad_norm": 0.177305619544985,
|
|
"learning_rate": 3.999382733096968e-09,
|
|
"loss": 0.1633,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 2.9935275080906147,
|
|
"grad_norm": 0.17610657137080274,
|
|
"learning_rate": 8.973869832895609e-10,
|
|
"loss": 0.1623,
|
|
"step": 3700
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 3708,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.364819972765516e+18,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|