{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1535,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03262642740619902,
      "grad_norm": 0.0,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.7799,
      "step": 10
    },
    {
      "epoch": 0.06525285481239804,
      "grad_norm": 0.0,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.7788,
      "step": 20
    },
    {
      "epoch": 0.09787928221859707,
      "grad_norm": 0.0,
      "learning_rate": 1.16e-05,
      "loss": 0.7558,
      "step": 30
    },
    {
      "epoch": 0.13050570962479607,
      "grad_norm": 0.0,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.7929,
      "step": 40
    },
    {
      "epoch": 0.1631321370309951,
      "grad_norm": 0.0,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.8025,
      "step": 50
    },
    {
      "epoch": 0.19575856443719414,
      "grad_norm": 0.0,
      "learning_rate": 1.999818745523526e-05,
      "loss": 0.8121,
      "step": 60
    },
    {
      "epoch": 0.22838499184339314,
      "grad_norm": 0.0,
      "learning_rate": 1.9991922711960104e-05,
      "loss": 0.768,
      "step": 70
    },
    {
      "epoch": 0.26101141924959215,
      "grad_norm": 0.0,
      "learning_rate": 1.998118619612634e-05,
      "loss": 0.7567,
      "step": 80
    },
    {
      "epoch": 0.2936378466557912,
      "grad_norm": 0.0,
      "learning_rate": 1.996598271274081e-05,
      "loss": 0.7997,
      "step": 90
    },
    {
      "epoch": 0.3262642740619902,
      "grad_norm": 0.0,
      "learning_rate": 1.9946319065951382e-05,
      "loss": 0.7646,
      "step": 100
    },
    {
      "epoch": 0.35889070146818924,
      "grad_norm": 0.0,
      "learning_rate": 1.9922204056001896e-05,
      "loss": 0.793,
      "step": 110
    },
    {
      "epoch": 0.3915171288743883,
      "grad_norm": 0.0,
      "learning_rate": 1.9893648475293646e-05,
      "loss": 0.7884,
      "step": 120
    },
    {
      "epoch": 0.42414355628058725,
      "grad_norm": 0.0,
      "learning_rate": 1.9860665103555418e-05,
      "loss": 0.7747,
      "step": 130
    },
    {
      "epoch": 0.4567699836867863,
      "grad_norm": 0.0,
      "learning_rate": 1.982326870212402e-05,
      "loss": 0.7603,
      "step": 140
    },
    {
      "epoch": 0.4893964110929853,
      "grad_norm": 0.0,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 0.7703,
      "step": 150
    },
    {
      "epoch": 0.5220228384991843,
      "grad_norm": 0.0,
      "learning_rate": 1.973530572304773e-05,
      "loss": 0.7675,
      "step": 160
    },
    {
      "epoch": 0.5546492659053833,
      "grad_norm": 0.0,
      "learning_rate": 1.9684778512244172e-05,
      "loss": 0.8043,
      "step": 170
    },
    {
      "epoch": 0.5872756933115824,
      "grad_norm": 0.0,
      "learning_rate": 1.9629916987811924e-05,
      "loss": 0.8024,
      "step": 180
    },
    {
      "epoch": 0.6199021207177814,
      "grad_norm": 0.0,
      "learning_rate": 1.957074570240883e-05,
      "loss": 0.7716,
      "step": 190
    },
    {
      "epoch": 0.6525285481239804,
      "grad_norm": 0.0,
      "learning_rate": 1.9507291137477744e-05,
      "loss": 0.7651,
      "step": 200
    },
    {
      "epoch": 0.6851549755301795,
      "grad_norm": 0.0,
      "learning_rate": 1.943958169139507e-05,
      "loss": 0.7819,
      "step": 210
    },
    {
      "epoch": 0.7177814029363785,
      "grad_norm": 0.0,
      "learning_rate": 1.9367647666761384e-05,
      "loss": 0.7792,
      "step": 220
    },
    {
      "epoch": 0.7504078303425775,
      "grad_norm": 0.0,
      "learning_rate": 1.929152125683986e-05,
      "loss": 0.7937,
      "step": 230
    },
    {
      "epoch": 0.7830342577487766,
      "grad_norm": 0.0,
      "learning_rate": 1.92112365311485e-05,
      "loss": 0.7721,
      "step": 240
    },
    {
      "epoch": 0.8156606851549756,
      "grad_norm": 0.0,
      "learning_rate": 1.9126829420212764e-05,
      "loss": 0.772,
      "step": 250
    },
    {
      "epoch": 0.8482871125611745,
      "grad_norm": 0.0,
      "learning_rate": 1.9038337699485207e-05,
      "loss": 0.7611,
      "step": 260
    },
    {
      "epoch": 0.8809135399673735,
      "grad_norm": 0.0,
      "learning_rate": 1.894580097243954e-05,
      "loss": 0.7829,
      "step": 270
    },
    {
      "epoch": 0.9135399673735726,
      "grad_norm": 0.0,
      "learning_rate": 1.884926065284652e-05,
      "loss": 0.7815,
      "step": 280
    },
    {
      "epoch": 0.9461663947797716,
      "grad_norm": 0.0,
      "learning_rate": 1.87487599462397e-05,
      "loss": 0.7742,
      "step": 290
    },
    {
      "epoch": 0.9787928221859706,
      "grad_norm": 0.0,
      "learning_rate": 1.864434383057927e-05,
      "loss": 0.7561,
      "step": 300
    },
    {
      "epoch": 1.0097879282218598,
      "grad_norm": 0.0,
      "learning_rate": 1.853605903612267e-05,
      "loss": 0.7452,
      "step": 310
    },
    {
      "epoch": 1.0424143556280587,
      "grad_norm": 0.0,
      "learning_rate": 1.8423954024510995e-05,
      "loss": 0.7773,
      "step": 320
    },
    {
      "epoch": 1.0750407830342577,
      "grad_norm": 0.0,
      "learning_rate": 1.8308078967080547e-05,
      "loss": 0.8153,
      "step": 330
    },
    {
      "epoch": 1.1076672104404568,
      "grad_norm": 0.0,
      "learning_rate": 1.8188485722409196e-05,
      "loss": 0.793,
      "step": 340
    },
    {
      "epoch": 1.1402936378466557,
      "grad_norm": 0.0,
      "learning_rate": 1.8065227813107667e-05,
      "loss": 0.7822,
      "step": 350
    },
    {
      "epoch": 1.1729200652528549,
      "grad_norm": 0.0,
      "learning_rate": 1.7938360401866096e-05,
      "loss": 0.7703,
      "step": 360
    },
    {
      "epoch": 1.2055464926590538,
      "grad_norm": 0.0,
      "learning_rate": 1.7807940266766595e-05,
      "loss": 0.78,
      "step": 370
    },
    {
      "epoch": 1.238172920065253,
      "grad_norm": 0.0,
      "learning_rate": 1.767402577587285e-05,
      "loss": 0.7718,
      "step": 380
    },
    {
      "epoch": 1.2707993474714518,
      "grad_norm": 0.0,
      "learning_rate": 1.7536676861108167e-05,
      "loss": 0.7895,
      "step": 390
    },
    {
      "epoch": 1.3034257748776508,
      "grad_norm": 0.0,
      "learning_rate": 1.7395954991433588e-05,
      "loss": 0.7638,
      "step": 400
    },
    {
      "epoch": 1.33605220228385,
      "grad_norm": 0.0,
      "learning_rate": 1.7251923145338175e-05,
      "loss": 0.7874,
      "step": 410
    },
    {
      "epoch": 1.368678629690049,
      "grad_norm": 0.0,
      "learning_rate": 1.710464578265369e-05,
      "loss": 0.7947,
      "step": 420
    },
    {
      "epoch": 1.401305057096248,
      "grad_norm": 0.0,
      "learning_rate": 1.6954188815706306e-05,
      "loss": 0.7811,
      "step": 430
    },
    {
      "epoch": 1.433931484502447,
      "grad_norm": 0.0,
      "learning_rate": 1.680061957981831e-05,
      "loss": 0.736,
      "step": 440
    },
    {
      "epoch": 1.466557911908646,
      "grad_norm": 0.0,
      "learning_rate": 1.6644006803172926e-05,
      "loss": 0.765,
      "step": 450
    },
    {
      "epoch": 1.499184339314845,
      "grad_norm": 0.0,
      "learning_rate": 1.6484420576055787e-05,
      "loss": 0.7608,
      "step": 460
    },
    {
      "epoch": 1.531810766721044,
      "grad_norm": 0.0,
      "learning_rate": 1.6321932319486822e-05,
      "loss": 0.779,
      "step": 470
    },
    {
      "epoch": 1.564437194127243,
      "grad_norm": 0.0,
      "learning_rate": 1.6156614753256583e-05,
      "loss": 0.7824,
      "step": 480
    },
    {
      "epoch": 1.597063621533442,
      "grad_norm": 0.0,
      "learning_rate": 1.5988541863381323e-05,
      "loss": 0.7859,
      "step": 490
    },
    {
      "epoch": 1.629690048939641,
      "grad_norm": 0.0,
      "learning_rate": 1.581778886899138e-05,
      "loss": 0.784,
      "step": 500
    },
    {
      "epoch": 1.6623164763458402,
      "grad_norm": 0.0,
      "learning_rate": 1.5644432188667695e-05,
      "loss": 0.7578,
      "step": 510
    },
    {
      "epoch": 1.6949429037520392,
      "grad_norm": 0.0,
      "learning_rate": 1.546854940624156e-05,
      "loss": 0.7779,
      "step": 520
    },
    {
      "epoch": 1.727569331158238,
      "grad_norm": 0.0,
      "learning_rate": 1.5290219236072833e-05,
      "loss": 0.7667,
      "step": 530
    },
    {
      "epoch": 1.7601957585644372,
      "grad_norm": 0.0,
      "learning_rate": 1.5109521487822208e-05,
      "loss": 0.7765,
      "step": 540
    },
    {
      "epoch": 1.7928221859706364,
      "grad_norm": 0.0,
      "learning_rate": 1.4926537030733301e-05,
      "loss": 0.8005,
      "step": 550
    },
    {
      "epoch": 1.8254486133768353,
      "grad_norm": 0.0,
      "learning_rate": 1.474134775744054e-05,
      "loss": 0.7501,
      "step": 560
    },
    {
      "epoch": 1.8580750407830342,
      "grad_norm": 0.0,
      "learning_rate": 1.4554036547319033e-05,
      "loss": 0.7968,
      "step": 570
    },
    {
      "epoch": 1.8907014681892331,
      "grad_norm": 0.0,
      "learning_rate": 1.4364687229392823e-05,
      "loss": 0.7676,
      "step": 580
    },
    {
      "epoch": 1.9233278955954323,
      "grad_norm": 0.0,
      "learning_rate": 1.417338454481818e-05,
      "loss": 0.8098,
      "step": 590
    },
    {
      "epoch": 1.9559543230016314,
      "grad_norm": 0.0,
      "learning_rate": 1.3980214108958626e-05,
      "loss": 0.7602,
      "step": 600
    },
    {
      "epoch": 1.9885807504078303,
      "grad_norm": 0.0,
      "learning_rate": 1.3785262373068742e-05,
      "loss": 0.78,
      "step": 610
    },
    {
      "epoch": 2.0195758564437196,
      "grad_norm": 0.0,
      "learning_rate": 1.3588616585603908e-05,
      "loss": 0.79,
      "step": 620
    },
    {
      "epoch": 2.0522022838499185,
      "grad_norm": 0.0,
      "learning_rate": 1.3390364753173206e-05,
      "loss": 0.7759,
      "step": 630
    },
    {
      "epoch": 2.0848287112561175,
      "grad_norm": 0.0,
      "learning_rate": 1.319059560115308e-05,
      "loss": 0.7811,
      "step": 640
    },
    {
      "epoch": 2.1174551386623164,
      "grad_norm": 0.0,
      "learning_rate": 1.2989398533979271e-05,
      "loss": 0.793,
      "step": 650
    },
    {
      "epoch": 2.1500815660685153,
      "grad_norm": 0.0,
      "learning_rate": 1.278686359513488e-05,
      "loss": 0.7435,
      "step": 660
    },
    {
      "epoch": 2.1827079934747147,
      "grad_norm": 0.0,
      "learning_rate": 1.2583081426852412e-05,
      "loss": 0.7775,
      "step": 670
    },
    {
      "epoch": 2.2153344208809136,
      "grad_norm": 0.0,
      "learning_rate": 1.237814322954788e-05,
      "loss": 0.7885,
      "step": 680
    },
    {
      "epoch": 2.2479608482871125,
      "grad_norm": 0.0,
      "learning_rate": 1.217214072100508e-05,
      "loss": 0.7745,
      "step": 690
    },
    {
      "epoch": 2.2805872756933114,
      "grad_norm": 0.0,
      "learning_rate": 1.1965166095328302e-05,
      "loss": 0.7463,
      "step": 700
    },
    {
      "epoch": 2.3132137030995104,
      "grad_norm": 0.0,
      "learning_rate": 1.1757311981681943e-05,
      "loss": 0.7962,
      "step": 710
    },
    {
      "epoch": 2.3458401305057097,
      "grad_norm": 0.0,
      "learning_rate": 1.1548671402835325e-05,
      "loss": 0.7699,
      "step": 720
    },
    {
      "epoch": 2.3784665579119086,
      "grad_norm": 0.0,
      "learning_rate": 1.1339337733531435e-05,
      "loss": 0.8087,
      "step": 730
    },
    {
      "epoch": 2.4110929853181076,
      "grad_norm": 0.0,
      "learning_rate": 1.1129404658698082e-05,
      "loss": 0.7399,
      "step": 740
    },
    {
      "epoch": 2.443719412724307,
      "grad_norm": 0.0,
      "learning_rate": 1.0918966131520276e-05,
      "loss": 0.7841,
      "step": 750
    },
    {
      "epoch": 2.476345840130506,
      "grad_norm": 0.0,
      "learning_rate": 1.0708116331392542e-05,
      "loss": 0.7998,
      "step": 760
    },
    {
      "epoch": 2.5089722675367048,
      "grad_norm": 0.0,
      "learning_rate": 1.0496949621769976e-05,
      "loss": 0.7869,
      "step": 770
    },
    {
      "epoch": 2.5415986949429037,
      "grad_norm": 0.0,
      "learning_rate": 1.0285560507936962e-05,
      "loss": 0.789,
      "step": 780
    },
    {
      "epoch": 2.5742251223491026,
      "grad_norm": 0.0,
      "learning_rate": 1.007404359471238e-05,
      "loss": 0.7694,
      "step": 790
    },
    {
      "epoch": 2.6068515497553015,
      "grad_norm": 0.0,
      "learning_rate": 9.862493544110282e-06,
      "loss": 0.7746,
      "step": 800
    },
    {
      "epoch": 2.639477977161501,
      "grad_norm": 0.0,
      "learning_rate": 9.651005032974994e-06,
      "loss": 0.7776,
      "step": 810
    },
    {
      "epoch": 2.6721044045677,
      "grad_norm": 0.0,
      "learning_rate": 9.439672710609532e-06,
      "loss": 0.8017,
      "step": 820
    },
    {
      "epoch": 2.7047308319738987,
      "grad_norm": 0.0,
      "learning_rate": 9.228591156416405e-06,
      "loss": 0.7494,
      "step": 830
    },
    {
      "epoch": 2.737357259380098,
      "grad_norm": 0.0,
      "learning_rate": 9.017854837569629e-06,
      "loss": 0.7635,
      "step": 840
    },
    {
      "epoch": 2.769983686786297,
      "grad_norm": 0.0,
      "learning_rate": 8.807558066737042e-06,
      "loss": 0.7947,
      "step": 850
    },
    {
      "epoch": 2.802610114192496,
      "grad_norm": 0.0,
      "learning_rate": 8.597794959871694e-06,
      "loss": 0.7897,
      "step": 860
    },
    {
      "epoch": 2.835236541598695,
      "grad_norm": 0.0,
      "learning_rate": 8.388659394091362e-06,
      "loss": 0.7715,
      "step": 870
    },
    {
      "epoch": 2.867862969004894,
      "grad_norm": 0.0,
      "learning_rate": 8.180244965664845e-06,
      "loss": 0.7685,
      "step": 880
    },
    {
      "epoch": 2.9004893964110927,
      "grad_norm": 0.0,
      "learning_rate": 7.97264494812405e-06,
      "loss": 0.7456,
      "step": 890
    },
    {
      "epoch": 2.933115823817292,
      "grad_norm": 0.0,
      "learning_rate": 7.765952250520459e-06,
      "loss": 0.8071,
      "step": 900
    },
    {
      "epoch": 2.965742251223491,
      "grad_norm": 0.0,
      "learning_rate": 7.560259375844719e-06,
      "loss": 0.7667,
      "step": 910
    },
    {
      "epoch": 2.99836867862969,
      "grad_norm": 0.0,
      "learning_rate": 7.355658379627981e-06,
      "loss": 0.764,
      "step": 920
    },
    {
      "epoch": 3.029363784665579,
      "grad_norm": 0.0,
      "learning_rate": 7.1522408287434774e-06,
      "loss": 0.8021,
      "step": 930
    },
    {
      "epoch": 3.061990212071778,
      "grad_norm": 0.0,
      "learning_rate": 6.950097760426814e-06,
      "loss": 0.7764,
      "step": 940
    },
    {
      "epoch": 3.094616639477977,
      "grad_norm": 0.0,
      "learning_rate": 6.74931964153325e-06,
      "loss": 0.8317,
      "step": 950
    },
    {
      "epoch": 3.1272430668841764,
      "grad_norm": 0.0,
      "learning_rate": 6.549996328050296e-06,
      "loss": 0.789,
      "step": 960
    },
    {
      "epoch": 3.1598694942903753,
      "grad_norm": 0.0,
      "learning_rate": 6.352217024883678e-06,
      "loss": 0.7928,
      "step": 970
    },
    {
      "epoch": 3.1924959216965743,
      "grad_norm": 0.0,
      "learning_rate": 6.1560702459346845e-06,
      "loss": 0.7768,
      "step": 980
    },
    {
      "epoch": 3.225122349102773,
      "grad_norm": 0.0,
      "learning_rate": 5.961643774486754e-06,
      "loss": 0.7542,
      "step": 990
    },
    {
      "epoch": 3.257748776508972,
      "grad_norm": 0.0,
      "learning_rate": 5.769024623919064e-06,
      "loss": 0.7807,
      "step": 1000
    },
    {
      "epoch": 3.2903752039151715,
      "grad_norm": 0.0,
      "learning_rate": 5.57829899876469e-06,
      "loss": 0.7849,
      "step": 1010
    },
    {
      "epoch": 3.3230016313213704,
      "grad_norm": 0.0,
      "learning_rate": 5.38955225613069e-06,
      "loss": 0.78,
      "step": 1020
    },
    {
      "epoch": 3.3556280587275693,
      "grad_norm": 0.0,
      "learning_rate": 5.202868867497542e-06,
      "loss": 0.777,
      "step": 1030
    },
    {
      "epoch": 3.3882544861337682,
      "grad_norm": 0.0,
      "learning_rate": 5.01833238091485e-06,
      "loss": 0.7735,
      "step": 1040
    },
    {
      "epoch": 3.4208809135399676,
      "grad_norm": 0.0,
      "learning_rate": 4.836025383610382e-06,
      "loss": 0.7732,
      "step": 1050
    },
    {
      "epoch": 3.4535073409461665,
      "grad_norm": 0.0,
      "learning_rate": 4.656029465029057e-06,
      "loss": 0.7516,
      "step": 1060
    },
    {
      "epoch": 3.4861337683523654,
      "grad_norm": 0.0,
      "learning_rate": 4.478425180318523e-06,
      "loss": 0.7534,
      "step": 1070
    },
    {
      "epoch": 3.5187601957585644,
      "grad_norm": 0.0,
      "learning_rate": 4.3032920142776125e-06,
      "loss": 0.7672,
      "step": 1080
    },
    {
      "epoch": 3.5513866231647633,
      "grad_norm": 0.0,
      "learning_rate": 4.1307083457838004e-06,
      "loss": 0.7406,
      "step": 1090
    },
    {
      "epoch": 3.5840130505709626,
      "grad_norm": 0.0,
      "learning_rate": 3.960751412715629e-06,
      "loss": 0.82,
      "step": 1100
    },
    {
      "epoch": 3.6166394779771616,
      "grad_norm": 0.0,
      "learning_rate": 3.7934972773857637e-06,
      "loss": 0.7934,
      "step": 1110
    },
    {
      "epoch": 3.6492659053833605,
      "grad_norm": 0.0,
      "learning_rate": 3.6290207925001585e-06,
      "loss": 0.7772,
      "step": 1120
    },
    {
      "epoch": 3.6818923327895594,
      "grad_norm": 0.0,
      "learning_rate": 3.4673955676585734e-06,
      "loss": 0.7678,
      "step": 1130
    },
    {
      "epoch": 3.7145187601957588,
      "grad_norm": 0.0,
      "learning_rate": 3.308693936411421e-06,
      "loss": 0.7717,
      "step": 1140
    },
    {
      "epoch": 3.7471451876019577,
      "grad_norm": 0.0,
      "learning_rate": 3.152986923887703e-06,
      "loss": 0.7977,
      "step": 1150
    },
    {
      "epoch": 3.7797716150081566,
      "grad_norm": 0.0,
      "learning_rate": 3.000344215008524e-06,
      "loss": 0.76,
      "step": 1160
    },
    {
      "epoch": 3.8123980424143555,
      "grad_norm": 0.0,
      "learning_rate": 2.8508341233003656e-06,
      "loss": 0.7893,
      "step": 1170
    },
    {
      "epoch": 3.8450244698205545,
      "grad_norm": 0.0,
      "learning_rate": 2.7045235603221533e-06,
      "loss": 0.7612,
      "step": 1180
    },
    {
      "epoch": 3.877650897226754,
      "grad_norm": 0.0,
      "learning_rate": 2.561478005719743e-06,
      "loss": 0.7541,
      "step": 1190
    },
    {
      "epoch": 3.9102773246329527,
      "grad_norm": 0.0,
      "learning_rate": 2.421761477921232e-06,
      "loss": 0.7643,
      "step": 1200
    },
    {
      "epoch": 3.9429037520391517,
      "grad_norm": 0.0,
      "learning_rate": 2.2854365054862383e-06,
      "loss": 0.7838,
      "step": 1210
    },
    {
      "epoch": 3.9755301794453506,
      "grad_norm": 0.0,
      "learning_rate": 2.152564099121944e-06,
      "loss": 0.788,
      "step": 1220
    },
    {
      "epoch": 4.006525285481239,
      "grad_norm": 0.0,
      "learning_rate": 2.0232037243784475e-06,
      "loss": 0.7716,
      "step": 1230
    },
    {
      "epoch": 4.039151712887439,
      "grad_norm": 0.0,
      "learning_rate": 1.8974132750356156e-06,
      "loss": 0.792,
      "step": 1240
    },
    {
      "epoch": 4.071778140293638,
      "grad_norm": 0.0,
      "learning_rate": 1.7752490471933769e-06,
      "loss": 0.768,
      "step": 1250
    },
    {
      "epoch": 4.104404567699837,
      "grad_norm": 0.0,
      "learning_rate": 1.6567657140770477e-06,
      "loss": 0.7654,
      "step": 1260
    },
    {
      "epoch": 4.137030995106036,
      "grad_norm": 0.0,
      "learning_rate": 1.542016301568926e-06,
      "loss": 0.7698,
      "step": 1270
    },
    {
      "epoch": 4.169657422512235,
      "grad_norm": 0.0,
      "learning_rate": 1.4310521644771657e-06,
      "loss": 0.745,
      "step": 1280
    },
    {
      "epoch": 4.202283849918434,
      "grad_norm": 0.0,
      "learning_rate": 1.3239229635525074e-06,
      "loss": 0.7774,
      "step": 1290
    },
    {
      "epoch": 4.234910277324633,
      "grad_norm": 0.0,
      "learning_rate": 1.2206766432631766e-06,
      "loss": 0.7848,
      "step": 1300
    },
    {
      "epoch": 4.267536704730832,
      "grad_norm": 0.0,
      "learning_rate": 1.121359410337859e-06,
      "loss": 0.7814,
      "step": 1310
    },
    {
      "epoch": 4.300163132137031,
      "grad_norm": 0.0,
      "learning_rate": 1.0260157130864178e-06,
      "loss": 0.809,
      "step": 1320
    },
    {
      "epoch": 4.33278955954323,
      "grad_norm": 0.0,
      "learning_rate": 9.346882215075348e-07,
      "loss": 0.7976,
      "step": 1330
    },
    {
      "epoch": 4.365415986949429,
      "grad_norm": 0.0,
      "learning_rate": 8.474178081922524e-07,
      "loss": 0.7825,
      "step": 1340
    },
    {
      "epoch": 4.398042414355628,
      "grad_norm": 0.0,
      "learning_rate": 7.642435300318906e-07,
      "loss": 0.7712,
      "step": 1350
    },
    {
      "epoch": 4.430668841761827,
      "grad_norm": 0.0,
      "learning_rate": 6.852026107385756e-07,
      "loss": 0.7711,
      "step": 1360
    },
    {
      "epoch": 4.463295269168026,
      "grad_norm": 0.0,
      "learning_rate": 6.103304241862006e-07,
      "loss": 0.7903,
      "step": 1370
    },
    {
      "epoch": 4.495921696574225,
      "grad_norm": 0.0,
      "learning_rate": 5.396604785792281e-07,
      "loss": 0.7527,
      "step": 1380
    },
    {
      "epoch": 4.528548123980424,
      "grad_norm": 0.0,
      "learning_rate": 4.7322440145647905e-07,
      "loss": 0.7781,
      "step": 1390
    },
    {
      "epoch": 4.561174551386623,
      "grad_norm": 0.0,
      "learning_rate": 4.110519255365852e-07,
      "loss": 0.8016,
      "step": 1400
    },
    {
      "epoch": 4.593800978792823,
      "grad_norm": 0.0,
      "learning_rate": 3.531708754114438e-07,
      "loss": 0.7768,
      "step": 1410
    },
    {
      "epoch": 4.626427406199021,
      "grad_norm": 0.0,
      "learning_rate": 2.996071550936319e-07,
      "loss": 0.7688,
      "step": 1420
    },
    {
      "epoch": 4.6590538336052205,
      "grad_norm": 0.0,
      "learning_rate": 2.503847364233614e-07,
      "loss": 0.8049,
      "step": 1430
    },
    {
      "epoch": 4.691680261011419,
      "grad_norm": 0.0,
      "learning_rate": 2.0552564834014797e-07,
      "loss": 0.7818,
      "step": 1440
    },
    {
      "epoch": 4.724306688417618,
      "grad_norm": 0.0,
      "learning_rate": 1.6504996702401243e-07,
      "loss": 0.7737,
      "step": 1450
    },
    {
      "epoch": 4.756933115823817,
      "grad_norm": 0.0,
      "learning_rate": 1.2897580691060506e-07,
      "loss": 0.8014,
      "step": 1460
    },
    {
      "epoch": 4.789559543230016,
      "grad_norm": 0.0,
      "learning_rate": 9.731931258429638e-08,
      "loss": 0.7563,
      "step": 1470
    },
    {
      "epoch": 4.822185970636215,
      "grad_norm": 0.0,
      "learning_rate": 7.009465155285777e-08,
      "loss": 0.7504,
      "step": 1480
    },
    {
      "epoch": 4.854812398042414,
      "grad_norm": 0.0,
      "learning_rate": 4.731400790693785e-08,
      "loss": 0.7879,
      "step": 1490
    },
    {
      "epoch": 4.887438825448614,
      "grad_norm": 0.0,
      "learning_rate": 2.898757686722542e-08,
      "loss": 0.7755,
      "step": 1500
    },
    {
      "epoch": 4.920065252854813,
      "grad_norm": 0.0,
      "learning_rate": 1.5123560221681488e-08,
      "loss": 0.7803,
      "step": 1510
    },
    {
      "epoch": 4.952691680261012,
      "grad_norm": 0.0,
      "learning_rate": 5.728162654927705e-09,
      "loss": 0.7493,
      "step": 1520
    },
    {
      "epoch": 4.985318107667211,
      "grad_norm": 0.0,
      "learning_rate": 8.05588971406479e-10,
      "loss": 0.7814,
      "step": 1530
    }
  ],
  "logging_steps": 10,
  "max_steps": 1535,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.877356359182975e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}