1426 lines
32 KiB
JSON
1426 lines
32 KiB
JSON
|
|
[
|
||
|
|
{
|
||
|
|
"loss": 3.2371,
|
||
|
|
"grad_norm": 4.275357246398926,
|
||
|
|
"learning_rate": 3.870967741935484e-05,
|
||
|
|
"epoch": 0.010810810810810811,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0982,
|
||
|
|
"grad_norm": 1.6401941776275635,
|
||
|
|
"learning_rate": 8.172043010752689e-05,
|
||
|
|
"epoch": 0.021621621621621623,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7814,
|
||
|
|
"grad_norm": 1.0990010499954224,
|
||
|
|
"learning_rate": 0.00012473118279569893,
|
||
|
|
"epoch": 0.032432432432432434,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8768,
|
||
|
|
"grad_norm": 1.2032235860824585,
|
||
|
|
"learning_rate": 0.00016774193548387098,
|
||
|
|
"epoch": 0.043243243243243246,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7967,
|
||
|
|
"grad_norm": 0.8312140107154846,
|
||
|
|
"learning_rate": 0.000210752688172043,
|
||
|
|
"epoch": 0.05405405405405406,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7088,
|
||
|
|
"grad_norm": 0.7974215149879456,
|
||
|
|
"learning_rate": 0.00025376344086021504,
|
||
|
|
"epoch": 0.06486486486486487,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7921,
|
||
|
|
"grad_norm": 0.8754441142082214,
|
||
|
|
"learning_rate": 0.0002967741935483871,
|
||
|
|
"epoch": 0.07567567567567568,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7169,
|
||
|
|
"grad_norm": 0.9818633794784546,
|
||
|
|
"learning_rate": 0.00033978494623655914,
|
||
|
|
"epoch": 0.08648648648648649,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8469,
|
||
|
|
"grad_norm": 1.0391249656677246,
|
||
|
|
"learning_rate": 0.0003827956989247312,
|
||
|
|
"epoch": 0.0972972972972973,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.763,
|
||
|
|
"grad_norm": 1.2312983274459839,
|
||
|
|
"learning_rate": 0.00039998849055034085,
|
||
|
|
"epoch": 0.10810810810810811,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9223,
|
||
|
|
"grad_norm": 1.136441707611084,
|
||
|
|
"learning_rate": 0.00039991815982176333,
|
||
|
|
"epoch": 0.11891891891891893,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9628,
|
||
|
|
"grad_norm": 0.9119946360588074,
|
||
|
|
"learning_rate": 0.0003997839149608889,
|
||
|
|
"epoch": 0.12972972972972974,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.777016043663025,
|
||
|
|
"eval_runtime": 18.3656,
|
||
|
|
"eval_samples_per_second": 42.416,
|
||
|
|
"eval_steps_per_second": 10.618,
|
||
|
|
"epoch": 0.13297297297297297,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6845,
|
||
|
|
"grad_norm": 1.4189170598983765,
|
||
|
|
"learning_rate": 0.00039958579888599896,
|
||
|
|
"epoch": 0.14054054054054055,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8529,
|
||
|
|
"grad_norm": 1.0813618898391724,
|
||
|
|
"learning_rate": 0.00039932387493509636,
|
||
|
|
"epoch": 0.15135135135135136,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7712,
|
||
|
|
"grad_norm": 1.0759323835372925,
|
||
|
|
"learning_rate": 0.00039899822684565697,
|
||
|
|
"epoch": 0.16216216216216217,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7869,
|
||
|
|
"grad_norm": 1.0583269596099854,
|
||
|
|
"learning_rate": 0.00039860895872785806,
|
||
|
|
"epoch": 0.17297297297297298,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8945,
|
||
|
|
"grad_norm": 1.1669530868530273,
|
||
|
|
"learning_rate": 0.0003981561950312943,
|
||
|
|
"epoch": 0.1837837837837838,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8797,
|
||
|
|
"grad_norm": 1.0436373949050903,
|
||
|
|
"learning_rate": 0.0003976400805051915,
|
||
|
|
"epoch": 0.1945945945945946,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9332,
|
||
|
|
"grad_norm": 0.8406238555908203,
|
||
|
|
"learning_rate": 0.00039706078015212907,
|
||
|
|
"epoch": 0.20540540540540542,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7786,
|
||
|
|
"grad_norm": 1.1354097127914429,
|
||
|
|
"learning_rate": 0.0003964184791752895,
|
||
|
|
"epoch": 0.21621621621621623,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8089,
|
||
|
|
"grad_norm": 1.4123671054840088,
|
||
|
|
"learning_rate": 0.0003957133829192479,
|
||
|
|
"epoch": 0.22702702702702704,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7916,
|
||
|
|
"grad_norm": 0.9333382248878479,
|
||
|
|
"learning_rate": 0.00039494571680432364,
|
||
|
|
"epoch": 0.23783783783783785,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7808,
|
||
|
|
"grad_norm": 1.0521595478057861,
|
||
|
|
"learning_rate": 0.0003941157262545123,
|
||
|
|
"epoch": 0.24864864864864866,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7977,
|
||
|
|
"grad_norm": 1.2558528184890747,
|
||
|
|
"learning_rate": 0.00039322367661902426,
|
||
|
|
"epoch": 0.2594594594594595,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7805718183517456,
|
||
|
|
"eval_runtime": 10.1271,
|
||
|
|
"eval_samples_per_second": 76.922,
|
||
|
|
"eval_steps_per_second": 19.255,
|
||
|
|
"epoch": 0.26594594594594595,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8482,
|
||
|
|
"grad_norm": 1.1523817777633667,
|
||
|
|
"learning_rate": 0.00039226985308745137,
|
||
|
|
"epoch": 0.2702702702702703,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.685,
|
||
|
|
"grad_norm": 0.9244216084480286,
|
||
|
|
"learning_rate": 0.00039125456059859175,
|
||
|
|
"epoch": 0.2810810810810811,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8774,
|
||
|
|
"grad_norm": 1.1236217021942139,
|
||
|
|
"learning_rate": 0.0003901781237429604,
|
||
|
|
"epoch": 0.2918918918918919,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9237,
|
||
|
|
"grad_norm": 1.112891435623169,
|
||
|
|
"learning_rate": 0.0003890408866590171,
|
||
|
|
"epoch": 0.3027027027027027,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7769,
|
||
|
|
"grad_norm": 1.0233204364776611,
|
||
|
|
"learning_rate": 0.00038784321292314485,
|
||
|
|
"epoch": 0.31351351351351353,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8192,
|
||
|
|
"grad_norm": 1.1586676836013794,
|
||
|
|
"learning_rate": 0.00038658548543341384,
|
||
|
|
"epoch": 0.32432432432432434,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7718,
|
||
|
|
"grad_norm": 1.034834384918213,
|
||
|
|
"learning_rate": 0.00038526810628716854,
|
||
|
|
"epoch": 0.33513513513513515,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6869,
|
||
|
|
"grad_norm": 1.1128815412521362,
|
||
|
|
"learning_rate": 0.0003838914966524765,
|
||
|
|
"epoch": 0.34594594594594597,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8987,
|
||
|
|
"grad_norm": 1.048621654510498,
|
||
|
|
"learning_rate": 0.00038245609663348034,
|
||
|
|
"epoch": 0.3567567567567568,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7818,
|
||
|
|
"grad_norm": 1.3258867263793945,
|
||
|
|
"learning_rate": 0.00038096236512969556,
|
||
|
|
"epoch": 0.3675675675675676,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7062,
|
||
|
|
"grad_norm": 0.9586314558982849,
|
||
|
|
"learning_rate": 0.0003794107796893002,
|
||
|
|
"epoch": 0.3783783783783784,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8131,
|
||
|
|
"grad_norm": 1.0099109411239624,
|
||
|
|
"learning_rate": 0.00037780183635646145,
|
||
|
|
"epoch": 0.3891891891891892,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7687468528747559,
|
||
|
|
"eval_runtime": 10.0798,
|
||
|
|
"eval_samples_per_second": 77.283,
|
||
|
|
"eval_steps_per_second": 19.346,
|
||
|
|
"epoch": 0.3989189189189189,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8824,
|
||
|
|
"grad_norm": 1.201002597808838,
|
||
|
|
"learning_rate": 0.00037613604951274986,
|
||
|
|
"epoch": 0.4,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8594,
|
||
|
|
"grad_norm": 0.9146278500556946,
|
||
|
|
"learning_rate": 0.0003744139517126908,
|
||
|
|
"epoch": 0.41081081081081083,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7923,
|
||
|
|
"grad_norm": 1.1093569993972778,
|
||
|
|
"learning_rate": 0.00037263609351350583,
|
||
|
|
"epoch": 0.42162162162162165,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9701,
|
||
|
|
"grad_norm": 0.9460511207580566,
|
||
|
|
"learning_rate": 0.0003708030432990989,
|
||
|
|
"epoch": 0.43243243243243246,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7968,
|
||
|
|
"grad_norm": 1.1481722593307495,
|
||
|
|
"learning_rate": 0.0003689153870983431,
|
||
|
|
"epoch": 0.44324324324324327,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7019,
|
||
|
|
"grad_norm": 1.1272804737091064,
|
||
|
|
"learning_rate": 0.00036697372839772634,
|
||
|
|
"epoch": 0.4540540540540541,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7139,
|
||
|
|
"grad_norm": 0.8615907430648804,
|
||
|
|
"learning_rate": 0.000364978687948416,
|
||
|
|
"epoch": 0.4648648648648649,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7769,
|
||
|
|
"grad_norm": 1.0832351446151733,
|
||
|
|
"learning_rate": 0.0003629309035678035,
|
||
|
|
"epoch": 0.4756756756756757,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8117,
|
||
|
|
"grad_norm": 1.0243345499038696,
|
||
|
|
"learning_rate": 0.00036083102993559343,
|
||
|
|
"epoch": 0.4864864864864865,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7035,
|
||
|
|
"grad_norm": 0.9396358728408813,
|
||
|
|
"learning_rate": 0.00035867973838450153,
|
||
|
|
"epoch": 0.4972972972972973,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9568,
|
||
|
|
"grad_norm": 0.9557101130485535,
|
||
|
|
"learning_rate": 0.0003564777166856282,
|
||
|
|
"epoch": 0.5081081081081081,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9079,
|
||
|
|
"grad_norm": 1.1307172775268555,
|
||
|
|
"learning_rate": 0.00035422566882857765,
|
||
|
|
"epoch": 0.518918918918919,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8791,
|
||
|
|
"grad_norm": 1.2252289056777954,
|
||
|
|
"learning_rate": 0.0003519243147963909,
|
||
|
|
"epoch": 0.5297297297297298,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7642391920089722,
|
||
|
|
"eval_runtime": 10.1201,
|
||
|
|
"eval_samples_per_second": 76.976,
|
||
|
|
"eval_steps_per_second": 19.269,
|
||
|
|
"epoch": 0.5318918918918919,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7498,
|
||
|
|
"grad_norm": 0.9916505813598633,
|
||
|
|
"learning_rate": 0.00034957439033536647,
|
||
|
|
"epoch": 0.5405405405405406,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8562,
|
||
|
|
"grad_norm": 1.2275047302246094,
|
||
|
|
"learning_rate": 0.0003471766467198408,
|
||
|
|
"epoch": 0.5513513513513514,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7812,
|
||
|
|
"grad_norm": 0.9753154516220093,
|
||
|
|
"learning_rate": 0.00034473185051200515,
|
||
|
|
"epoch": 0.5621621621621622,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 2.0087,
|
||
|
|
"grad_norm": 1.2194623947143555,
|
||
|
|
"learning_rate": 0.0003422407833168343,
|
||
|
|
"epoch": 0.572972972972973,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8641,
|
||
|
|
"grad_norm": 1.1282182931900024,
|
||
|
|
"learning_rate": 0.00033970424153220637,
|
||
|
|
"epoch": 0.5837837837837838,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8962,
|
||
|
|
"grad_norm": 1.3077672719955444,
|
||
|
|
"learning_rate": 0.0003371230360942931,
|
||
|
|
"epoch": 0.5945945945945946,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7113,
|
||
|
|
"grad_norm": 1.1093400716781616,
|
||
|
|
"learning_rate": 0.0003344979922183026,
|
||
|
|
"epoch": 0.6054054054054054,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8013,
|
||
|
|
"grad_norm": 1.0412172079086304,
|
||
|
|
"learning_rate": 0.0003318299491346565,
|
||
|
|
"epoch": 0.6162162162162163,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8316,
|
||
|
|
"grad_norm": 1.1250932216644287,
|
||
|
|
"learning_rate": 0.00032911975982068706,
|
||
|
|
"epoch": 0.6270270270270271,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7729,
|
||
|
|
"grad_norm": 0.971480131149292,
|
||
|
|
"learning_rate": 0.0003263682907279387,
|
||
|
|
"epoch": 0.6378378378378379,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.745,
|
||
|
|
"grad_norm": 1.1424800157546997,
|
||
|
|
"learning_rate": 0.00032357642150516265,
|
||
|
|
"epoch": 0.6486486486486487,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6717,
|
||
|
|
"grad_norm": 1.3536049127578735,
|
||
|
|
"learning_rate": 0.00032074504471709146,
|
||
|
|
"epoch": 0.6594594594594595,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7533202171325684,
|
||
|
|
"eval_runtime": 10.0831,
|
||
|
|
"eval_samples_per_second": 77.258,
|
||
|
|
"eval_steps_per_second": 19.339,
|
||
|
|
"epoch": 0.6648648648648648,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7822,
|
||
|
|
"grad_norm": 0.8749492168426514,
|
||
|
|
"learning_rate": 0.0003178750655590848,
|
||
|
|
"epoch": 0.6702702702702703,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8368,
|
||
|
|
"grad_norm": 3.0736031532287598,
|
||
|
|
"learning_rate": 0.00031496740156773776,
|
||
|
|
"epoch": 0.6810810810810811,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7322,
|
||
|
|
"grad_norm": 1.288352131843567,
|
||
|
|
"learning_rate": 0.00031202298232754186,
|
||
|
|
"epoch": 0.6918918918918919,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8685,
|
||
|
|
"grad_norm": 1.0477159023284912,
|
||
|
|
"learning_rate": 0.00030904274917369686,
|
||
|
|
"epoch": 0.7027027027027027,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7483,
|
||
|
|
"grad_norm": 0.9655544757843018,
|
||
|
|
"learning_rate": 0.0003060276548911634,
|
||
|
|
"epoch": 0.7135135135135136,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8099,
|
||
|
|
"grad_norm": 1.1260396242141724,
|
||
|
|
"learning_rate": 0.00030297866341005684,
|
||
|
|
"epoch": 0.7243243243243244,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6145,
|
||
|
|
"grad_norm": 1.1371850967407227,
|
||
|
|
"learning_rate": 0.0002998967494974774,
|
||
|
|
"epoch": 0.7351351351351352,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8311,
|
||
|
|
"grad_norm": 0.9440209865570068,
|
||
|
|
"learning_rate": 0.0002967828984458751,
|
||
|
|
"epoch": 0.745945945945946,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9393,
|
||
|
|
"grad_norm": 1.3496946096420288,
|
||
|
|
"learning_rate": 0.00029363810575805106,
|
||
|
|
"epoch": 0.7567567567567568,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9767,
|
||
|
|
"grad_norm": 1.0028049945831299,
|
||
|
|
"learning_rate": 0.00029046337682889315,
|
||
|
|
"epoch": 0.7675675675675676,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8328,
|
||
|
|
"grad_norm": 1.1777056455612183,
|
||
|
|
"learning_rate": 0.00028725972662395013,
|
||
|
|
"epoch": 0.7783783783783784,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7484,
|
||
|
|
"grad_norm": 1.2826964855194092,
|
||
|
|
"learning_rate": 0.00028402817935494547,
|
||
|
|
"epoch": 0.7891891891891892,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7475706338882446,
|
||
|
|
"eval_runtime": 10.0081,
|
||
|
|
"eval_samples_per_second": 77.837,
|
||
|
|
"eval_steps_per_second": 19.484,
|
||
|
|
"epoch": 0.7978378378378378,
|
||
|
|
"step": 738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9079,
|
||
|
|
"grad_norm": 1.1097257137298584,
|
||
|
|
"learning_rate": 0.00028076976815233546,
|
||
|
|
"epoch": 0.8,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7847,
|
||
|
|
"grad_norm": 1.1187055110931396,
|
||
|
|
"learning_rate": 0.00027748553473501593,
|
||
|
|
"epoch": 0.8108108108108109,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6747,
|
||
|
|
"grad_norm": 1.182005524635315,
|
||
|
|
"learning_rate": 0.00027417652907728274,
|
||
|
|
"epoch": 0.8216216216216217,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7653,
|
||
|
|
"grad_norm": 0.9777538180351257,
|
||
|
|
"learning_rate": 0.000270843809073154,
|
||
|
|
"epoch": 0.8324324324324325,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7749,
|
||
|
|
"grad_norm": 1.1285064220428467,
|
||
|
|
"learning_rate": 0.0002674884401981597,
|
||
|
|
"epoch": 0.8432432432432433,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7904,
|
||
|
|
"grad_norm": 0.9783152937889099,
|
||
|
|
"learning_rate": 0.000264111495168707,
|
||
|
|
"epoch": 0.8540540540540541,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6915,
|
||
|
|
"grad_norm": 1.107857346534729,
|
||
|
|
"learning_rate": 0.0002607140535991321,
|
||
|
|
"epoch": 0.8648648648648649,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7857,
|
||
|
|
"grad_norm": 1.2584813833236694,
|
||
|
|
"learning_rate": 0.0002572972016565451,
|
||
|
|
"epoch": 0.8756756756756757,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8468,
|
||
|
|
"grad_norm": 1.2436493635177612,
|
||
|
|
"learning_rate": 0.00025386203171358157,
|
||
|
|
"epoch": 0.8864864864864865,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9164,
|
||
|
|
"grad_norm": 1.624140739440918,
|
||
|
|
"learning_rate": 0.00025040964199916856,
|
||
|
|
"epoch": 0.8972972972972973,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8009,
|
||
|
|
"grad_norm": 1.0699501037597656,
|
||
|
|
"learning_rate": 0.0002469411362474199,
|
||
|
|
"epoch": 0.9081081081081082,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6318,
|
||
|
|
"grad_norm": 0.9692312479019165,
|
||
|
|
"learning_rate": 0.0002434576233447703,
|
||
|
|
"epoch": 0.918918918918919,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.752,
|
||
|
|
"grad_norm": 0.9754092693328857,
|
||
|
|
"learning_rate": 0.000239960216975463,
|
||
|
|
"epoch": 0.9297297297297298,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7383391857147217,
|
||
|
|
"eval_runtime": 10.0527,
|
||
|
|
"eval_samples_per_second": 77.491,
|
||
|
|
"eval_steps_per_second": 19.398,
|
||
|
|
"epoch": 0.9308108108108109,
|
||
|
|
"step": 861
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.9364,
|
||
|
|
"grad_norm": 1.0026895999908447,
|
||
|
|
"learning_rate": 0.00023645003526550292,
|
||
|
|
"epoch": 0.9405405405405406,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8438,
|
||
|
|
"grad_norm": 1.269220232963562,
|
||
|
|
"learning_rate": 0.00023292820042519066,
|
||
|
|
"epoch": 0.9513513513513514,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7952,
|
||
|
|
"grad_norm": 1.0278656482696533,
|
||
|
|
"learning_rate": 0.00022939583839034965,
|
||
|
|
"epoch": 0.9621621621621622,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6568,
|
||
|
|
"grad_norm": 0.9819965958595276,
|
||
|
|
"learning_rate": 0.0002258540784623631,
|
||
|
|
"epoch": 0.972972972972973,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.8287,
|
||
|
|
"grad_norm": 1.1272140741348267,
|
||
|
|
"learning_rate": 0.00022230405294713465,
|
||
|
|
"epoch": 0.9837837837837838,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7379,
|
||
|
|
"grad_norm": 1.1125059127807617,
|
||
|
|
"learning_rate": 0.0002187468967930883,
|
||
|
|
"epoch": 0.9945945945945946,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.7004,
|
||
|
|
"grad_norm": 1.0192606449127197,
|
||
|
|
"learning_rate": 0.000215183747228324,
|
||
|
|
"epoch": 1.0054054054054054,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5612,
|
||
|
|
"grad_norm": 0.9857641458511353,
|
||
|
|
"learning_rate": 0.000211615743397044,
|
||
|
|
"epoch": 1.0162162162162163,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2828,
|
||
|
|
"grad_norm": 1.0608668327331543,
|
||
|
|
"learning_rate": 0.00020804402599536661,
|
||
|
|
"epoch": 1.027027027027027,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4035,
|
||
|
|
"grad_norm": 1.485253930091858,
|
||
|
|
"learning_rate": 0.0002044697369066443,
|
||
|
|
"epoch": 1.037837837837838,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5254,
|
||
|
|
"grad_norm": 0.9453800320625305,
|
||
|
|
"learning_rate": 0.0002008940188364015,
|
||
|
|
"epoch": 1.0486486486486486,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4442,
|
||
|
|
"grad_norm": 1.1382359266281128,
|
||
|
|
"learning_rate": 0.00019731801494701044,
|
||
|
|
"epoch": 1.0594594594594595,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7562943696975708,
|
||
|
|
"eval_runtime": 10.0133,
|
||
|
|
"eval_samples_per_second": 77.797,
|
||
|
|
"eval_steps_per_second": 19.474,
|
||
|
|
"epoch": 1.0637837837837838,
|
||
|
|
"step": 984
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6187,
|
||
|
|
"grad_norm": 1.2494144439697266,
|
||
|
|
"learning_rate": 0.0001937428684922197,
|
||
|
|
"epoch": 1.0702702702702702,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.6528,
|
||
|
|
"grad_norm": 0.9464777708053589,
|
||
|
|
"learning_rate": 0.00019016972245165526,
|
||
|
|
"epoch": 1.0810810810810811,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4064,
|
||
|
|
"grad_norm": 0.9740603566169739,
|
||
|
|
"learning_rate": 0.0001865997191654074,
|
||
|
|
"epoch": 1.0918918918918918,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3695,
|
||
|
|
"grad_norm": 1.2424192428588867,
|
||
|
|
"learning_rate": 0.00018303399996882325,
|
||
|
|
"epoch": 1.1027027027027028,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4764,
|
||
|
|
"grad_norm": 1.0215702056884766,
|
||
|
|
"learning_rate": 0.00017947370482762005,
|
||
|
|
"epoch": 1.1135135135135135,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5442,
|
||
|
|
"grad_norm": 1.0910210609436035,
|
||
|
|
"learning_rate": 0.00017591997197343657,
|
||
|
|
"epoch": 1.1243243243243244,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5187,
|
||
|
|
"grad_norm": 1.1207563877105713,
|
||
|
|
"learning_rate": 0.00017237393753993875,
|
||
|
|
"epoch": 1.135135135135135,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4571,
|
||
|
|
"grad_norm": 1.0761910676956177,
|
||
|
|
"learning_rate": 0.0001688367351995959,
|
||
|
|
"epoch": 1.145945945945946,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3436,
|
||
|
|
"grad_norm": 0.9719659090042114,
|
||
|
|
"learning_rate": 0.00016530949580124404,
|
||
|
|
"epoch": 1.1567567567567567,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5315,
|
||
|
|
"grad_norm": 1.0876080989837646,
|
||
|
|
"learning_rate": 0.00016179334700855189,
|
||
|
|
"epoch": 1.1675675675675676,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.369,
|
||
|
|
"grad_norm": 1.1940348148345947,
|
||
|
|
"learning_rate": 0.0001582894129395051,
|
||
|
|
"epoch": 1.1783783783783783,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4257,
|
||
|
|
"grad_norm": 1.1275503635406494,
|
||
|
|
"learning_rate": 0.00015479881380702415,
|
||
|
|
"epoch": 1.1891891891891893,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.763828158378601,
|
||
|
|
"eval_runtime": 9.9983,
|
||
|
|
"eval_samples_per_second": 77.914,
|
||
|
|
"eval_steps_per_second": 19.503,
|
||
|
|
"epoch": 1.1967567567567567,
|
||
|
|
"step": 1107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3932,
|
||
|
|
"grad_norm": 1.2186589241027832,
|
||
|
|
"learning_rate": 0.00015132266556083018,
|
||
|
|
"epoch": 1.2,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5195,
|
||
|
|
"grad_norm": 1.040711760520935,
|
||
|
|
"learning_rate": 0.00014786207953067492,
|
||
|
|
"epoch": 1.2108108108108109,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3596,
|
||
|
|
"grad_norm": 1.1564419269561768,
|
||
|
|
"learning_rate": 0.00014441816207104636,
|
||
|
|
"epoch": 1.2216216216216216,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.539,
|
||
|
|
"grad_norm": 0.9457581639289856,
|
||
|
|
"learning_rate": 0.00014099201420746585,
|
||
|
|
"epoch": 1.2324324324324325,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4309,
|
||
|
|
"grad_norm": 1.2473818063735962,
|
||
|
|
"learning_rate": 0.00013758473128448837,
|
||
|
|
"epoch": 1.2432432432432432,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.513,
|
||
|
|
"grad_norm": 1.0576856136322021,
|
||
|
|
"learning_rate": 0.0001341974026155195,
|
||
|
|
"epoch": 1.2540540540540541,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5212,
|
||
|
|
"grad_norm": 1.021657943725586,
|
||
|
|
"learning_rate": 0.00013083111113456025,
|
||
|
|
"epoch": 1.2648648648648648,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4125,
|
||
|
|
"grad_norm": 1.4797037839889526,
|
||
|
|
"learning_rate": 0.0001274869330499914,
|
||
|
|
"epoch": 1.2756756756756757,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4741,
|
||
|
|
"grad_norm": 1.4238656759262085,
|
||
|
|
"learning_rate": 0.00012416593750050803,
|
||
|
|
"epoch": 1.2864864864864864,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5072,
|
||
|
|
"grad_norm": 1.0679641962051392,
|
||
|
|
"learning_rate": 0.00012086918621331431,
|
||
|
|
"epoch": 1.2972972972972974,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3807,
|
||
|
|
"grad_norm": 1.5260353088378906,
|
||
|
|
"learning_rate": 0.00011759773316468794,
|
||
|
|
"epoch": 1.308108108108108,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4398,
|
||
|
|
"grad_norm": 1.005669355392456,
|
||
|
|
"learning_rate": 0.00011435262424302224,
|
||
|
|
"epoch": 1.318918918918919,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4314,
|
||
|
|
"grad_norm": 1.116134762763977,
|
||
|
|
"learning_rate": 0.00011113489691445385,
|
||
|
|
"epoch": 1.3297297297297297,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7593566179275513,
|
||
|
|
"eval_runtime": 9.9147,
|
||
|
|
"eval_samples_per_second": 78.57,
|
||
|
|
"eval_steps_per_second": 19.668,
|
||
|
|
"epoch": 1.3297297297297297,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4174,
|
||
|
|
"grad_norm": 1.100644826889038,
|
||
|
|
"learning_rate": 0.00010794557989118352,
|
||
|
|
"epoch": 1.3405405405405406,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3901,
|
||
|
|
"grad_norm": 0.9467904567718506,
|
||
|
|
"learning_rate": 0.00010478569280259542,
|
||
|
|
"epoch": 1.3513513513513513,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5013,
|
||
|
|
"grad_norm": 1.2005168199539185,
|
||
|
|
"learning_rate": 0.00010165624586927987,
|
||
|
|
"epoch": 1.3621621621621622,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4634,
|
||
|
|
"grad_norm": 1.0398645401000977,
|
||
|
|
"learning_rate": 9.855823958006427e-05,
|
||
|
|
"epoch": 1.372972972972973,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4728,
|
||
|
|
"grad_norm": 1.1238207817077637,
|
||
|
|
"learning_rate": 9.549266437215549e-05,
|
||
|
|
"epoch": 1.3837837837837839,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.453,
|
||
|
|
"grad_norm": 1.067688226699829,
|
||
|
|
"learning_rate": 9.246050031449569e-05,
|
||
|
|
"epoch": 1.3945945945945946,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.432,
|
||
|
|
"grad_norm": 1.1034791469573975,
|
||
|
|
"learning_rate": 8.946271679443276e-05,
|
||
|
|
"epoch": 1.4054054054054055,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3956,
|
||
|
|
"grad_norm": 1.4038920402526855,
|
||
|
|
"learning_rate": 8.650027220780555e-05,
|
||
|
|
"epoch": 1.4162162162162162,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3489,
|
||
|
|
"grad_norm": 1.0994772911071777,
|
||
|
|
"learning_rate": 8.357411365254341e-05,
|
||
|
|
"epoch": 1.427027027027027,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3385,
|
||
|
|
"grad_norm": 1.1797088384628296,
|
||
|
|
"learning_rate": 8.068517662587798e-05,
|
||
|
|
"epoch": 1.4378378378378378,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3012,
|
||
|
|
"grad_norm": 1.1310184001922607,
|
||
|
|
"learning_rate": 7.783438472526257e-05,
|
||
|
|
"epoch": 1.4486486486486487,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4044,
|
||
|
|
"grad_norm": 1.3859984874725342,
|
||
|
|
"learning_rate": 7.502264935309742e-05,
|
||
|
|
"epoch": 1.4594594594594594,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7528764009475708,
|
||
|
|
"eval_runtime": 10.0314,
|
||
|
|
"eval_samples_per_second": 77.656,
|
||
|
|
"eval_steps_per_second": 19.439,
|
||
|
|
"epoch": 1.4627027027027026,
|
||
|
|
"step": 1353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5262,
|
||
|
|
"grad_norm": 1.2141237258911133,
|
||
|
|
"learning_rate": 7.225086942535244e-05,
|
||
|
|
"epoch": 1.4702702702702704,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4401,
|
||
|
|
"grad_norm": 1.1930843591690063,
|
||
|
|
"learning_rate": 6.95199310841829e-05,
|
||
|
|
"epoch": 1.481081081081081,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3915,
|
||
|
|
"grad_norm": 1.0784533023834229,
|
||
|
|
"learning_rate": 6.6830707414628e-05,
|
||
|
|
"epoch": 1.491891891891892,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5117,
|
||
|
|
"grad_norm": 1.2977006435394287,
|
||
|
|
"learning_rate": 6.41840581654848e-05,
|
||
|
|
"epoch": 1.5027027027027027,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5385,
|
||
|
|
"grad_norm": 1.091192603111267,
|
||
|
|
"learning_rate": 6.158082947444484e-05,
|
||
|
|
"epoch": 1.5135135135135136,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2558,
|
||
|
|
"grad_norm": 1.2064927816390991,
|
||
|
|
"learning_rate": 5.902185359758272e-05,
|
||
|
|
"epoch": 1.5243243243243243,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3401,
|
||
|
|
"grad_norm": 1.18263578414917,
|
||
|
|
"learning_rate": 5.6507948643282905e-05,
|
||
|
|
"epoch": 1.535135135135135,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4368,
|
||
|
|
"grad_norm": 1.0201761722564697,
|
||
|
|
"learning_rate": 5.4039918310688995e-05,
|
||
|
|
"epoch": 1.545945945945946,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5864,
|
||
|
|
"grad_norm": 1.0474286079406738,
|
||
|
|
"learning_rate": 5.1618551632759904e-05,
|
||
|
|
"epoch": 1.5567567567567568,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3581,
|
||
|
|
"grad_norm": 0.9824125170707703,
|
||
|
|
"learning_rate": 4.924462272401484e-05,
|
||
|
|
"epoch": 1.5675675675675675,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4015,
|
||
|
|
"grad_norm": 1.190414547920227,
|
||
|
|
"learning_rate": 4.6918890533048034e-05,
|
||
|
|
"epoch": 1.5783783783783782,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.511,
|
||
|
|
"grad_norm": 1.0979052782058716,
|
||
|
|
"learning_rate": 4.464209859989146e-05,
|
||
|
|
"epoch": 1.5891891891891892,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7506372928619385,
|
||
|
|
"eval_runtime": 10.0066,
|
||
|
|
"eval_samples_per_second": 77.849,
|
||
|
|
"eval_steps_per_second": 19.487,
|
||
|
|
"epoch": 1.5956756756756758,
|
||
|
|
"step": 1476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.419,
|
||
|
|
"grad_norm": 1.2529748678207397,
|
||
|
|
"learning_rate": 4.241497481830396e-05,
|
||
|
|
"epoch": 1.6,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2973,
|
||
|
|
"grad_norm": 1.1004537343978882,
|
||
|
|
"learning_rate": 4.023823120306269e-05,
|
||
|
|
"epoch": 1.6108108108108108,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4756,
|
||
|
|
"grad_norm": 1.2662088871002197,
|
||
|
|
"learning_rate": 3.811256366233098e-05,
|
||
|
|
"epoch": 1.6216216216216215,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3959,
|
||
|
|
"grad_norm": 1.118185043334961,
|
||
|
|
"learning_rate": 3.603865177517516e-05,
|
||
|
|
"epoch": 1.6324324324324324,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4682,
|
||
|
|
"grad_norm": 0.995052695274353,
|
||
|
|
"learning_rate": 3.4017158574302564e-05,
|
||
|
|
"epoch": 1.6432432432432433,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4508,
|
||
|
|
"grad_norm": 1.0658509731292725,
|
||
|
|
"learning_rate": 3.204873033408853e-05,
|
||
|
|
"epoch": 1.654054054054054,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3456,
|
||
|
|
"grad_norm": 1.1724168062210083,
|
||
|
|
"learning_rate": 3.013399636396195e-05,
|
||
|
|
"epoch": 1.6648648648648647,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4285,
|
||
|
|
"grad_norm": 0.971674919128418,
|
||
|
|
"learning_rate": 2.827356880721368e-05,
|
||
|
|
"epoch": 1.6756756756756757,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2316,
|
||
|
|
"grad_norm": 0.939606785774231,
|
||
|
|
"learning_rate": 2.6468042445293883e-05,
|
||
|
|
"epoch": 1.6864864864864866,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4636,
|
||
|
|
"grad_norm": 1.2107715606689453,
|
||
|
|
"learning_rate": 2.4717994507659147e-05,
|
||
|
|
"epoch": 1.6972972972972973,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3471,
|
||
|
|
"grad_norm": 1.3718624114990234,
|
||
|
|
"learning_rate": 2.3023984487231466e-05,
|
||
|
|
"epoch": 1.708108108108108,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4623,
|
||
|
|
"grad_norm": 1.218471646308899,
|
||
|
|
"learning_rate": 2.1386553961527666e-05,
|
||
|
|
"epoch": 1.718918918918919,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.749324917793274,
|
||
|
|
"eval_runtime": 10.0439,
|
||
|
|
"eval_samples_per_second": 77.56,
|
||
|
|
"eval_steps_per_second": 19.415,
|
||
|
|
"epoch": 1.7286486486486488,
|
||
|
|
"step": 1599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3439,
|
||
|
|
"grad_norm": 1.4828319549560547,
|
||
|
|
"learning_rate": 1.9806226419516192e-05,
|
||
|
|
"epoch": 1.7297297297297298,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3913,
|
||
|
|
"grad_norm": 1.1505448818206787,
|
||
|
|
"learning_rate": 1.828350709425677e-05,
|
||
|
|
"epoch": 1.7405405405405405,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4736,
|
||
|
|
"grad_norm": 1.2893474102020264,
|
||
|
|
"learning_rate": 1.68188828013768e-05,
|
||
|
|
"epoch": 1.7513513513513512,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3173,
|
||
|
|
"grad_norm": 1.2402210235595703,
|
||
|
|
"learning_rate": 1.541282178343566e-05,
|
||
|
|
"epoch": 1.7621621621621621,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3873,
|
||
|
|
"grad_norm": 1.0396865606307983,
|
||
|
|
"learning_rate": 1.4065773560226913e-05,
|
||
|
|
"epoch": 1.772972972972973,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3416,
|
||
|
|
"grad_norm": 1.0871046781539917,
|
||
|
|
"learning_rate": 1.277816878506597e-05,
|
||
|
|
"epoch": 1.7837837837837838,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4252,
|
||
|
|
"grad_norm": 1.3051209449768066,
|
||
|
|
"learning_rate": 1.1550419107109722e-05,
|
||
|
|
"epoch": 1.7945945945945945,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.2819,
|
||
|
|
"grad_norm": 1.1347006559371948,
|
||
|
|
"learning_rate": 1.0382917039751783e-05,
|
||
|
|
"epoch": 1.8054054054054054,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3439,
|
||
|
|
"grad_norm": 1.1445626020431519,
|
||
|
|
"learning_rate": 9.276035835135166e-06,
|
||
|
|
"epoch": 1.8162162162162163,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3387,
|
||
|
|
"grad_norm": 1.1986947059631348,
|
||
|
|
"learning_rate": 8.230129364823213e-06,
|
||
|
|
"epoch": 1.827027027027027,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3861,
|
||
|
|
"grad_norm": 1.0802196264266968,
|
||
|
|
"learning_rate": 7.245532006666178e-06,
|
||
|
|
"epoch": 1.8378378378378377,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4836,
|
||
|
|
"grad_norm": 1.322296142578125,
|
||
|
|
"learning_rate": 6.322558537900247e-06,
|
||
|
|
"epoch": 1.8486486486486486,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.35,
|
||
|
|
"grad_norm": 1.2359806299209595,
|
||
|
|
"learning_rate": 5.46150403451271e-06,
|
||
|
|
"epoch": 1.8594594594594596,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.746907353401184,
|
||
|
|
"eval_runtime": 10.0635,
|
||
|
|
"eval_samples_per_second": 77.409,
|
||
|
|
"eval_steps_per_second": 19.377,
|
||
|
|
"epoch": 1.8616216216216217,
|
||
|
|
"step": 1722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5115,
|
||
|
|
"grad_norm": 0.9085004329681396,
|
||
|
|
"learning_rate": 4.6626437769057955e-06,
|
||
|
|
"epoch": 1.8702702702702703,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3037,
|
||
|
|
"grad_norm": 1.031083583831787,
|
||
|
|
"learning_rate": 3.9262331618890256e-06,
|
||
|
|
"epoch": 1.881081081081081,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.605,
|
||
|
|
"grad_norm": 1.240249752998352,
|
||
|
|
"learning_rate": 3.2525076210286e-06,
|
||
|
|
"epoch": 1.8918918918918919,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3975,
|
||
|
|
"grad_norm": 0.9747676849365234,
|
||
|
|
"learning_rate": 2.6416825453794646e-06,
|
||
|
|
"epoch": 1.9027027027027028,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3456,
|
||
|
|
"grad_norm": 1.1354199647903442,
|
||
|
|
"learning_rate": 2.093953216624556e-06,
|
||
|
|
"epoch": 1.9135135135135135,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.5099,
|
||
|
|
"grad_norm": 1.3150416612625122,
|
||
|
|
"learning_rate": 1.609494744642892e-06,
|
||
|
|
"epoch": 1.9243243243243242,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3622,
|
||
|
|
"grad_norm": 1.1703237295150757,
|
||
|
|
"learning_rate": 1.188462011526692e-06,
|
||
|
|
"epoch": 1.9351351351351351,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3636,
|
||
|
|
"grad_norm": 1.1234310865402222,
|
||
|
|
"learning_rate": 8.309896220654034e-07,
|
||
|
|
"epoch": 1.945945945945946,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.377,
|
||
|
|
"grad_norm": 1.2217568159103394,
|
||
|
|
"learning_rate": 5.371918607122827e-07,
|
||
|
|
"epoch": 1.9567567567567568,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.4881,
|
||
|
|
"grad_norm": 1.1424119472503662,
|
||
|
|
"learning_rate": 3.0716265504753263e-07,
|
||
|
|
"epoch": 1.9675675675675675,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3919,
|
||
|
|
"grad_norm": 1.0163501501083374,
|
||
|
|
"learning_rate": 1.409755457494555e-07,
|
||
|
|
"epoch": 1.9783783783783784,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3928,
|
||
|
|
"grad_norm": 1.117492914199829,
|
||
|
|
"learning_rate": 3.868366308346083e-08,
|
||
|
|
"epoch": 1.9891891891891893,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"eval_loss": 1.7465505599975586,
|
||
|
|
"eval_runtime": 9.9858,
|
||
|
|
"eval_samples_per_second": 78.011,
|
||
|
|
"eval_steps_per_second": 19.528,
|
||
|
|
"epoch": 1.9945945945945946,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3995,
|
||
|
|
"grad_norm": 1.4473432302474976,
|
||
|
|
"learning_rate": 3.1970991622998217e-10,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 958.4314,
|
||
|
|
"train_samples_per_second": 30.878,
|
||
|
|
"train_steps_per_second": 1.93,
|
||
|
|
"total_flos": 9.431756938411008e+16,
|
||
|
|
"train_loss": 1.6259772120295344,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 1850
|
||
|
|
}
|
||
|
|
]
|