1786 lines
43 KiB
JSON
1786 lines
43 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 249,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.012048192771084338,
|
|
"grad_norm": 40.75222396850586,
|
|
"learning_rate": 9.99960204377842e-06,
|
|
"loss": 0.8283,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.024096385542168676,
|
|
"grad_norm": 7.6217546463012695,
|
|
"learning_rate": 9.99840823846134e-06,
|
|
"loss": 0.7463,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.03614457831325301,
|
|
"grad_norm": 4.7737321853637695,
|
|
"learning_rate": 9.996418774081658e-06,
|
|
"loss": 0.6773,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.04819277108433735,
|
|
"grad_norm": 7.380457401275635,
|
|
"learning_rate": 9.99363396732727e-06,
|
|
"loss": 0.7069,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.060240963855421686,
|
|
"grad_norm": 6.07143497467041,
|
|
"learning_rate": 9.990054261490643e-06,
|
|
"loss": 0.8157,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.07228915662650602,
|
|
"grad_norm": 32.628204345703125,
|
|
"learning_rate": 9.985680226398261e-06,
|
|
"loss": 0.7604,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.08433734939759036,
|
|
"grad_norm": 7.0896759033203125,
|
|
"learning_rate": 9.980512558319915e-06,
|
|
"loss": 0.6947,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.0963855421686747,
|
|
"grad_norm": 4.166346549987793,
|
|
"learning_rate": 9.974552079857873e-06,
|
|
"loss": 0.5901,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.10843373493975904,
|
|
"grad_norm": 5.307025909423828,
|
|
"learning_rate": 9.967799739815925e-06,
|
|
"loss": 0.6768,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.12048192771084337,
|
|
"grad_norm": 5.205200672149658,
|
|
"learning_rate": 9.960256613048367e-06,
|
|
"loss": 0.7401,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.13253012048192772,
|
|
"grad_norm": 4.746809482574463,
|
|
"learning_rate": 9.951923900288888e-06,
|
|
"loss": 0.603,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.14457831325301204,
|
|
"grad_norm": 4.554243087768555,
|
|
"learning_rate": 9.942802927959444e-06,
|
|
"loss": 0.5951,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.1566265060240964,
|
|
"grad_norm": 4.13496732711792,
|
|
"learning_rate": 9.932895147959106e-06,
|
|
"loss": 0.6246,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.1686746987951807,
|
|
"grad_norm": 5.653600692749023,
|
|
"learning_rate": 9.922202137432954e-06,
|
|
"loss": 0.8116,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.18072289156626506,
|
|
"grad_norm": 4.084902286529541,
|
|
"learning_rate": 9.910725598521014e-06,
|
|
"loss": 0.5243,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.1927710843373494,
|
|
"grad_norm": 4.84393835067749,
|
|
"learning_rate": 9.89846735808731e-06,
|
|
"loss": 0.5911,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.20481927710843373,
|
|
"grad_norm": 3.5985801219940186,
|
|
"learning_rate": 9.885429367429062e-06,
|
|
"loss": 0.5873,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.21686746987951808,
|
|
"grad_norm": 4.133760452270508,
|
|
"learning_rate": 9.871613701966067e-06,
|
|
"loss": 0.58,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.2289156626506024,
|
|
"grad_norm": 5.736385345458984,
|
|
"learning_rate": 9.857022560910338e-06,
|
|
"loss": 0.6884,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.24096385542168675,
|
|
"grad_norm": 5.400482177734375,
|
|
"learning_rate": 9.84165826691602e-06,
|
|
"loss": 0.7507,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.25301204819277107,
|
|
"grad_norm": 3.2082321643829346,
|
|
"learning_rate": 9.825523265709667e-06,
|
|
"loss": 0.4539,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.26506024096385544,
|
|
"grad_norm": 3.9605965614318848,
|
|
"learning_rate": 9.808620125700925e-06,
|
|
"loss": 0.5744,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.27710843373493976,
|
|
"grad_norm": 3.652902603149414,
|
|
"learning_rate": 9.790951537573686e-06,
|
|
"loss": 0.4361,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.2891566265060241,
|
|
"grad_norm": 3.5659713745117188,
|
|
"learning_rate": 9.772520313857777e-06,
|
|
"loss": 0.4565,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.30120481927710846,
|
|
"grad_norm": 5.866443157196045,
|
|
"learning_rate": 9.753329388481261e-06,
|
|
"loss": 0.6564,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.3132530120481928,
|
|
"grad_norm": 5.043295383453369,
|
|
"learning_rate": 9.733381816303395e-06,
|
|
"loss": 0.5905,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.3253012048192771,
|
|
"grad_norm": 4.576389789581299,
|
|
"learning_rate": 9.712680772628365e-06,
|
|
"loss": 0.5458,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.3373493975903614,
|
|
"grad_norm": 2.964594602584839,
|
|
"learning_rate": 9.691229552699817e-06,
|
|
"loss": 0.4196,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.3493975903614458,
|
|
"grad_norm": 3.668825387954712,
|
|
"learning_rate": 9.669031571176322e-06,
|
|
"loss": 0.5939,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.3614457831325301,
|
|
"grad_norm": 3.4135804176330566,
|
|
"learning_rate": 9.646090361587828e-06,
|
|
"loss": 0.4942,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.37349397590361444,
|
|
"grad_norm": 3.3271186351776123,
|
|
"learning_rate": 9.622409575773162e-06,
|
|
"loss": 0.4447,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.3855421686746988,
|
|
"grad_norm": 3.8561484813690186,
|
|
"learning_rate": 9.597992983298748e-06,
|
|
"loss": 0.5443,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.39759036144578314,
|
|
"grad_norm": 3.4959912300109863,
|
|
"learning_rate": 9.572844470858537e-06,
|
|
"loss": 0.5228,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.40963855421686746,
|
|
"grad_norm": 4.416797637939453,
|
|
"learning_rate": 9.546968041655326e-06,
|
|
"loss": 0.5745,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.42168674698795183,
|
|
"grad_norm": 3.1685950756073,
|
|
"learning_rate": 9.520367814763514e-06,
|
|
"loss": 0.5249,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.43373493975903615,
|
|
"grad_norm": 3.5792479515075684,
|
|
"learning_rate": 9.493048024473413e-06,
|
|
"loss": 0.5533,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.4457831325301205,
|
|
"grad_norm": 3.136587619781494,
|
|
"learning_rate": 9.46501301961723e-06,
|
|
"loss": 0.5707,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.4578313253012048,
|
|
"grad_norm": 6.66333532333374,
|
|
"learning_rate": 9.436267262876808e-06,
|
|
"loss": 0.5537,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.46987951807228917,
|
|
"grad_norm": 3.710054397583008,
|
|
"learning_rate": 9.406815330073244e-06,
|
|
"loss": 0.5072,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.4819277108433735,
|
|
"grad_norm": 2.439741611480713,
|
|
"learning_rate": 9.376661909438496e-06,
|
|
"loss": 0.4088,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.4939759036144578,
|
|
"grad_norm": 2.6984646320343018,
|
|
"learning_rate": 9.3458118008691e-06,
|
|
"loss": 0.548,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.5060240963855421,
|
|
"grad_norm": 2.626049757003784,
|
|
"learning_rate": 9.314269915162115e-06,
|
|
"loss": 0.541,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.5180722891566265,
|
|
"grad_norm": 31.189899444580078,
|
|
"learning_rate": 9.282041273233402e-06,
|
|
"loss": 0.5461,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.5301204819277109,
|
|
"grad_norm": 4.356227397918701,
|
|
"learning_rate": 9.249131005318388e-06,
|
|
"loss": 0.6082,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.5421686746987951,
|
|
"grad_norm": 10.281394958496094,
|
|
"learning_rate": 9.215544350155423e-06,
|
|
"loss": 0.6193,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.5542168674698795,
|
|
"grad_norm": 81.10453796386719,
|
|
"learning_rate": 9.18128665415186e-06,
|
|
"loss": 0.4795,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.5662650602409639,
|
|
"grad_norm": 136.2274932861328,
|
|
"learning_rate": 9.146363370533004e-06,
|
|
"loss": 0.5669,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.5783132530120482,
|
|
"grad_norm": 24.73008155822754,
|
|
"learning_rate": 9.110780058474052e-06,
|
|
"loss": 0.4712,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.5903614457831325,
|
|
"grad_norm": 3.0569868087768555,
|
|
"learning_rate": 9.07454238221517e-06,
|
|
"loss": 0.4934,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.6024096385542169,
|
|
"grad_norm": 3.192237615585327,
|
|
"learning_rate": 9.03765611015985e-06,
|
|
"loss": 0.5427,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.6144578313253012,
|
|
"grad_norm": 1.920320749282837,
|
|
"learning_rate": 9.000127113956673e-06,
|
|
"loss": 0.4182,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.6265060240963856,
|
|
"grad_norm": 3.1197104454040527,
|
|
"learning_rate": 8.961961367564652e-06,
|
|
"loss": 0.5577,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.6385542168674698,
|
|
"grad_norm": 2.1309397220611572,
|
|
"learning_rate": 8.923164946302274e-06,
|
|
"loss": 0.5111,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.6506024096385542,
|
|
"grad_norm": 2.3042995929718018,
|
|
"learning_rate": 8.883744025880429e-06,
|
|
"loss": 0.5015,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.6626506024096386,
|
|
"grad_norm": 2.4492433071136475,
|
|
"learning_rate": 8.843704881419333e-06,
|
|
"loss": 0.3826,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.6746987951807228,
|
|
"grad_norm": 2.3031723499298096,
|
|
"learning_rate": 8.803053886449644e-06,
|
|
"loss": 0.4694,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.6867469879518072,
|
|
"grad_norm": 3.1464896202087402,
|
|
"learning_rate": 8.761797511897907e-06,
|
|
"loss": 0.5708,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.6987951807228916,
|
|
"grad_norm": 2.5254249572753906,
|
|
"learning_rate": 8.719942325056496e-06,
|
|
"loss": 0.5605,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.7108433734939759,
|
|
"grad_norm": 2.614318370819092,
|
|
"learning_rate": 8.67749498853821e-06,
|
|
"loss": 0.5702,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.7228915662650602,
|
|
"grad_norm": 2.1782386302948,
|
|
"learning_rate": 8.634462259215719e-06,
|
|
"loss": 0.5409,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.7349397590361446,
|
|
"grad_norm": 2.084237813949585,
|
|
"learning_rate": 8.590850987145964e-06,
|
|
"loss": 0.4923,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.7469879518072289,
|
|
"grad_norm": 2.4142396450042725,
|
|
"learning_rate": 8.546668114479769e-06,
|
|
"loss": 0.6142,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.7590361445783133,
|
|
"grad_norm": 1.6900039911270142,
|
|
"learning_rate": 8.501920674356755e-06,
|
|
"loss": 0.4445,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.7710843373493976,
|
|
"grad_norm": 1.9757111072540283,
|
|
"learning_rate": 8.456615789785804e-06,
|
|
"loss": 0.491,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.7831325301204819,
|
|
"grad_norm": 2.328930139541626,
|
|
"learning_rate": 8.410760672511188e-06,
|
|
"loss": 0.5563,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.7951807228915663,
|
|
"grad_norm": 2.8067822456359863,
|
|
"learning_rate": 8.364362621864595e-06,
|
|
"loss": 0.6574,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.8072289156626506,
|
|
"grad_norm": 2.0766549110412598,
|
|
"learning_rate": 8.31742902360319e-06,
|
|
"loss": 0.5063,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.8192771084337349,
|
|
"grad_norm": 2.085911989212036,
|
|
"learning_rate": 8.269967348733947e-06,
|
|
"loss": 0.5504,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.8313253012048193,
|
|
"grad_norm": 1.8254350423812866,
|
|
"learning_rate": 8.221985152324385e-06,
|
|
"loss": 0.4678,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.8433734939759037,
|
|
"grad_norm": 2.208496332168579,
|
|
"learning_rate": 8.17349007229994e-06,
|
|
"loss": 0.5589,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.8554216867469879,
|
|
"grad_norm": 2.833843469619751,
|
|
"learning_rate": 8.124489828228136e-06,
|
|
"loss": 0.6464,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.8674698795180723,
|
|
"grad_norm": 2.181140661239624,
|
|
"learning_rate": 8.07499222008977e-06,
|
|
"loss": 0.6037,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.8795180722891566,
|
|
"grad_norm": 1.5879639387130737,
|
|
"learning_rate": 8.025005127037282e-06,
|
|
"loss": 0.4077,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.891566265060241,
|
|
"grad_norm": 1.94895601272583,
|
|
"learning_rate": 7.974536506140546e-06,
|
|
"loss": 0.4523,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.9036144578313253,
|
|
"grad_norm": 2.282900810241699,
|
|
"learning_rate": 7.923594391120237e-06,
|
|
"loss": 0.4889,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.9156626506024096,
|
|
"grad_norm": 1.8225998878479004,
|
|
"learning_rate": 7.872186891068997e-06,
|
|
"loss": 0.4483,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.927710843373494,
|
|
"grad_norm": 2.1921205520629883,
|
|
"learning_rate": 7.820322189160618e-06,
|
|
"loss": 0.4848,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.9397590361445783,
|
|
"grad_norm": 1.9695558547973633,
|
|
"learning_rate": 7.768008541347423e-06,
|
|
"loss": 0.4577,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.9518072289156626,
|
|
"grad_norm": 2.367926836013794,
|
|
"learning_rate": 7.715254275046062e-06,
|
|
"loss": 0.6004,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.963855421686747,
|
|
"grad_norm": 1.95900297164917,
|
|
"learning_rate": 7.66206778781193e-06,
|
|
"loss": 0.5161,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.9759036144578314,
|
|
"grad_norm": 4.2675557136535645,
|
|
"learning_rate": 7.608457546002423e-06,
|
|
"loss": 0.4645,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.9879518072289156,
|
|
"grad_norm": 2.129870891571045,
|
|
"learning_rate": 7.554432083429253e-06,
|
|
"loss": 0.5267,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.7695404291152954,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.3909,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 1.0120481927710843,
|
|
"grad_norm": 2.0876364707946777,
|
|
"learning_rate": 7.445169960349167e-06,
|
|
"loss": 0.3333,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 1.0240963855421688,
|
|
"grad_norm": 1.5992554426193237,
|
|
"learning_rate": 7.389950692458916e-06,
|
|
"loss": 0.3103,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 1.036144578313253,
|
|
"grad_norm": 2.081721544265747,
|
|
"learning_rate": 7.3343509862697295e-06,
|
|
"loss": 0.286,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 1.0481927710843373,
|
|
"grad_norm": 1.5453327894210815,
|
|
"learning_rate": 7.278379692281209e-06,
|
|
"loss": 0.2851,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 1.0602409638554218,
|
|
"grad_norm": 1.6960233449935913,
|
|
"learning_rate": 7.22204572014322e-06,
|
|
"loss": 0.3118,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 1.072289156626506,
|
|
"grad_norm": 1.6961935758590698,
|
|
"learning_rate": 7.165358037237644e-06,
|
|
"loss": 0.3024,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 1.0843373493975903,
|
|
"grad_norm": 1.9473631381988525,
|
|
"learning_rate": 7.10832566725092e-06,
|
|
"loss": 0.3262,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 1.0963855421686748,
|
|
"grad_norm": 1.5019605159759521,
|
|
"learning_rate": 7.0509576887376375e-06,
|
|
"loss": 0.23,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 1.108433734939759,
|
|
"grad_norm": 1.7088998556137085,
|
|
"learning_rate": 6.99326323367538e-06,
|
|
"loss": 0.2511,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 1.1204819277108433,
|
|
"grad_norm": 2.8957417011260986,
|
|
"learning_rate": 6.9352514860110876e-06,
|
|
"loss": 0.3191,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 1.1325301204819278,
|
|
"grad_norm": 1.71742844581604,
|
|
"learning_rate": 6.876931680199121e-06,
|
|
"loss": 0.2792,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 1.144578313253012,
|
|
"grad_norm": 1.615378975868225,
|
|
"learning_rate": 6.818313099731308e-06,
|
|
"loss": 0.2653,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.1566265060240963,
|
|
"grad_norm": 1.4427539110183716,
|
|
"learning_rate": 6.759405075659165e-06,
|
|
"loss": 0.2909,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 1.1686746987951806,
|
|
"grad_norm": 1.1839165687561035,
|
|
"learning_rate": 6.700216985108568e-06,
|
|
"loss": 0.1959,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 1.180722891566265,
|
|
"grad_norm": 1.7143460512161255,
|
|
"learning_rate": 6.640758249787067e-06,
|
|
"loss": 0.2841,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 1.1927710843373494,
|
|
"grad_norm": 1.3873624801635742,
|
|
"learning_rate": 6.58103833448412e-06,
|
|
"loss": 0.2838,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 1.2048192771084336,
|
|
"grad_norm": 1.8592312335968018,
|
|
"learning_rate": 6.521066745564467e-06,
|
|
"loss": 0.2963,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.216867469879518,
|
|
"grad_norm": 1.608494758605957,
|
|
"learning_rate": 6.460853029454879e-06,
|
|
"loss": 0.2877,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 1.2289156626506024,
|
|
"grad_norm": 1.8831335306167603,
|
|
"learning_rate": 6.4004067711245366e-06,
|
|
"loss": 0.3066,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 1.2409638554216866,
|
|
"grad_norm": 1.743905782699585,
|
|
"learning_rate": 6.3397375925592675e-06,
|
|
"loss": 0.3099,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 1.2530120481927711,
|
|
"grad_norm": 1.8759677410125732,
|
|
"learning_rate": 6.2788551512299014e-06,
|
|
"loss": 0.2914,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 1.2650602409638554,
|
|
"grad_norm": 1.7082366943359375,
|
|
"learning_rate": 6.2177691385549595e-06,
|
|
"loss": 0.2931,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.2771084337349397,
|
|
"grad_norm": 1.519975185394287,
|
|
"learning_rate": 6.156489278357967e-06,
|
|
"loss": 0.2499,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 1.2891566265060241,
|
|
"grad_norm": 1.8293309211730957,
|
|
"learning_rate": 6.0950253253195656e-06,
|
|
"loss": 0.3611,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 1.3012048192771084,
|
|
"grad_norm": 1.728571891784668,
|
|
"learning_rate": 6.033387063424765e-06,
|
|
"loss": 0.3017,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 1.3132530120481927,
|
|
"grad_norm": 1.6766902208328247,
|
|
"learning_rate": 5.971584304405489e-06,
|
|
"loss": 0.2823,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 1.3253012048192772,
|
|
"grad_norm": 1.7143419981002808,
|
|
"learning_rate": 5.909626886178721e-06,
|
|
"loss": 0.2307,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.3373493975903614,
|
|
"grad_norm": 1.5373152494430542,
|
|
"learning_rate": 5.8475246712804845e-06,
|
|
"loss": 0.2963,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 1.3493975903614457,
|
|
"grad_norm": 1.8781455755233765,
|
|
"learning_rate": 5.785287545295895e-06,
|
|
"loss": 0.2874,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 1.3614457831325302,
|
|
"grad_norm": 1.824504017829895,
|
|
"learning_rate": 5.722925415285555e-06,
|
|
"loss": 0.2454,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 1.3734939759036144,
|
|
"grad_norm": 1.7806376218795776,
|
|
"learning_rate": 5.660448208208513e-06,
|
|
"loss": 0.3654,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 1.3855421686746987,
|
|
"grad_norm": 1.5633933544158936,
|
|
"learning_rate": 5.597865869342075e-06,
|
|
"loss": 0.2931,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.3975903614457832,
|
|
"grad_norm": 1.8875840902328491,
|
|
"learning_rate": 5.535188360698687e-06,
|
|
"loss": 0.331,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 1.4096385542168675,
|
|
"grad_norm": 1.404435634613037,
|
|
"learning_rate": 5.472425659440157e-06,
|
|
"loss": 0.246,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 1.4216867469879517,
|
|
"grad_norm": 1.4050829410552979,
|
|
"learning_rate": 5.409587756289462e-06,
|
|
"loss": 0.2689,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 1.4337349397590362,
|
|
"grad_norm": 1.5876859426498413,
|
|
"learning_rate": 5.346684653940408e-06,
|
|
"loss": 0.2645,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 1.4457831325301205,
|
|
"grad_norm": 1.6692218780517578,
|
|
"learning_rate": 5.2837263654653715e-06,
|
|
"loss": 0.3155,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.4578313253012047,
|
|
"grad_norm": 1.2533305883407593,
|
|
"learning_rate": 5.2207229127213866e-06,
|
|
"loss": 0.2112,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 1.4698795180722892,
|
|
"grad_norm": 1.5980626344680786,
|
|
"learning_rate": 5.157684324754858e-06,
|
|
"loss": 0.2441,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 1.4819277108433735,
|
|
"grad_norm": 1.6085745096206665,
|
|
"learning_rate": 5.094620636205096e-06,
|
|
"loss": 0.3087,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 1.4939759036144578,
|
|
"grad_norm": 1.7097792625427246,
|
|
"learning_rate": 5.031541885706987e-06,
|
|
"loss": 0.2499,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 1.5060240963855422,
|
|
"grad_norm": 1.4703900814056396,
|
|
"learning_rate": 4.9684581142930135e-06,
|
|
"loss": 0.2413,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.5180722891566265,
|
|
"grad_norm": 2.3154144287109375,
|
|
"learning_rate": 4.905379363794907e-06,
|
|
"loss": 0.3701,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 1.5301204819277108,
|
|
"grad_norm": 1.665852427482605,
|
|
"learning_rate": 4.842315675245144e-06,
|
|
"loss": 0.2791,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 1.5421686746987953,
|
|
"grad_norm": 1.7872849702835083,
|
|
"learning_rate": 4.779277087278615e-06,
|
|
"loss": 0.3303,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 1.5542168674698795,
|
|
"grad_norm": 1.4255069494247437,
|
|
"learning_rate": 4.71627363453463e-06,
|
|
"loss": 0.2462,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 1.5662650602409638,
|
|
"grad_norm": 1.8723397254943848,
|
|
"learning_rate": 4.653315346059592e-06,
|
|
"loss": 0.3083,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 1.5783132530120483,
|
|
"grad_norm": 1.6238393783569336,
|
|
"learning_rate": 4.5904122437105384e-06,
|
|
"loss": 0.2947,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 1.5903614457831325,
|
|
"grad_norm": 1.5982369184494019,
|
|
"learning_rate": 4.527574340559844e-06,
|
|
"loss": 0.3114,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 1.6024096385542168,
|
|
"grad_norm": 1.7584006786346436,
|
|
"learning_rate": 4.464811639301314e-06,
|
|
"loss": 0.3335,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 1.6144578313253013,
|
|
"grad_norm": 1.7169082164764404,
|
|
"learning_rate": 4.402134130657925e-06,
|
|
"loss": 0.2783,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 1.6265060240963856,
|
|
"grad_norm": 1.6119632720947266,
|
|
"learning_rate": 4.33955179179149e-06,
|
|
"loss": 0.252,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 1.6385542168674698,
|
|
"grad_norm": 1.5756961107254028,
|
|
"learning_rate": 4.277074584714447e-06,
|
|
"loss": 0.2825,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 1.6506024096385543,
|
|
"grad_norm": 1.511651873588562,
|
|
"learning_rate": 4.214712454704107e-06,
|
|
"loss": 0.2479,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 1.6626506024096386,
|
|
"grad_norm": 1.354615330696106,
|
|
"learning_rate": 4.152475328719517e-06,
|
|
"loss": 0.2192,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 1.6746987951807228,
|
|
"grad_norm": 1.821956753730774,
|
|
"learning_rate": 4.090373113821281e-06,
|
|
"loss": 0.2735,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 1.6867469879518073,
|
|
"grad_norm": 1.4524273872375488,
|
|
"learning_rate": 4.028415695594512e-06,
|
|
"loss": 0.2222,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.6987951807228916,
|
|
"grad_norm": 1.6997952461242676,
|
|
"learning_rate": 3.966612936575235e-06,
|
|
"loss": 0.2841,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 1.7108433734939759,
|
|
"grad_norm": 1.5502634048461914,
|
|
"learning_rate": 3.904974674680436e-06,
|
|
"loss": 0.281,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 1.7228915662650603,
|
|
"grad_norm": 1.6944836378097534,
|
|
"learning_rate": 3.843510721642036e-06,
|
|
"loss": 0.19,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 1.7349397590361446,
|
|
"grad_norm": 1.958292007446289,
|
|
"learning_rate": 3.782230861445041e-06,
|
|
"loss": 0.3143,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 1.7469879518072289,
|
|
"grad_norm": 1.9379884004592896,
|
|
"learning_rate": 3.7211448487701002e-06,
|
|
"loss": 0.2964,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 1.7590361445783134,
|
|
"grad_norm": 1.6362128257751465,
|
|
"learning_rate": 3.6602624074407354e-06,
|
|
"loss": 0.2749,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.7710843373493976,
|
|
"grad_norm": 1.740090250968933,
|
|
"learning_rate": 3.5995932288754655e-06,
|
|
"loss": 0.2572,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.783132530120482,
|
|
"grad_norm": 1.3941646814346313,
|
|
"learning_rate": 3.539146970545124e-06,
|
|
"loss": 0.2476,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.7951807228915664,
|
|
"grad_norm": 1.6419267654418945,
|
|
"learning_rate": 3.478933254435534e-06,
|
|
"loss": 0.2902,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.8072289156626506,
|
|
"grad_norm": 1.825861930847168,
|
|
"learning_rate": 3.4189616655158803e-06,
|
|
"loss": 0.3345,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.819277108433735,
|
|
"grad_norm": 1.749080777168274,
|
|
"learning_rate": 3.359241750212934e-06,
|
|
"loss": 0.314,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.8313253012048194,
|
|
"grad_norm": 1.2390449047088623,
|
|
"learning_rate": 3.2997830148914316e-06,
|
|
"loss": 0.214,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.8433734939759037,
|
|
"grad_norm": 1.6753946542739868,
|
|
"learning_rate": 3.240594924340835e-06,
|
|
"loss": 0.2988,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.855421686746988,
|
|
"grad_norm": 1.6091383695602417,
|
|
"learning_rate": 3.181686900268694e-06,
|
|
"loss": 0.2481,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.8674698795180724,
|
|
"grad_norm": 1.438892126083374,
|
|
"learning_rate": 3.1230683198008817e-06,
|
|
"loss": 0.2702,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.8795180722891565,
|
|
"grad_norm": 1.931443691253662,
|
|
"learning_rate": 3.0647485139889145e-06,
|
|
"loss": 0.2957,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.891566265060241,
|
|
"grad_norm": 1.5713204145431519,
|
|
"learning_rate": 3.006736766324623e-06,
|
|
"loss": 0.2815,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 1.9036144578313254,
|
|
"grad_norm": 1.5962169170379639,
|
|
"learning_rate": 2.9490423112623646e-06,
|
|
"loss": 0.2791,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 1.9156626506024095,
|
|
"grad_norm": 2.0020360946655273,
|
|
"learning_rate": 2.89167433274908e-06,
|
|
"loss": 0.3897,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 1.927710843373494,
|
|
"grad_norm": 1.6599327325820923,
|
|
"learning_rate": 2.834641962762358e-06,
|
|
"loss": 0.2742,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.9397590361445785,
|
|
"grad_norm": 1.6006088256835938,
|
|
"learning_rate": 2.7779542798567804e-06,
|
|
"loss": 0.2678,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 1.9518072289156625,
|
|
"grad_norm": 1.5215158462524414,
|
|
"learning_rate": 2.721620307718793e-06,
|
|
"loss": 0.3035,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 1.963855421686747,
|
|
"grad_norm": 1.8756093978881836,
|
|
"learning_rate": 2.66564901373027e-06,
|
|
"loss": 0.3407,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 1.9759036144578315,
|
|
"grad_norm": 1.5014938116073608,
|
|
"learning_rate": 2.610049307541085e-06,
|
|
"loss": 0.2533,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 1.9879518072289155,
|
|
"grad_norm": 1.6140003204345703,
|
|
"learning_rate": 2.554830039650834e-06,
|
|
"loss": 0.2369,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.5059895515441895,
|
|
"learning_rate": 2.5000000000000015e-06,
|
|
"loss": 0.1612,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 2.0120481927710845,
|
|
"grad_norm": 1.2760642766952515,
|
|
"learning_rate": 2.4455679165707473e-06,
|
|
"loss": 0.1247,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 2.0240963855421685,
|
|
"grad_norm": 1.3720568418502808,
|
|
"learning_rate": 2.391542453997578e-06,
|
|
"loss": 0.1618,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 2.036144578313253,
|
|
"grad_norm": 1.4044466018676758,
|
|
"learning_rate": 2.337932212188073e-06,
|
|
"loss": 0.1427,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 2.0481927710843375,
|
|
"grad_norm": 1.2212741374969482,
|
|
"learning_rate": 2.284745724953939e-06,
|
|
"loss": 0.1587,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 2.0602409638554215,
|
|
"grad_norm": 1.1166741847991943,
|
|
"learning_rate": 2.2319914586525776e-06,
|
|
"loss": 0.1169,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 2.072289156626506,
|
|
"grad_norm": 1.2007352113723755,
|
|
"learning_rate": 2.1796778108393824e-06,
|
|
"loss": 0.1232,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 2.0843373493975905,
|
|
"grad_norm": 1.4228880405426025,
|
|
"learning_rate": 2.127813108931007e-06,
|
|
"loss": 0.1646,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 2.0963855421686746,
|
|
"grad_norm": 1.2214866876602173,
|
|
"learning_rate": 2.0764056088797646e-06,
|
|
"loss": 0.1058,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 2.108433734939759,
|
|
"grad_norm": 1.8072195053100586,
|
|
"learning_rate": 2.0254634938594555e-06,
|
|
"loss": 0.1579,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 2.1204819277108435,
|
|
"grad_norm": 1.872309684753418,
|
|
"learning_rate": 1.9749948729627188e-06,
|
|
"loss": 0.138,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 2.1325301204819276,
|
|
"grad_norm": 1.8318668603897095,
|
|
"learning_rate": 1.9250077799102323e-06,
|
|
"loss": 0.1331,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 2.144578313253012,
|
|
"grad_norm": 2.1385316848754883,
|
|
"learning_rate": 1.875510171771865e-06,
|
|
"loss": 0.1635,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 2.1566265060240966,
|
|
"grad_norm": 1.719831943511963,
|
|
"learning_rate": 1.8265099277000614e-06,
|
|
"loss": 0.1561,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 2.1686746987951806,
|
|
"grad_norm": 1.7940328121185303,
|
|
"learning_rate": 1.7780148476756148e-06,
|
|
"loss": 0.14,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 2.180722891566265,
|
|
"grad_norm": 1.3721911907196045,
|
|
"learning_rate": 1.7300326512660542e-06,
|
|
"loss": 0.1233,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 2.1927710843373496,
|
|
"grad_norm": 1.2797173261642456,
|
|
"learning_rate": 1.6825709763968112e-06,
|
|
"loss": 0.0936,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 2.2048192771084336,
|
|
"grad_norm": 1.5839323997497559,
|
|
"learning_rate": 1.6356373781354058e-06,
|
|
"loss": 0.1648,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 2.216867469879518,
|
|
"grad_norm": 1.3700120449066162,
|
|
"learning_rate": 1.589239327488812e-06,
|
|
"loss": 0.126,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 2.2289156626506026,
|
|
"grad_norm": 1.5171151161193848,
|
|
"learning_rate": 1.543384210214196e-06,
|
|
"loss": 0.1212,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 2.2409638554216866,
|
|
"grad_norm": 1.6373289823532104,
|
|
"learning_rate": 1.4980793256432474e-06,
|
|
"loss": 0.1509,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 2.253012048192771,
|
|
"grad_norm": 1.30360746383667,
|
|
"learning_rate": 1.453331885520234e-06,
|
|
"loss": 0.12,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 2.2650602409638556,
|
|
"grad_norm": 1.395431399345398,
|
|
"learning_rate": 1.4091490128540374e-06,
|
|
"loss": 0.1406,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 2.2771084337349397,
|
|
"grad_norm": 1.3656375408172607,
|
|
"learning_rate": 1.3655377407842813e-06,
|
|
"loss": 0.1706,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 2.289156626506024,
|
|
"grad_norm": 1.189477562904358,
|
|
"learning_rate": 1.32250501146179e-06,
|
|
"loss": 0.1243,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 2.3012048192771086,
|
|
"grad_norm": 1.273803949356079,
|
|
"learning_rate": 1.2800576749435068e-06,
|
|
"loss": 0.1132,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 2.3132530120481927,
|
|
"grad_norm": 1.249987244606018,
|
|
"learning_rate": 1.2382024881020937e-06,
|
|
"loss": 0.133,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 2.325301204819277,
|
|
"grad_norm": 1.2117363214492798,
|
|
"learning_rate": 1.1969461135503573e-06,
|
|
"loss": 0.1153,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 2.337349397590361,
|
|
"grad_norm": 1.115524172782898,
|
|
"learning_rate": 1.1562951185806675e-06,
|
|
"loss": 0.1068,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 2.3493975903614457,
|
|
"grad_norm": 1.2410939931869507,
|
|
"learning_rate": 1.1162559741195733e-06,
|
|
"loss": 0.0926,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 2.36144578313253,
|
|
"grad_norm": 1.0989357233047485,
|
|
"learning_rate": 1.076835053697728e-06,
|
|
"loss": 0.1147,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 2.3734939759036147,
|
|
"grad_norm": 1.2773900032043457,
|
|
"learning_rate": 1.0380386324353508e-06,
|
|
"loss": 0.131,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 2.3855421686746987,
|
|
"grad_norm": 1.2643158435821533,
|
|
"learning_rate": 9.998728860433277e-07,
|
|
"loss": 0.1377,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 2.397590361445783,
|
|
"grad_norm": 1.3157423734664917,
|
|
"learning_rate": 9.62343889840151e-07,
|
|
"loss": 0.127,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 2.4096385542168672,
|
|
"grad_norm": 1.1823986768722534,
|
|
"learning_rate": 9.254576177848313e-07,
|
|
"loss": 0.1039,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 2.4216867469879517,
|
|
"grad_norm": 1.2062366008758545,
|
|
"learning_rate": 8.892199415259501e-07,
|
|
"loss": 0.1137,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 2.433734939759036,
|
|
"grad_norm": 1.358426570892334,
|
|
"learning_rate": 8.536366294669979e-07,
|
|
"loss": 0.1188,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 2.4457831325301207,
|
|
"grad_norm": 1.4414290189743042,
|
|
"learning_rate": 8.187133458481416e-07,
|
|
"loss": 0.1393,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 2.4578313253012047,
|
|
"grad_norm": 1.2111995220184326,
|
|
"learning_rate": 7.844556498445788e-07,
|
|
"loss": 0.1088,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 2.4698795180722892,
|
|
"grad_norm": 1.0797122716903687,
|
|
"learning_rate": 7.508689946816128e-07,
|
|
"loss": 0.1012,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 2.4819277108433733,
|
|
"grad_norm": 1.2910206317901611,
|
|
"learning_rate": 7.179587267665999e-07,
|
|
"loss": 0.1283,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 2.4939759036144578,
|
|
"grad_norm": 1.76227867603302,
|
|
"learning_rate": 6.857300848378857e-07,
|
|
"loss": 0.1773,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 2.5060240963855422,
|
|
"grad_norm": 1.2892178297042847,
|
|
"learning_rate": 6.541881991309013e-07,
|
|
"loss": 0.1003,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 2.5180722891566267,
|
|
"grad_norm": 1.2142372131347656,
|
|
"learning_rate": 6.233380905615049e-07,
|
|
"loss": 0.1059,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 2.5301204819277108,
|
|
"grad_norm": 1.3028932809829712,
|
|
"learning_rate": 5.931846699267558e-07,
|
|
"loss": 0.0997,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 2.5421686746987953,
|
|
"grad_norm": 1.2703701257705688,
|
|
"learning_rate": 5.637327371231921e-07,
|
|
"loss": 0.1074,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 2.5542168674698793,
|
|
"grad_norm": 1.6055101156234741,
|
|
"learning_rate": 5.349869803827717e-07,
|
|
"loss": 0.1635,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 2.566265060240964,
|
|
"grad_norm": 1.2764710187911987,
|
|
"learning_rate": 5.0695197552659e-07,
|
|
"loss": 0.1394,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 2.5783132530120483,
|
|
"grad_norm": 1.3518632650375366,
|
|
"learning_rate": 4.796321852364877e-07,
|
|
"loss": 0.1363,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 2.5903614457831328,
|
|
"grad_norm": 1.3571412563323975,
|
|
"learning_rate": 4.5303195834467463e-07,
|
|
"loss": 0.1326,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 2.602409638554217,
|
|
"grad_norm": 1.4019449949264526,
|
|
"learning_rate": 4.271555291414636e-07,
|
|
"loss": 0.1222,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 2.6144578313253013,
|
|
"grad_norm": 1.184061050415039,
|
|
"learning_rate": 4.020070167012541e-07,
|
|
"loss": 0.0845,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 2.6265060240963853,
|
|
"grad_norm": 1.5907222032546997,
|
|
"learning_rate": 3.775904242268391e-07,
|
|
"loss": 0.1353,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 2.63855421686747,
|
|
"grad_norm": 1.3479151725769043,
|
|
"learning_rate": 3.539096384121743e-07,
|
|
"loss": 0.1445,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 2.6506024096385543,
|
|
"grad_norm": 1.36601722240448,
|
|
"learning_rate": 3.309684288236775e-07,
|
|
"loss": 0.1386,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 2.662650602409639,
|
|
"grad_norm": 1.6582006216049194,
|
|
"learning_rate": 3.0877044730018515e-07,
|
|
"loss": 0.1237,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 2.674698795180723,
|
|
"grad_norm": 1.505927324295044,
|
|
"learning_rate": 2.873192273716369e-07,
|
|
"loss": 0.153,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 2.6867469879518073,
|
|
"grad_norm": 1.2739795446395874,
|
|
"learning_rate": 2.666181836966053e-07,
|
|
"loss": 0.1038,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 2.6987951807228914,
|
|
"grad_norm": 1.3373569250106812,
|
|
"learning_rate": 2.466706115187406e-07,
|
|
"loss": 0.1208,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 2.710843373493976,
|
|
"grad_norm": 1.3513188362121582,
|
|
"learning_rate": 2.274796861422246e-07,
|
|
"loss": 0.1209,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 2.7228915662650603,
|
|
"grad_norm": 1.4020378589630127,
|
|
"learning_rate": 2.090484624263167e-07,
|
|
"loss": 0.1323,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 2.734939759036145,
|
|
"grad_norm": 1.4146372079849243,
|
|
"learning_rate": 1.9137987429907635e-07,
|
|
"loss": 0.1304,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 2.746987951807229,
|
|
"grad_norm": 1.3225347995758057,
|
|
"learning_rate": 1.7447673429033361e-07,
|
|
"loss": 0.1149,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 2.7590361445783134,
|
|
"grad_norm": 1.3890403509140015,
|
|
"learning_rate": 1.583417330839798e-07,
|
|
"loss": 0.1557,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 2.7710843373493974,
|
|
"grad_norm": 1.466339349746704,
|
|
"learning_rate": 1.4297743908966212e-07,
|
|
"loss": 0.1489,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 2.783132530120482,
|
|
"grad_norm": 1.2367849349975586,
|
|
"learning_rate": 1.2838629803393343e-07,
|
|
"loss": 0.0997,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 2.7951807228915664,
|
|
"grad_norm": 1.390717625617981,
|
|
"learning_rate": 1.1457063257093892e-07,
|
|
"loss": 0.1218,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 2.807228915662651,
|
|
"grad_norm": 1.2239187955856323,
|
|
"learning_rate": 1.0153264191269052e-07,
|
|
"loss": 0.1135,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 2.819277108433735,
|
|
"grad_norm": 1.0472311973571777,
|
|
"learning_rate": 8.927440147898703e-08,
|
|
"loss": 0.1065,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 2.8313253012048194,
|
|
"grad_norm": 1.2322300672531128,
|
|
"learning_rate": 7.779786256704669e-08,
|
|
"loss": 0.1016,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 2.8433734939759034,
|
|
"grad_norm": 1.3203635215759277,
|
|
"learning_rate": 6.710485204089456e-08,
|
|
"loss": 0.1239,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 2.855421686746988,
|
|
"grad_norm": 1.2621276378631592,
|
|
"learning_rate": 5.7197072040557356e-08,
|
|
"loss": 0.1358,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 2.8674698795180724,
|
|
"grad_norm": 1.3743759393692017,
|
|
"learning_rate": 4.807609971111238e-08,
|
|
"loss": 0.1337,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 2.8795180722891565,
|
|
"grad_norm": 1.0450865030288696,
|
|
"learning_rate": 3.974338695163393e-08,
|
|
"loss": 0.0945,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 2.891566265060241,
|
|
"grad_norm": 1.5270490646362305,
|
|
"learning_rate": 3.220026018407541e-08,
|
|
"loss": 0.0994,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 2.9036144578313254,
|
|
"grad_norm": 1.694990873336792,
|
|
"learning_rate": 2.5447920142128712e-08,
|
|
"loss": 0.1689,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 2.9156626506024095,
|
|
"grad_norm": 1.4968199729919434,
|
|
"learning_rate": 1.9487441680084983e-08,
|
|
"loss": 0.1219,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 2.927710843373494,
|
|
"grad_norm": 1.332356572151184,
|
|
"learning_rate": 1.431977360173975e-08,
|
|
"loss": 0.1137,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 2.9397590361445785,
|
|
"grad_norm": 1.4295134544372559,
|
|
"learning_rate": 9.945738509358205e-09,
|
|
"loss": 0.1498,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 2.9518072289156625,
|
|
"grad_norm": 1.1830252408981323,
|
|
"learning_rate": 6.366032672731059e-09,
|
|
"loss": 0.1002,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 2.963855421686747,
|
|
"grad_norm": 1.3982295989990234,
|
|
"learning_rate": 3.5812259183426457e-09,
|
|
"loss": 0.1247,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 2.9759036144578315,
|
|
"grad_norm": 1.467788577079773,
|
|
"learning_rate": 1.591761538662362e-09,
|
|
"loss": 0.1098,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 2.9879518072289155,
|
|
"grad_norm": 1.2710858583450317,
|
|
"learning_rate": 3.9795622158111945e-10,
|
|
"loss": 0.1335,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.9258020520210266,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0612,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 249,
|
|
"total_flos": 3.4614492313052774e+17,
|
|
"train_loss": 0.32225324711706266,
|
|
"train_runtime": 433.6826,
|
|
"train_samples_per_second": 4.545,
|
|
"train_steps_per_second": 0.574
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 249,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": false,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.4614492313052774e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|