13282 lines
322 KiB
JSON
13282 lines
322 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9996635488863468,
|
|
"eval_steps": 60,
|
|
"global_step": 1857,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0005383217818450979,
|
|
"grad_norm": 109.41815851913972,
|
|
"learning_rate": 1.7857142857142858e-07,
|
|
"loss": 1.8223,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.0010766435636901958,
|
|
"grad_norm": 6.467874390653583,
|
|
"learning_rate": 3.5714285714285716e-07,
|
|
"loss": 0.995,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.0016149653455352936,
|
|
"grad_norm": 4802.638302957245,
|
|
"learning_rate": 5.357142857142857e-07,
|
|
"loss": 1.8859,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.0021532871273803916,
|
|
"grad_norm": 16.08026403474417,
|
|
"learning_rate": 7.142857142857143e-07,
|
|
"loss": 0.953,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.0026916089092254895,
|
|
"grad_norm": 57.36353411210698,
|
|
"learning_rate": 8.928571428571429e-07,
|
|
"loss": 0.9429,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.0032299306910705873,
|
|
"grad_norm": 19675.34780096024,
|
|
"learning_rate": 1.0714285714285714e-06,
|
|
"loss": 2.6006,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.0037682524729156855,
|
|
"grad_norm": 818.3276445922152,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 0.9918,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.004306574254760783,
|
|
"grad_norm": 104.38551556669266,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 0.9549,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.004844896036605881,
|
|
"grad_norm": 367.74940177761715,
|
|
"learning_rate": 1.6071428571428574e-06,
|
|
"loss": 2.68,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.005383217818450979,
|
|
"grad_norm": 2496.9575083108416,
|
|
"learning_rate": 1.7857142857142859e-06,
|
|
"loss": 1.0041,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.005921539600296077,
|
|
"grad_norm": 8.38602908383877,
|
|
"learning_rate": 1.9642857142857144e-06,
|
|
"loss": 0.9457,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.0064598613821411745,
|
|
"grad_norm": 8.592616439259098,
|
|
"learning_rate": 2.1428571428571427e-06,
|
|
"loss": 0.8897,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.006998183163986273,
|
|
"grad_norm": 438.07218134142477,
|
|
"learning_rate": 2.321428571428572e-06,
|
|
"loss": 0.9019,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.007536504945831371,
|
|
"grad_norm": 9.994776961708206,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 0.825,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.008074826727676468,
|
|
"grad_norm": 10.743498710130337,
|
|
"learning_rate": 2.6785714285714285e-06,
|
|
"loss": 0.849,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.008613148509521567,
|
|
"grad_norm": 15.967903678723232,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 0.8012,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.009151470291366665,
|
|
"grad_norm": 4.1668693518220055,
|
|
"learning_rate": 3.0357142857142856e-06,
|
|
"loss": 0.7744,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.009689792073211762,
|
|
"grad_norm": 6.926360685293587,
|
|
"learning_rate": 3.2142857142857147e-06,
|
|
"loss": 0.692,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.010228113855056861,
|
|
"grad_norm": 3.2346473797515074,
|
|
"learning_rate": 3.3928571428571435e-06,
|
|
"loss": 0.6831,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.010766435636901958,
|
|
"grad_norm": 3.7571978637997177,
|
|
"learning_rate": 3.5714285714285718e-06,
|
|
"loss": 0.6537,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.011304757418747056,
|
|
"grad_norm": 7.559105627558757,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 0.6141,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.011843079200592153,
|
|
"grad_norm": 2.387397312463083,
|
|
"learning_rate": 3.928571428571429e-06,
|
|
"loss": 0.695,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.012381400982437252,
|
|
"grad_norm": 4.391512430632287,
|
|
"learning_rate": 4.107142857142857e-06,
|
|
"loss": 0.6185,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.012919722764282349,
|
|
"grad_norm": 4.15230181408785,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 0.5936,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.013458044546127448,
|
|
"grad_norm": 2.0653396060051605,
|
|
"learning_rate": 4.464285714285715e-06,
|
|
"loss": 0.5898,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.013996366327972546,
|
|
"grad_norm": 17.979519982673633,
|
|
"learning_rate": 4.642857142857144e-06,
|
|
"loss": 0.5905,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.014534688109817643,
|
|
"grad_norm": 8.151818005754823,
|
|
"learning_rate": 4.821428571428572e-06,
|
|
"loss": 0.5533,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.015073009891662742,
|
|
"grad_norm": 5.987974608299902,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6107,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.015611331673507839,
|
|
"grad_norm": 2.330194497601682,
|
|
"learning_rate": 5.1785714285714296e-06,
|
|
"loss": 0.5506,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.016149653455352936,
|
|
"grad_norm": 12.794302959608173,
|
|
"learning_rate": 5.357142857142857e-06,
|
|
"loss": 0.5766,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.016687975237198036,
|
|
"grad_norm": 1.9007212740143298,
|
|
"learning_rate": 5.535714285714286e-06,
|
|
"loss": 0.5627,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.017226297019043133,
|
|
"grad_norm": 1.9146904127163438,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 0.5883,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.01776461880088823,
|
|
"grad_norm": 2.187675355948441,
|
|
"learning_rate": 5.892857142857144e-06,
|
|
"loss": 0.5035,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.01830294058273333,
|
|
"grad_norm": 2.6806082565798124,
|
|
"learning_rate": 6.071428571428571e-06,
|
|
"loss": 0.5542,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.018841262364578427,
|
|
"grad_norm": 3.3951990046554323,
|
|
"learning_rate": 6.25e-06,
|
|
"loss": 0.538,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.019379584146423524,
|
|
"grad_norm": 2.5099129162892853,
|
|
"learning_rate": 6.4285714285714295e-06,
|
|
"loss": 0.5251,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.01991790592826862,
|
|
"grad_norm": 2.0412677432451627,
|
|
"learning_rate": 6.607142857142858e-06,
|
|
"loss": 0.5121,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.020456227710113722,
|
|
"grad_norm": 2.781712184570042,
|
|
"learning_rate": 6.785714285714287e-06,
|
|
"loss": 0.5254,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.02099454949195882,
|
|
"grad_norm": 6.805805985669443,
|
|
"learning_rate": 6.964285714285714e-06,
|
|
"loss": 0.5831,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.021532871273803916,
|
|
"grad_norm": 2.0490434598423652,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 0.5393,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.022071193055649013,
|
|
"grad_norm": 1.9372367290098516,
|
|
"learning_rate": 7.321428571428572e-06,
|
|
"loss": 0.5972,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.022609514837494113,
|
|
"grad_norm": 2.7227356306942165,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.5369,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.02314783661933921,
|
|
"grad_norm": 2.011860407760454,
|
|
"learning_rate": 7.67857142857143e-06,
|
|
"loss": 0.5164,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.023686158401184307,
|
|
"grad_norm": 2.380752182458038,
|
|
"learning_rate": 7.857142857142858e-06,
|
|
"loss": 0.4715,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.024224480183029407,
|
|
"grad_norm": 2.0112153484283537,
|
|
"learning_rate": 8.035714285714286e-06,
|
|
"loss": 0.4943,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.024762801964874504,
|
|
"grad_norm": 1.7657871236862508,
|
|
"learning_rate": 8.214285714285714e-06,
|
|
"loss": 0.5792,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.0253011237467196,
|
|
"grad_norm": 2.012306508738324,
|
|
"learning_rate": 8.392857142857144e-06,
|
|
"loss": 0.5704,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.025839445528564698,
|
|
"grad_norm": 2.0657223159326743,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 0.5145,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.0263777673104098,
|
|
"grad_norm": 2.137310846323582,
|
|
"learning_rate": 8.750000000000001e-06,
|
|
"loss": 0.5067,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.026916089092254895,
|
|
"grad_norm": 2.2166052489861534,
|
|
"learning_rate": 8.92857142857143e-06,
|
|
"loss": 0.5799,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.027454410874099992,
|
|
"grad_norm": 2.029493952864758,
|
|
"learning_rate": 9.107142857142858e-06,
|
|
"loss": 0.5817,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.027992732655945093,
|
|
"grad_norm": 1.5628607433382145,
|
|
"learning_rate": 9.285714285714288e-06,
|
|
"loss": 0.4722,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.02853105443779019,
|
|
"grad_norm": 1.686683459837313,
|
|
"learning_rate": 9.464285714285714e-06,
|
|
"loss": 0.5233,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.029069376219635287,
|
|
"grad_norm": 1.7287851726495882,
|
|
"learning_rate": 9.642857142857144e-06,
|
|
"loss": 0.5744,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.029607698001480384,
|
|
"grad_norm": 2.246853321344625,
|
|
"learning_rate": 9.821428571428573e-06,
|
|
"loss": 0.4972,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.030146019783325484,
|
|
"grad_norm": 1.9175548738162544,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.535,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.03068434156517058,
|
|
"grad_norm": 2.169109901402676,
|
|
"learning_rate": 9.999992393020984e-06,
|
|
"loss": 0.5429,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.031222663347015678,
|
|
"grad_norm": 2.260825281362616,
|
|
"learning_rate": 9.99996957210708e-06,
|
|
"loss": 0.521,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.031760985128860775,
|
|
"grad_norm": 1.660309077201794,
|
|
"learning_rate": 9.999931537327727e-06,
|
|
"loss": 0.531,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.03229930691070587,
|
|
"grad_norm": 2.069841458563405,
|
|
"learning_rate": 9.999878288798659e-06,
|
|
"loss": 0.5661,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.03229930691070587,
|
|
"eval_loss": 0.5262647271156311,
|
|
"eval_runtime": 1569.0341,
|
|
"eval_samples_per_second": 15.94,
|
|
"eval_steps_per_second": 0.498,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.032837628692550976,
|
|
"grad_norm": 2.6347591222570577,
|
|
"learning_rate": 9.999809826681898e-06,
|
|
"loss": 0.544,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.03337595047439607,
|
|
"grad_norm": 2.286499156997404,
|
|
"learning_rate": 9.999726151185762e-06,
|
|
"loss": 0.5387,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.03391427225624117,
|
|
"grad_norm": 1.8415858956026085,
|
|
"learning_rate": 9.999627262564856e-06,
|
|
"loss": 0.5148,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.034452594038086266,
|
|
"grad_norm": 1.6900844200859937,
|
|
"learning_rate": 9.999513161120078e-06,
|
|
"loss": 0.5291,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.03499091581993136,
|
|
"grad_norm": 1.7125448582732223,
|
|
"learning_rate": 9.999383847198618e-06,
|
|
"loss": 0.5535,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.03552923760177646,
|
|
"grad_norm": 1.9111631206584763,
|
|
"learning_rate": 9.999239321193946e-06,
|
|
"loss": 0.5146,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.03606755938362156,
|
|
"grad_norm": 1.5772484080951499,
|
|
"learning_rate": 9.999079583545829e-06,
|
|
"loss": 0.4713,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.03660588116546666,
|
|
"grad_norm": 1.8895632782472054,
|
|
"learning_rate": 9.998904634740313e-06,
|
|
"loss": 0.5802,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.03714420294731176,
|
|
"grad_norm": 1.7764047564754841,
|
|
"learning_rate": 9.998714475309733e-06,
|
|
"loss": 0.4893,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.037682524729156855,
|
|
"grad_norm": 1.6552020383306354,
|
|
"learning_rate": 9.9985091058327e-06,
|
|
"loss": 0.5265,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.03822084651100195,
|
|
"grad_norm": 1.6488442266603467,
|
|
"learning_rate": 9.998288526934115e-06,
|
|
"loss": 0.5231,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.03875916829284705,
|
|
"grad_norm": 2.563488205094923,
|
|
"learning_rate": 9.998052739285151e-06,
|
|
"loss": 0.5305,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.039297490074692146,
|
|
"grad_norm": 1.7898615543554037,
|
|
"learning_rate": 9.997801743603264e-06,
|
|
"loss": 0.5237,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.03983581185653724,
|
|
"grad_norm": 1.7633259864675677,
|
|
"learning_rate": 9.997535540652177e-06,
|
|
"loss": 0.5502,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.04037413363838234,
|
|
"grad_norm": 1.8121416043404328,
|
|
"learning_rate": 9.997254131241893e-06,
|
|
"loss": 0.4952,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.040912455420227443,
|
|
"grad_norm": 1.5652647418073986,
|
|
"learning_rate": 9.996957516228682e-06,
|
|
"loss": 0.4945,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.04145077720207254,
|
|
"grad_norm": 2.048844737679617,
|
|
"learning_rate": 9.996645696515082e-06,
|
|
"loss": 0.5123,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.04198909898391764,
|
|
"grad_norm": 1.6687520157181732,
|
|
"learning_rate": 9.996318673049893e-06,
|
|
"loss": 0.5443,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.042527420765762734,
|
|
"grad_norm": 1.66167477759581,
|
|
"learning_rate": 9.995976446828182e-06,
|
|
"loss": 0.5029,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.04306574254760783,
|
|
"grad_norm": 1.5077402156848434,
|
|
"learning_rate": 9.99561901889127e-06,
|
|
"loss": 0.5197,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.04360406432945293,
|
|
"grad_norm": 1.8622381731018631,
|
|
"learning_rate": 9.995246390326739e-06,
|
|
"loss": 0.5048,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.044142386111298025,
|
|
"grad_norm": 1.6038417564132132,
|
|
"learning_rate": 9.994858562268415e-06,
|
|
"loss": 0.5779,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.04468070789314313,
|
|
"grad_norm": 2.2450492036773126,
|
|
"learning_rate": 9.994455535896383e-06,
|
|
"loss": 0.5407,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.045219029674988226,
|
|
"grad_norm": 1.7319893085330837,
|
|
"learning_rate": 9.994037312436963e-06,
|
|
"loss": 0.4857,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.04575735145683332,
|
|
"grad_norm": 1.6718459312817726,
|
|
"learning_rate": 9.99360389316273e-06,
|
|
"loss": 0.4815,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.04629567323867842,
|
|
"grad_norm": 2.7232264171397276,
|
|
"learning_rate": 9.993155279392479e-06,
|
|
"loss": 0.5877,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.04683399502052352,
|
|
"grad_norm": 1.9404135244552454,
|
|
"learning_rate": 9.992691472491253e-06,
|
|
"loss": 0.5062,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.047372316802368614,
|
|
"grad_norm": 1.9213426547558368,
|
|
"learning_rate": 9.99221247387032e-06,
|
|
"loss": 0.5188,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.04791063858421371,
|
|
"grad_norm": 1.5451598644824311,
|
|
"learning_rate": 9.991718284987173e-06,
|
|
"loss": 0.5397,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.048448960366058814,
|
|
"grad_norm": 2.5679521016629385,
|
|
"learning_rate": 9.991208907345524e-06,
|
|
"loss": 0.541,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.04898728214790391,
|
|
"grad_norm": 2.98985646242629,
|
|
"learning_rate": 9.990684342495304e-06,
|
|
"loss": 0.4854,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.04952560392974901,
|
|
"grad_norm": 1.9886055940456542,
|
|
"learning_rate": 9.990144592032657e-06,
|
|
"loss": 0.5256,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.050063925711594105,
|
|
"grad_norm": 2.083677922083048,
|
|
"learning_rate": 9.989589657599927e-06,
|
|
"loss": 0.4859,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.0506022474934392,
|
|
"grad_norm": 1.5145771411744222,
|
|
"learning_rate": 9.989019540885664e-06,
|
|
"loss": 0.4744,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.0511405692752843,
|
|
"grad_norm": 1.655565898472542,
|
|
"learning_rate": 9.98843424362462e-06,
|
|
"loss": 0.4615,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.051678891057129396,
|
|
"grad_norm": 1.9814143121579568,
|
|
"learning_rate": 9.987833767597726e-06,
|
|
"loss": 0.4806,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.0522172128389745,
|
|
"grad_norm": 1.5166169599719224,
|
|
"learning_rate": 9.987218114632109e-06,
|
|
"loss": 0.5279,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.0527555346208196,
|
|
"grad_norm": 1.7338166251896456,
|
|
"learning_rate": 9.98658728660107e-06,
|
|
"loss": 0.4885,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.053293856402664694,
|
|
"grad_norm": 2.059909188253357,
|
|
"learning_rate": 9.98594128542409e-06,
|
|
"loss": 0.4878,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.05383217818450979,
|
|
"grad_norm": 1.946469408161261,
|
|
"learning_rate": 9.985280113066816e-06,
|
|
"loss": 0.5423,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.05437049996635489,
|
|
"grad_norm": 2.2782083747319333,
|
|
"learning_rate": 9.984603771541055e-06,
|
|
"loss": 0.5132,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.054908821748199985,
|
|
"grad_norm": 2.057010956887204,
|
|
"learning_rate": 9.983912262904775e-06,
|
|
"loss": 0.5092,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.05544714353004508,
|
|
"grad_norm": 1.7498707830077607,
|
|
"learning_rate": 9.983205589262093e-06,
|
|
"loss": 0.4711,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.055985465311890185,
|
|
"grad_norm": 2.08857966446578,
|
|
"learning_rate": 9.98248375276327e-06,
|
|
"loss": 0.5405,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.05652378709373528,
|
|
"grad_norm": 1.6492587393982439,
|
|
"learning_rate": 9.981746755604703e-06,
|
|
"loss": 0.5346,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.05706210887558038,
|
|
"grad_norm": 2.4884932019084203,
|
|
"learning_rate": 9.980994600028919e-06,
|
|
"loss": 0.4979,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.057600430657425476,
|
|
"grad_norm": 2.357643749019895,
|
|
"learning_rate": 9.980227288324576e-06,
|
|
"loss": 0.547,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.05813875243927057,
|
|
"grad_norm": 1.7013608238808469,
|
|
"learning_rate": 9.979444822826438e-06,
|
|
"loss": 0.4984,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.05867707422111567,
|
|
"grad_norm": 1.6424667181868076,
|
|
"learning_rate": 9.978647205915386e-06,
|
|
"loss": 0.5501,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.05921539600296077,
|
|
"grad_norm": 1.8701509501400961,
|
|
"learning_rate": 9.977834440018406e-06,
|
|
"loss": 0.5478,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.05975371778480587,
|
|
"grad_norm": 1.8496243899167086,
|
|
"learning_rate": 9.977006527608569e-06,
|
|
"loss": 0.4782,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.06029203956665097,
|
|
"grad_norm": 1.6878413932010692,
|
|
"learning_rate": 9.976163471205045e-06,
|
|
"loss": 0.4832,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.060830361348496065,
|
|
"grad_norm": 1.9099800850936837,
|
|
"learning_rate": 9.975305273373075e-06,
|
|
"loss": 0.515,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.06136868313034116,
|
|
"grad_norm": 1.5649119566569916,
|
|
"learning_rate": 9.974431936723979e-06,
|
|
"loss": 0.4561,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.06190700491218626,
|
|
"grad_norm": 1.7341754469580601,
|
|
"learning_rate": 9.973543463915139e-06,
|
|
"loss": 0.5348,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.062445326694031356,
|
|
"grad_norm": 1.7476560123562952,
|
|
"learning_rate": 9.972639857649989e-06,
|
|
"loss": 0.5287,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.06298364847587645,
|
|
"grad_norm": 2.0434137346621624,
|
|
"learning_rate": 9.971721120678018e-06,
|
|
"loss": 0.5932,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.06352197025772155,
|
|
"grad_norm": 1.62299849715006,
|
|
"learning_rate": 9.97078725579475e-06,
|
|
"loss": 0.5077,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.06406029203956665,
|
|
"grad_norm": 1.7228929187523507,
|
|
"learning_rate": 9.969838265841739e-06,
|
|
"loss": 0.5859,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.06459861382141174,
|
|
"grad_norm": 1.6625474372880666,
|
|
"learning_rate": 9.968874153706567e-06,
|
|
"loss": 0.4655,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.06459861382141174,
|
|
"eval_loss": 0.5072533488273621,
|
|
"eval_runtime": 1577.1777,
|
|
"eval_samples_per_second": 15.857,
|
|
"eval_steps_per_second": 0.496,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.06513693560325684,
|
|
"grad_norm": 2.0716206061611486,
|
|
"learning_rate": 9.967894922322824e-06,
|
|
"loss": 0.539,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.06567525738510195,
|
|
"grad_norm": 1.6205145916384769,
|
|
"learning_rate": 9.96690057467011e-06,
|
|
"loss": 0.5478,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.06621357916694705,
|
|
"grad_norm": 1.587372514164151,
|
|
"learning_rate": 9.965891113774015e-06,
|
|
"loss": 0.538,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.06675190094879214,
|
|
"grad_norm": 1.4772510136765666,
|
|
"learning_rate": 9.964866542706119e-06,
|
|
"loss": 0.5349,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.06729022273063724,
|
|
"grad_norm": 1.7801746551956565,
|
|
"learning_rate": 9.963826864583979e-06,
|
|
"loss": 0.4909,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.06782854451248234,
|
|
"grad_norm": 5.729919312521928,
|
|
"learning_rate": 9.962772082571115e-06,
|
|
"loss": 0.6005,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.06836686629432744,
|
|
"grad_norm": 1.6619105967880943,
|
|
"learning_rate": 9.961702199877014e-06,
|
|
"loss": 0.4715,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.06890518807617253,
|
|
"grad_norm": 1.5987631874828743,
|
|
"learning_rate": 9.960617219757105e-06,
|
|
"loss": 0.4807,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.06944350985801763,
|
|
"grad_norm": 1.625681174655454,
|
|
"learning_rate": 9.959517145512754e-06,
|
|
"loss": 0.535,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.06998183163986273,
|
|
"grad_norm": 2.100345459551234,
|
|
"learning_rate": 9.958401980491259e-06,
|
|
"loss": 0.5264,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.07052015342170782,
|
|
"grad_norm": 1.7787800977162425,
|
|
"learning_rate": 9.957271728085836e-06,
|
|
"loss": 0.5171,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.07105847520355292,
|
|
"grad_norm": 1.6985346393670706,
|
|
"learning_rate": 9.956126391735605e-06,
|
|
"loss": 0.5016,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.07159679698539802,
|
|
"grad_norm": 1.3787117088478043,
|
|
"learning_rate": 9.954965974925586e-06,
|
|
"loss": 0.502,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.07213511876724311,
|
|
"grad_norm": 1.547259961768447,
|
|
"learning_rate": 9.953790481186689e-06,
|
|
"loss": 0.5046,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.07267344054908821,
|
|
"grad_norm": 1.7755359789986371,
|
|
"learning_rate": 9.952599914095692e-06,
|
|
"loss": 0.5385,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.07321176233093332,
|
|
"grad_norm": 1.5896819627160363,
|
|
"learning_rate": 9.951394277275247e-06,
|
|
"loss": 0.4749,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.07375008411277842,
|
|
"grad_norm": 1.6875256792153286,
|
|
"learning_rate": 9.950173574393853e-06,
|
|
"loss": 0.4763,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.07428840589462352,
|
|
"grad_norm": 1.437266797535168,
|
|
"learning_rate": 9.948937809165853e-06,
|
|
"loss": 0.4833,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.07482672767646861,
|
|
"grad_norm": 1.7282025114929471,
|
|
"learning_rate": 9.947686985351427e-06,
|
|
"loss": 0.4767,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.07536504945831371,
|
|
"grad_norm": 1.8616012721247828,
|
|
"learning_rate": 9.946421106756568e-06,
|
|
"loss": 0.5093,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.0759033712401588,
|
|
"grad_norm": 1.8460263465465812,
|
|
"learning_rate": 9.94514017723308e-06,
|
|
"loss": 0.517,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.0764416930220039,
|
|
"grad_norm": 2.0057873955643823,
|
|
"learning_rate": 9.94384420067857e-06,
|
|
"loss": 0.5154,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.076980014803849,
|
|
"grad_norm": 1.65882505385735,
|
|
"learning_rate": 9.94253318103642e-06,
|
|
"loss": 0.4701,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.0775183365856941,
|
|
"grad_norm": 2.3628830084290806,
|
|
"learning_rate": 9.941207122295789e-06,
|
|
"loss": 0.5405,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.0780566583675392,
|
|
"grad_norm": 1.6577450103892044,
|
|
"learning_rate": 9.9398660284916e-06,
|
|
"loss": 0.4927,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.07859498014938429,
|
|
"grad_norm": 1.4186036899765784,
|
|
"learning_rate": 9.938509903704521e-06,
|
|
"loss": 0.4898,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.07913330193122939,
|
|
"grad_norm": 1.544561300695159,
|
|
"learning_rate": 9.937138752060958e-06,
|
|
"loss": 0.4893,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.07967162371307449,
|
|
"grad_norm": 2.396784154476515,
|
|
"learning_rate": 9.935752577733038e-06,
|
|
"loss": 0.5326,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.08020994549491958,
|
|
"grad_norm": 1.6617814624124967,
|
|
"learning_rate": 9.9343513849386e-06,
|
|
"loss": 0.5131,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.08074826727676468,
|
|
"grad_norm": 1.7862849588167096,
|
|
"learning_rate": 9.932935177941185e-06,
|
|
"loss": 0.571,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.08128658905860979,
|
|
"grad_norm": 1.4319233814203582,
|
|
"learning_rate": 9.931503961050012e-06,
|
|
"loss": 0.5017,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.08182491084045489,
|
|
"grad_norm": 4.306871831666418,
|
|
"learning_rate": 9.93005773861998e-06,
|
|
"loss": 0.4935,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.08236323262229998,
|
|
"grad_norm": 2.160758045969246,
|
|
"learning_rate": 9.928596515051639e-06,
|
|
"loss": 0.4985,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.08290155440414508,
|
|
"grad_norm": 1.5540015811422117,
|
|
"learning_rate": 9.927120294791188e-06,
|
|
"loss": 0.4575,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.08343987618599018,
|
|
"grad_norm": 1.5794711992375656,
|
|
"learning_rate": 9.92562908233046e-06,
|
|
"loss": 0.5031,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.08397819796783527,
|
|
"grad_norm": 2.034943473794147,
|
|
"learning_rate": 9.9241228822069e-06,
|
|
"loss": 0.4829,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.08451651974968037,
|
|
"grad_norm": 1.878275757652009,
|
|
"learning_rate": 9.922601699003567e-06,
|
|
"loss": 0.5468,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.08505484153152547,
|
|
"grad_norm": 1.8197718876914466,
|
|
"learning_rate": 9.921065537349097e-06,
|
|
"loss": 0.5228,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.08559316331337057,
|
|
"grad_norm": 1.850901219005824,
|
|
"learning_rate": 9.919514401917717e-06,
|
|
"loss": 0.4894,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.08613148509521566,
|
|
"grad_norm": 1.6912529326600465,
|
|
"learning_rate": 9.917948297429202e-06,
|
|
"loss": 0.4783,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.08666980687706076,
|
|
"grad_norm": 1.9572290713193328,
|
|
"learning_rate": 9.916367228648887e-06,
|
|
"loss": 0.4889,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.08720812865890586,
|
|
"grad_norm": 2.2412763350776497,
|
|
"learning_rate": 9.914771200387634e-06,
|
|
"loss": 0.5196,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.08774645044075095,
|
|
"grad_norm": 2.0096075056146527,
|
|
"learning_rate": 9.913160217501822e-06,
|
|
"loss": 0.5098,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.08828477222259605,
|
|
"grad_norm": 1.561955725348752,
|
|
"learning_rate": 9.911534284893336e-06,
|
|
"loss": 0.4993,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.08882309400444116,
|
|
"grad_norm": 2.2239745440823113,
|
|
"learning_rate": 9.909893407509554e-06,
|
|
"loss": 0.5189,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.08936141578628626,
|
|
"grad_norm": 2.1956593936333606,
|
|
"learning_rate": 9.90823759034332e-06,
|
|
"loss": 0.4956,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.08989973756813135,
|
|
"grad_norm": 1.7245617400478288,
|
|
"learning_rate": 9.906566838432943e-06,
|
|
"loss": 0.5076,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.09043805934997645,
|
|
"grad_norm": 1.6846599680454537,
|
|
"learning_rate": 9.904881156862172e-06,
|
|
"loss": 0.4546,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.09097638113182155,
|
|
"grad_norm": 1.713604562000994,
|
|
"learning_rate": 9.903180550760184e-06,
|
|
"loss": 0.5622,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.09151470291366665,
|
|
"grad_norm": 1.4559714724478827,
|
|
"learning_rate": 9.901465025301571e-06,
|
|
"loss": 0.499,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.09205302469551174,
|
|
"grad_norm": 1.748975091207079,
|
|
"learning_rate": 9.899734585706316e-06,
|
|
"loss": 0.4823,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.09259134647735684,
|
|
"grad_norm": 1.6268147978199312,
|
|
"learning_rate": 9.89798923723979e-06,
|
|
"loss": 0.5452,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.09312966825920194,
|
|
"grad_norm": 1.7343158101478648,
|
|
"learning_rate": 9.896228985212722e-06,
|
|
"loss": 0.4359,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.09366799004104703,
|
|
"grad_norm": 2.07042169826696,
|
|
"learning_rate": 9.894453834981194e-06,
|
|
"loss": 0.511,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.09420631182289213,
|
|
"grad_norm": 1.791222622400255,
|
|
"learning_rate": 9.892663791946617e-06,
|
|
"loss": 0.5451,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.09474463360473723,
|
|
"grad_norm": 2.20105621306618,
|
|
"learning_rate": 9.890858861555719e-06,
|
|
"loss": 0.5144,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.09528295538658232,
|
|
"grad_norm": 1.6902715423027703,
|
|
"learning_rate": 9.889039049300526e-06,
|
|
"loss": 0.5445,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.09582127716842742,
|
|
"grad_norm": 1.6384822244675972,
|
|
"learning_rate": 9.88720436071835e-06,
|
|
"loss": 0.5164,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.09635959895027253,
|
|
"grad_norm": 1.486764051130488,
|
|
"learning_rate": 9.885354801391764e-06,
|
|
"loss": 0.478,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.09689792073211763,
|
|
"grad_norm": 1.701132133672937,
|
|
"learning_rate": 9.883490376948593e-06,
|
|
"loss": 0.5027,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.09689792073211763,
|
|
"eval_loss": 0.49806535243988037,
|
|
"eval_runtime": 1515.9148,
|
|
"eval_samples_per_second": 16.498,
|
|
"eval_steps_per_second": 0.516,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.09743624251396273,
|
|
"grad_norm": 1.9402448136247314,
|
|
"learning_rate": 9.881611093061891e-06,
|
|
"loss": 0.5127,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.09797456429580782,
|
|
"grad_norm": 1.7830082860168288,
|
|
"learning_rate": 9.879716955449927e-06,
|
|
"loss": 0.4977,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.09851288607765292,
|
|
"grad_norm": 1.8728338162339362,
|
|
"learning_rate": 9.877807969876167e-06,
|
|
"loss": 0.5303,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.09905120785949802,
|
|
"grad_norm": 1.9418905923773875,
|
|
"learning_rate": 9.875884142149258e-06,
|
|
"loss": 0.4924,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.09958952964134311,
|
|
"grad_norm": 1.7198468996934395,
|
|
"learning_rate": 9.873945478123006e-06,
|
|
"loss": 0.4753,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.10012785142318821,
|
|
"grad_norm": 1.9960103116925314,
|
|
"learning_rate": 9.87199198369636e-06,
|
|
"loss": 0.5277,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.10066617320503331,
|
|
"grad_norm": 1.627744057918891,
|
|
"learning_rate": 9.870023664813399e-06,
|
|
"loss": 0.46,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.1012044949868784,
|
|
"grad_norm": 1.689952574264165,
|
|
"learning_rate": 9.868040527463305e-06,
|
|
"loss": 0.4994,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.1017428167687235,
|
|
"grad_norm": 1.5603624594142342,
|
|
"learning_rate": 9.866042577680354e-06,
|
|
"loss": 0.5304,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.1022811385505686,
|
|
"grad_norm": 1.748472496778829,
|
|
"learning_rate": 9.86402982154389e-06,
|
|
"loss": 0.4964,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.1028194603324137,
|
|
"grad_norm": 1.7431819106596798,
|
|
"learning_rate": 9.862002265178308e-06,
|
|
"loss": 0.4783,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.10335778211425879,
|
|
"grad_norm": 1.837418537016329,
|
|
"learning_rate": 9.859959914753042e-06,
|
|
"loss": 0.4862,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.1038961038961039,
|
|
"grad_norm": 2.596761998177084,
|
|
"learning_rate": 9.857902776482538e-06,
|
|
"loss": 0.5261,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.104434425677949,
|
|
"grad_norm": 1.893467433056967,
|
|
"learning_rate": 9.85583085662624e-06,
|
|
"loss": 0.5324,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.1049727474597941,
|
|
"grad_norm": 1.5311561663354358,
|
|
"learning_rate": 9.853744161488568e-06,
|
|
"loss": 0.4934,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.1055110692416392,
|
|
"grad_norm": 1.573948338119931,
|
|
"learning_rate": 9.851642697418898e-06,
|
|
"loss": 0.5137,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.10604939102348429,
|
|
"grad_norm": 1.7486390517463863,
|
|
"learning_rate": 9.84952647081155e-06,
|
|
"loss": 0.535,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.10658771280532939,
|
|
"grad_norm": 1.589021194069147,
|
|
"learning_rate": 9.847395488105761e-06,
|
|
"loss": 0.443,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.10712603458717448,
|
|
"grad_norm": 1.9185393015026924,
|
|
"learning_rate": 9.845249755785665e-06,
|
|
"loss": 0.5281,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.10766435636901958,
|
|
"grad_norm": 2.3792026849321704,
|
|
"learning_rate": 9.84308928038028e-06,
|
|
"loss": 0.5031,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.10820267815086468,
|
|
"grad_norm": 1.9165328926467609,
|
|
"learning_rate": 9.840914068463482e-06,
|
|
"loss": 0.5557,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.10874099993270978,
|
|
"grad_norm": 2.5946215311840315,
|
|
"learning_rate": 9.838724126653987e-06,
|
|
"loss": 0.4922,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.10927932171455487,
|
|
"grad_norm": 2.13076319151747,
|
|
"learning_rate": 9.836519461615331e-06,
|
|
"loss": 0.5781,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.10981764349639997,
|
|
"grad_norm": 1.663228941320188,
|
|
"learning_rate": 9.834300080055854e-06,
|
|
"loss": 0.484,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.11035596527824507,
|
|
"grad_norm": 2.225077581890442,
|
|
"learning_rate": 9.832065988728667e-06,
|
|
"loss": 0.4869,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.11089428706009016,
|
|
"grad_norm": 1.4816502494413102,
|
|
"learning_rate": 9.829817194431646e-06,
|
|
"loss": 0.4782,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.11143260884193526,
|
|
"grad_norm": 1.9584675295393534,
|
|
"learning_rate": 9.827553704007403e-06,
|
|
"loss": 0.4572,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.11197093062378037,
|
|
"grad_norm": 1.4348786359320973,
|
|
"learning_rate": 9.82527552434327e-06,
|
|
"loss": 0.4682,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.11250925240562547,
|
|
"grad_norm": 1.836643464151516,
|
|
"learning_rate": 9.82298266237127e-06,
|
|
"loss": 0.475,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.11304757418747056,
|
|
"grad_norm": 1.6780795457698512,
|
|
"learning_rate": 9.820675125068105e-06,
|
|
"loss": 0.4903,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.11358589596931566,
|
|
"grad_norm": 2.0824594091852124,
|
|
"learning_rate": 9.818352919455133e-06,
|
|
"loss": 0.5396,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.11412421775116076,
|
|
"grad_norm": 1.7381485522277624,
|
|
"learning_rate": 9.816016052598336e-06,
|
|
"loss": 0.536,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.11466253953300586,
|
|
"grad_norm": 1.7730039428627105,
|
|
"learning_rate": 9.813664531608319e-06,
|
|
"loss": 0.5344,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.11520086131485095,
|
|
"grad_norm": 1.726577182888005,
|
|
"learning_rate": 9.811298363640265e-06,
|
|
"loss": 0.4686,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.11573918309669605,
|
|
"grad_norm": 1.4284226913661735,
|
|
"learning_rate": 9.808917555893934e-06,
|
|
"loss": 0.417,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.11627750487854115,
|
|
"grad_norm": 1.8490676859358208,
|
|
"learning_rate": 9.806522115613624e-06,
|
|
"loss": 0.4734,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.11681582666038624,
|
|
"grad_norm": 1.9252320315263673,
|
|
"learning_rate": 9.804112050088164e-06,
|
|
"loss": 0.5216,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.11735414844223134,
|
|
"grad_norm": 2.039324491259981,
|
|
"learning_rate": 9.801687366650882e-06,
|
|
"loss": 0.5209,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.11789247022407644,
|
|
"grad_norm": 2.9773699463269168,
|
|
"learning_rate": 9.799248072679581e-06,
|
|
"loss": 0.5341,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.11843079200592153,
|
|
"grad_norm": 2.742476530553411,
|
|
"learning_rate": 9.796794175596526e-06,
|
|
"loss": 0.5013,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.11896911378776663,
|
|
"grad_norm": 1.7756468554357536,
|
|
"learning_rate": 9.794325682868413e-06,
|
|
"loss": 0.4789,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.11950743556961174,
|
|
"grad_norm": 1.6809704903695406,
|
|
"learning_rate": 9.791842602006355e-06,
|
|
"loss": 0.4661,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.12004575735145684,
|
|
"grad_norm": 1.5983552620095136,
|
|
"learning_rate": 9.789344940565844e-06,
|
|
"loss": 0.4525,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.12058407913330194,
|
|
"grad_norm": 1.6785718872740183,
|
|
"learning_rate": 9.786832706146745e-06,
|
|
"loss": 0.5614,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.12112240091514703,
|
|
"grad_norm": 1.8472396669798028,
|
|
"learning_rate": 9.784305906393266e-06,
|
|
"loss": 0.5442,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.12166072269699213,
|
|
"grad_norm": 2.233728320756155,
|
|
"learning_rate": 9.781764548993932e-06,
|
|
"loss": 0.5065,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.12219904447883723,
|
|
"grad_norm": 1.7583669595786098,
|
|
"learning_rate": 9.77920864168156e-06,
|
|
"loss": 0.5031,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.12273736626068232,
|
|
"grad_norm": 1.856107901761449,
|
|
"learning_rate": 9.77663819223325e-06,
|
|
"loss": 0.5218,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.12327568804252742,
|
|
"grad_norm": 1.5999284716572806,
|
|
"learning_rate": 9.774053208470338e-06,
|
|
"loss": 0.447,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.12381400982437252,
|
|
"grad_norm": 3.170181526472491,
|
|
"learning_rate": 9.771453698258392e-06,
|
|
"loss": 0.4549,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.12435233160621761,
|
|
"grad_norm": 1.7567006972999655,
|
|
"learning_rate": 9.768839669507185e-06,
|
|
"loss": 0.5203,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.12489065338806271,
|
|
"grad_norm": 1.6024823185860628,
|
|
"learning_rate": 9.766211130170653e-06,
|
|
"loss": 0.5035,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.1254289751699078,
|
|
"grad_norm": 1.9234982966827474,
|
|
"learning_rate": 9.7635680882469e-06,
|
|
"loss": 0.5742,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.1259672969517529,
|
|
"grad_norm": 1.526400617412084,
|
|
"learning_rate": 9.760910551778149e-06,
|
|
"loss": 0.4953,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.126505618733598,
|
|
"grad_norm": 1.7460568880199783,
|
|
"learning_rate": 9.758238528850733e-06,
|
|
"loss": 0.4705,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.1270439405154431,
|
|
"grad_norm": 5.681983754980635,
|
|
"learning_rate": 9.755552027595055e-06,
|
|
"loss": 0.5499,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.1275822622972882,
|
|
"grad_norm": 1.9059517301514561,
|
|
"learning_rate": 9.752851056185583e-06,
|
|
"loss": 0.5016,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.1281205840791333,
|
|
"grad_norm": 2.032081768465102,
|
|
"learning_rate": 9.750135622840811e-06,
|
|
"loss": 0.4761,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.1286589058609784,
|
|
"grad_norm": 2.044888486278771,
|
|
"learning_rate": 9.747405735823232e-06,
|
|
"loss": 0.535,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.1291972276428235,
|
|
"grad_norm": 1.7814262228625417,
|
|
"learning_rate": 9.744661403439328e-06,
|
|
"loss": 0.5524,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.1291972276428235,
|
|
"eval_loss": 0.4923091232776642,
|
|
"eval_runtime": 1516.8995,
|
|
"eval_samples_per_second": 16.488,
|
|
"eval_steps_per_second": 0.516,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.12973554942466858,
|
|
"grad_norm": 3.1298270206538,
|
|
"learning_rate": 9.74190263403953e-06,
|
|
"loss": 0.4938,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.13027387120651368,
|
|
"grad_norm": 1.4984946811035116,
|
|
"learning_rate": 9.739129436018193e-06,
|
|
"loss": 0.4417,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.1308121929883588,
|
|
"grad_norm": 1.364613667269671,
|
|
"learning_rate": 9.736341817813586e-06,
|
|
"loss": 0.4698,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.1313505147702039,
|
|
"grad_norm": 1.4558332152005662,
|
|
"learning_rate": 9.733539787907851e-06,
|
|
"loss": 0.51,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.131888836552049,
|
|
"grad_norm": 1.605378069117634,
|
|
"learning_rate": 9.730723354826978e-06,
|
|
"loss": 0.4502,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.1324271583338941,
|
|
"grad_norm": 1.6741314580897366,
|
|
"learning_rate": 9.727892527140787e-06,
|
|
"loss": 0.4445,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.1329654801157392,
|
|
"grad_norm": 2.306950410094544,
|
|
"learning_rate": 9.725047313462897e-06,
|
|
"loss": 0.541,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.1335038018975843,
|
|
"grad_norm": 2.110791301537649,
|
|
"learning_rate": 9.722187722450699e-06,
|
|
"loss": 0.5105,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.1340421236794294,
|
|
"grad_norm": 1.8250944708952,
|
|
"learning_rate": 9.719313762805334e-06,
|
|
"loss": 0.5233,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.13458044546127448,
|
|
"grad_norm": 1.5279014760068415,
|
|
"learning_rate": 9.716425443271663e-06,
|
|
"loss": 0.4978,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.13511876724311958,
|
|
"grad_norm": 1.6155139379634116,
|
|
"learning_rate": 9.713522772638238e-06,
|
|
"loss": 0.489,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.13565708902496468,
|
|
"grad_norm": 1.7541916143762504,
|
|
"learning_rate": 9.710605759737281e-06,
|
|
"loss": 0.5058,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.13619541080680977,
|
|
"grad_norm": 2.0770411769433914,
|
|
"learning_rate": 9.707674413444658e-06,
|
|
"loss": 0.4765,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.13673373258865487,
|
|
"grad_norm": 2.20017292136363,
|
|
"learning_rate": 9.70472874267984e-06,
|
|
"loss": 0.5073,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.13727205437049997,
|
|
"grad_norm": 2.5155355882755495,
|
|
"learning_rate": 9.701768756405894e-06,
|
|
"loss": 0.5271,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.13781037615234507,
|
|
"grad_norm": 1.6203966463313373,
|
|
"learning_rate": 9.698794463629438e-06,
|
|
"loss": 0.5328,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.13834869793419016,
|
|
"grad_norm": 1.776204296227151,
|
|
"learning_rate": 9.695805873400627e-06,
|
|
"loss": 0.4975,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.13888701971603526,
|
|
"grad_norm": 1.817996887986963,
|
|
"learning_rate": 9.692802994813117e-06,
|
|
"loss": 0.5076,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.13942534149788036,
|
|
"grad_norm": 1.5387316388819356,
|
|
"learning_rate": 9.68978583700404e-06,
|
|
"loss": 0.4783,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.13996366327972545,
|
|
"grad_norm": 1.4525191587799346,
|
|
"learning_rate": 9.686754409153984e-06,
|
|
"loss": 0.4541,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.14050198506157055,
|
|
"grad_norm": 2.5072786042500286,
|
|
"learning_rate": 9.683708720486947e-06,
|
|
"loss": 0.4321,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.14104030684341565,
|
|
"grad_norm": 1.928234336171056,
|
|
"learning_rate": 9.680648780270327e-06,
|
|
"loss": 0.5026,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.14157862862526074,
|
|
"grad_norm": 1.9095002820990152,
|
|
"learning_rate": 9.677574597814884e-06,
|
|
"loss": 0.5048,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.14211695040710584,
|
|
"grad_norm": 2.7537047870453777,
|
|
"learning_rate": 9.674486182474716e-06,
|
|
"loss": 0.5202,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.14265527218895094,
|
|
"grad_norm": 1.5411698281683408,
|
|
"learning_rate": 9.671383543647225e-06,
|
|
"loss": 0.473,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.14319359397079603,
|
|
"grad_norm": 1.6351867542673815,
|
|
"learning_rate": 9.668266690773094e-06,
|
|
"loss": 0.4734,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.14373191575264113,
|
|
"grad_norm": 1.8884810300636565,
|
|
"learning_rate": 9.66513563333626e-06,
|
|
"loss": 0.5014,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.14427023753448623,
|
|
"grad_norm": 1.6743904016832571,
|
|
"learning_rate": 9.661990380863876e-06,
|
|
"loss": 0.4782,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.14480855931633133,
|
|
"grad_norm": 1.9090758165263444,
|
|
"learning_rate": 9.658830942926291e-06,
|
|
"loss": 0.5003,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.14534688109817642,
|
|
"grad_norm": 1.4937405913115736,
|
|
"learning_rate": 9.655657329137015e-06,
|
|
"loss": 0.4432,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.14588520288002152,
|
|
"grad_norm": 1.9026943182309153,
|
|
"learning_rate": 9.652469549152695e-06,
|
|
"loss": 0.529,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.14642352466186664,
|
|
"grad_norm": 1.8186943886881364,
|
|
"learning_rate": 9.649267612673079e-06,
|
|
"loss": 0.4737,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.14696184644371174,
|
|
"grad_norm": 1.8259823260308685,
|
|
"learning_rate": 9.646051529440993e-06,
|
|
"loss": 0.4985,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.14750016822555684,
|
|
"grad_norm": 1.9385932273349529,
|
|
"learning_rate": 9.64282130924231e-06,
|
|
"loss": 0.4838,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.14803849000740193,
|
|
"grad_norm": 2.04013899262351,
|
|
"learning_rate": 9.639576961905915e-06,
|
|
"loss": 0.5434,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.14857681178924703,
|
|
"grad_norm": 1.4822512590060632,
|
|
"learning_rate": 9.636318497303679e-06,
|
|
"loss": 0.5105,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.14911513357109213,
|
|
"grad_norm": 1.580055299090581,
|
|
"learning_rate": 9.633045925350436e-06,
|
|
"loss": 0.5236,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.14965345535293723,
|
|
"grad_norm": 1.947058506268201,
|
|
"learning_rate": 9.629759256003936e-06,
|
|
"loss": 0.517,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.15019177713478232,
|
|
"grad_norm": 2.09097300966892,
|
|
"learning_rate": 9.626458499264833e-06,
|
|
"loss": 0.4795,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.15073009891662742,
|
|
"grad_norm": 1.9281815370039999,
|
|
"learning_rate": 9.623143665176636e-06,
|
|
"loss": 0.5091,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.15126842069847252,
|
|
"grad_norm": 1.8942765435710498,
|
|
"learning_rate": 9.6198147638257e-06,
|
|
"loss": 0.486,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.1518067424803176,
|
|
"grad_norm": 1.5680877122601742,
|
|
"learning_rate": 9.616471805341175e-06,
|
|
"loss": 0.5756,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.1523450642621627,
|
|
"grad_norm": 1.8187589637332664,
|
|
"learning_rate": 9.613114799894989e-06,
|
|
"loss": 0.4848,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.1528833860440078,
|
|
"grad_norm": 2.845269186548161,
|
|
"learning_rate": 9.609743757701806e-06,
|
|
"loss": 0.5196,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.1534217078258529,
|
|
"grad_norm": 1.6573799451128552,
|
|
"learning_rate": 9.60635868901901e-06,
|
|
"loss": 0.5256,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.153960029607698,
|
|
"grad_norm": 1.403409672767778,
|
|
"learning_rate": 9.602959604146658e-06,
|
|
"loss": 0.4591,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.1544983513895431,
|
|
"grad_norm": 1.5756224710697608,
|
|
"learning_rate": 9.599546513427455e-06,
|
|
"loss": 0.4499,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.1550366731713882,
|
|
"grad_norm": 1.8561161081867996,
|
|
"learning_rate": 9.596119427246727e-06,
|
|
"loss": 0.514,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.1555749949532333,
|
|
"grad_norm": 1.6430886050709819,
|
|
"learning_rate": 9.592678356032382e-06,
|
|
"loss": 0.4916,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.1561133167350784,
|
|
"grad_norm": 1.5608831001537813,
|
|
"learning_rate": 9.589223310254881e-06,
|
|
"loss": 0.4845,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.15665163851692349,
|
|
"grad_norm": 2.041472319934021,
|
|
"learning_rate": 9.58575430042721e-06,
|
|
"loss": 0.5105,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.15718996029876858,
|
|
"grad_norm": 1.879252835980779,
|
|
"learning_rate": 9.582271337104844e-06,
|
|
"loss": 0.5254,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.15772828208061368,
|
|
"grad_norm": 1.7353738362985391,
|
|
"learning_rate": 9.578774430885714e-06,
|
|
"loss": 0.545,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.15826660386245878,
|
|
"grad_norm": 1.6167983704567415,
|
|
"learning_rate": 9.575263592410176e-06,
|
|
"loss": 0.484,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.15880492564430387,
|
|
"grad_norm": 1.6983057165346465,
|
|
"learning_rate": 9.571738832360979e-06,
|
|
"loss": 0.5001,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.15934324742614897,
|
|
"grad_norm": 2.081190213763369,
|
|
"learning_rate": 9.568200161463237e-06,
|
|
"loss": 0.4722,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.15988156920799407,
|
|
"grad_norm": 2.246655796617688,
|
|
"learning_rate": 9.564647590484384e-06,
|
|
"loss": 0.5171,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.16041989098983916,
|
|
"grad_norm": 1.4481263563444773,
|
|
"learning_rate": 9.561081130234155e-06,
|
|
"loss": 0.471,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.16095821277168426,
|
|
"grad_norm": 1.6254902571476582,
|
|
"learning_rate": 9.557500791564545e-06,
|
|
"loss": 0.4709,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.16149653455352936,
|
|
"grad_norm": 1.6522030181707457,
|
|
"learning_rate": 9.55390658536978e-06,
|
|
"loss": 0.4314,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.16149653455352936,
|
|
"eval_loss": 0.48600396513938904,
|
|
"eval_runtime": 1525.5556,
|
|
"eval_samples_per_second": 16.394,
|
|
"eval_steps_per_second": 0.513,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.16203485633537448,
|
|
"grad_norm": 1.6735119675316397,
|
|
"learning_rate": 9.550298522586277e-06,
|
|
"loss": 0.4981,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.16257317811721958,
|
|
"grad_norm": 1.7492206784400102,
|
|
"learning_rate": 9.546676614192623e-06,
|
|
"loss": 0.5166,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.16311149989906468,
|
|
"grad_norm": 1.8716369675908593,
|
|
"learning_rate": 9.543040871209528e-06,
|
|
"loss": 0.4587,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.16364982168090977,
|
|
"grad_norm": 1.5260344735318792,
|
|
"learning_rate": 9.5393913046998e-06,
|
|
"loss": 0.4637,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.16418814346275487,
|
|
"grad_norm": 1.9514934425079693,
|
|
"learning_rate": 9.535727925768312e-06,
|
|
"loss": 0.5018,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.16472646524459997,
|
|
"grad_norm": 1.9239888955973004,
|
|
"learning_rate": 9.53205074556196e-06,
|
|
"loss": 0.5156,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.16526478702644506,
|
|
"grad_norm": 1.4397611201745624,
|
|
"learning_rate": 9.528359775269637e-06,
|
|
"loss": 0.4876,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.16580310880829016,
|
|
"grad_norm": 1.6314792528136741,
|
|
"learning_rate": 9.524655026122199e-06,
|
|
"loss": 0.4466,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.16634143059013526,
|
|
"grad_norm": 1.7046994741333183,
|
|
"learning_rate": 9.520936509392425e-06,
|
|
"loss": 0.5137,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.16687975237198036,
|
|
"grad_norm": 1.6773498230286716,
|
|
"learning_rate": 9.517204236394983e-06,
|
|
"loss": 0.4857,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.16741807415382545,
|
|
"grad_norm": 1.9407453364887826,
|
|
"learning_rate": 9.513458218486404e-06,
|
|
"loss": 0.569,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.16795639593567055,
|
|
"grad_norm": 2.3596815310352355,
|
|
"learning_rate": 9.509698467065042e-06,
|
|
"loss": 0.4823,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.16849471771751565,
|
|
"grad_norm": 1.491461623274511,
|
|
"learning_rate": 9.505924993571037e-06,
|
|
"loss": 0.4814,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.16903303949936074,
|
|
"grad_norm": 1.755984194501031,
|
|
"learning_rate": 9.502137809486277e-06,
|
|
"loss": 0.4953,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.16957136128120584,
|
|
"grad_norm": 1.4330639099631888,
|
|
"learning_rate": 9.49833692633438e-06,
|
|
"loss": 0.4566,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.17010968306305094,
|
|
"grad_norm": 2.8224430252996413,
|
|
"learning_rate": 9.49452235568064e-06,
|
|
"loss": 0.5356,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.17064800484489603,
|
|
"grad_norm": 1.6038158256481398,
|
|
"learning_rate": 9.490694109131997e-06,
|
|
"loss": 0.4667,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.17118632662674113,
|
|
"grad_norm": 1.5264996881581228,
|
|
"learning_rate": 9.486852198337013e-06,
|
|
"loss": 0.5066,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.17172464840858623,
|
|
"grad_norm": 2.1960133726792987,
|
|
"learning_rate": 9.482996634985818e-06,
|
|
"loss": 0.51,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.17226297019043132,
|
|
"grad_norm": 1.8025162435130595,
|
|
"learning_rate": 9.479127430810087e-06,
|
|
"loss": 0.4542,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.17280129197227642,
|
|
"grad_norm": 1.573351382907097,
|
|
"learning_rate": 9.475244597583007e-06,
|
|
"loss": 0.4932,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.17333961375412152,
|
|
"grad_norm": 1.8667569419712537,
|
|
"learning_rate": 9.471348147119226e-06,
|
|
"loss": 0.5095,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.17387793553596662,
|
|
"grad_norm": 1.7668055772396445,
|
|
"learning_rate": 9.467438091274831e-06,
|
|
"loss": 0.5407,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.1744162573178117,
|
|
"grad_norm": 1.8953472452582216,
|
|
"learning_rate": 9.46351444194731e-06,
|
|
"loss": 0.5128,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.1749545790996568,
|
|
"grad_norm": 1.4178882398027213,
|
|
"learning_rate": 9.459577211075505e-06,
|
|
"loss": 0.4783,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.1754929008815019,
|
|
"grad_norm": 2.0556054399757833,
|
|
"learning_rate": 9.455626410639595e-06,
|
|
"loss": 0.4883,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.176031222663347,
|
|
"grad_norm": 1.7326020245251583,
|
|
"learning_rate": 9.451662052661042e-06,
|
|
"loss": 0.5118,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.1765695444451921,
|
|
"grad_norm": 4.171939008569256,
|
|
"learning_rate": 9.447684149202555e-06,
|
|
"loss": 0.5034,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.17710786622703723,
|
|
"grad_norm": 1.4094510294695572,
|
|
"learning_rate": 9.44369271236807e-06,
|
|
"loss": 0.485,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.17764618800888232,
|
|
"grad_norm": 1.7412556596004685,
|
|
"learning_rate": 9.4396877543027e-06,
|
|
"loss": 0.5202,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.17818450979072742,
|
|
"grad_norm": 2.605859372043168,
|
|
"learning_rate": 9.435669287192691e-06,
|
|
"loss": 0.4685,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.17872283157257252,
|
|
"grad_norm": 1.751047130574041,
|
|
"learning_rate": 9.431637323265406e-06,
|
|
"loss": 0.5435,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.1792611533544176,
|
|
"grad_norm": 1.6979113314955865,
|
|
"learning_rate": 9.42759187478927e-06,
|
|
"loss": 0.5082,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.1797994751362627,
|
|
"grad_norm": 1.655193667961951,
|
|
"learning_rate": 9.423532954073737e-06,
|
|
"loss": 0.52,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.1803377969181078,
|
|
"grad_norm": 1.715183078111553,
|
|
"learning_rate": 9.419460573469262e-06,
|
|
"loss": 0.4876,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.1808761186999529,
|
|
"grad_norm": 1.755206515543788,
|
|
"learning_rate": 9.415374745367245e-06,
|
|
"loss": 0.4826,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.181414440481798,
|
|
"grad_norm": 1.530238277234238,
|
|
"learning_rate": 9.411275482200015e-06,
|
|
"loss": 0.5227,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.1819527622636431,
|
|
"grad_norm": 1.4873212835334444,
|
|
"learning_rate": 9.40716279644077e-06,
|
|
"loss": 0.4784,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.1824910840454882,
|
|
"grad_norm": 1.4713841358562554,
|
|
"learning_rate": 9.403036700603561e-06,
|
|
"loss": 0.4872,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.1830294058273333,
|
|
"grad_norm": 1.5551919063027968,
|
|
"learning_rate": 9.398897207243232e-06,
|
|
"loss": 0.4817,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.1835677276091784,
|
|
"grad_norm": 1.8717050820441055,
|
|
"learning_rate": 9.394744328955403e-06,
|
|
"loss": 0.5002,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.18410604939102349,
|
|
"grad_norm": 1.9843100820794195,
|
|
"learning_rate": 9.390578078376417e-06,
|
|
"loss": 0.4799,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.18464437117286858,
|
|
"grad_norm": 2.156998251608843,
|
|
"learning_rate": 9.386398468183304e-06,
|
|
"loss": 0.4469,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.18518269295471368,
|
|
"grad_norm": 1.7123477834586953,
|
|
"learning_rate": 9.38220551109375e-06,
|
|
"loss": 0.5312,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.18572101473655878,
|
|
"grad_norm": 1.862901860663747,
|
|
"learning_rate": 9.377999219866046e-06,
|
|
"loss": 0.5146,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.18625933651840387,
|
|
"grad_norm": 1.8400145206055536,
|
|
"learning_rate": 9.373779607299061e-06,
|
|
"loss": 0.498,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.18679765830024897,
|
|
"grad_norm": 1.4419967374301528,
|
|
"learning_rate": 9.369546686232199e-06,
|
|
"loss": 0.491,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.18733598008209407,
|
|
"grad_norm": 1.6800971553110484,
|
|
"learning_rate": 9.365300469545352e-06,
|
|
"loss": 0.453,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.18787430186393916,
|
|
"grad_norm": 1.4414646625492236,
|
|
"learning_rate": 9.361040970158876e-06,
|
|
"loss": 0.4844,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.18841262364578426,
|
|
"grad_norm": 1.4693828151901231,
|
|
"learning_rate": 9.356768201033542e-06,
|
|
"loss": 0.4846,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.18895094542762936,
|
|
"grad_norm": 1.6213301090422854,
|
|
"learning_rate": 9.35248217517049e-06,
|
|
"loss": 0.4528,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.18948926720947445,
|
|
"grad_norm": 1.3998204036117714,
|
|
"learning_rate": 9.348182905611209e-06,
|
|
"loss": 0.4677,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.19002758899131955,
|
|
"grad_norm": 1.4713366703366633,
|
|
"learning_rate": 9.343870405437477e-06,
|
|
"loss": 0.4292,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.19056591077316465,
|
|
"grad_norm": 1.941068700941172,
|
|
"learning_rate": 9.339544687771334e-06,
|
|
"loss": 0.5102,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.19110423255500975,
|
|
"grad_norm": 1.828849112653357,
|
|
"learning_rate": 9.335205765775039e-06,
|
|
"loss": 0.4638,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.19164255433685484,
|
|
"grad_norm": 1.6885129161638754,
|
|
"learning_rate": 9.330853652651026e-06,
|
|
"loss": 0.4391,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.19218087611869994,
|
|
"grad_norm": 1.7268115477491062,
|
|
"learning_rate": 9.326488361641867e-06,
|
|
"loss": 0.4557,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.19271919790054506,
|
|
"grad_norm": 1.369390489248521,
|
|
"learning_rate": 9.322109906030237e-06,
|
|
"loss": 0.4451,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.19325751968239016,
|
|
"grad_norm": 1.653269795096283,
|
|
"learning_rate": 9.31771829913886e-06,
|
|
"loss": 0.4466,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.19379584146423526,
|
|
"grad_norm": 1.6015504141518857,
|
|
"learning_rate": 9.313313554330484e-06,
|
|
"loss": 0.4977,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.19379584146423526,
|
|
"eval_loss": 0.4812440574169159,
|
|
"eval_runtime": 1528.9254,
|
|
"eval_samples_per_second": 16.358,
|
|
"eval_steps_per_second": 0.511,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.19433416324608035,
|
|
"grad_norm": 1.6899547102686612,
|
|
"learning_rate": 9.308895685007824e-06,
|
|
"loss": 0.5404,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.19487248502792545,
|
|
"grad_norm": 1.8153441873291498,
|
|
"learning_rate": 9.304464704613541e-06,
|
|
"loss": 0.5128,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.19541080680977055,
|
|
"grad_norm": 1.6094259149494354,
|
|
"learning_rate": 9.300020626630184e-06,
|
|
"loss": 0.4854,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.19594912859161565,
|
|
"grad_norm": 1.726004590201776,
|
|
"learning_rate": 9.295563464580153e-06,
|
|
"loss": 0.4827,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.19648745037346074,
|
|
"grad_norm": 1.7917006550897865,
|
|
"learning_rate": 9.29109323202567e-06,
|
|
"loss": 0.4689,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.19702577215530584,
|
|
"grad_norm": 2.067420755566304,
|
|
"learning_rate": 9.286609942568712e-06,
|
|
"loss": 0.4411,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.19756409393715094,
|
|
"grad_norm": 1.9439738397276571,
|
|
"learning_rate": 9.282113609851002e-06,
|
|
"loss": 0.4748,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.19810241571899603,
|
|
"grad_norm": 1.6206588657538272,
|
|
"learning_rate": 9.277604247553939e-06,
|
|
"loss": 0.5215,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.19864073750084113,
|
|
"grad_norm": 2.0968303117516136,
|
|
"learning_rate": 9.273081869398577e-06,
|
|
"loss": 0.4466,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.19917905928268623,
|
|
"grad_norm": 1.5483077144548956,
|
|
"learning_rate": 9.268546489145566e-06,
|
|
"loss": 0.5042,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.19971738106453132,
|
|
"grad_norm": 1.6430391903483688,
|
|
"learning_rate": 9.263998120595124e-06,
|
|
"loss": 0.4798,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.20025570284637642,
|
|
"grad_norm": 1.451263876582638,
|
|
"learning_rate": 9.259436777586991e-06,
|
|
"loss": 0.4498,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.20079402462822152,
|
|
"grad_norm": 1.924895097651951,
|
|
"learning_rate": 9.25486247400038e-06,
|
|
"loss": 0.4971,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.20133234641006661,
|
|
"grad_norm": 1.5044716731151997,
|
|
"learning_rate": 9.250275223753948e-06,
|
|
"loss": 0.4761,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.2018706681919117,
|
|
"grad_norm": 1.8105401635317677,
|
|
"learning_rate": 9.245675040805738e-06,
|
|
"loss": 0.4645,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.2024089899737568,
|
|
"grad_norm": 1.4400001043179194,
|
|
"learning_rate": 9.241061939153146e-06,
|
|
"loss": 0.5052,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.2029473117556019,
|
|
"grad_norm": 2.1898160128283046,
|
|
"learning_rate": 9.236435932832883e-06,
|
|
"loss": 0.4571,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.203485633537447,
|
|
"grad_norm": 1.728102995146478,
|
|
"learning_rate": 9.231797035920921e-06,
|
|
"loss": 0.459,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.2040239553192921,
|
|
"grad_norm": 1.5484346370702677,
|
|
"learning_rate": 9.227145262532458e-06,
|
|
"loss": 0.5106,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.2045622771011372,
|
|
"grad_norm": 1.5623742217769747,
|
|
"learning_rate": 9.222480626821868e-06,
|
|
"loss": 0.444,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.2051005988829823,
|
|
"grad_norm": 1.7091436440987169,
|
|
"learning_rate": 9.217803142982668e-06,
|
|
"loss": 0.4732,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.2056389206648274,
|
|
"grad_norm": 1.4196906974845203,
|
|
"learning_rate": 9.213112825247466e-06,
|
|
"loss": 0.4779,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.2061772424466725,
|
|
"grad_norm": 1.5167704426292719,
|
|
"learning_rate": 9.20840968788792e-06,
|
|
"loss": 0.4967,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.20671556422851758,
|
|
"grad_norm": 1.4170871947038493,
|
|
"learning_rate": 9.203693745214698e-06,
|
|
"loss": 0.491,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.20725388601036268,
|
|
"grad_norm": 1.5152939794668674,
|
|
"learning_rate": 9.19896501157743e-06,
|
|
"loss": 0.4541,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.2077922077922078,
|
|
"grad_norm": 1.9536536833455793,
|
|
"learning_rate": 9.19422350136467e-06,
|
|
"loss": 0.4799,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.2083305295740529,
|
|
"grad_norm": 2.316326510948496,
|
|
"learning_rate": 9.18946922900384e-06,
|
|
"loss": 0.4658,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.208868851355898,
|
|
"grad_norm": 1.2922243986398827,
|
|
"learning_rate": 9.184702208961204e-06,
|
|
"loss": 0.4057,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.2094071731377431,
|
|
"grad_norm": 1.8303479595554093,
|
|
"learning_rate": 9.179922455741812e-06,
|
|
"loss": 0.4427,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.2099454949195882,
|
|
"grad_norm": 1.541720900007236,
|
|
"learning_rate": 9.175129983889452e-06,
|
|
"loss": 0.516,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.2104838167014333,
|
|
"grad_norm": 1.9307101459341938,
|
|
"learning_rate": 9.17032480798662e-06,
|
|
"loss": 0.4349,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.2110221384832784,
|
|
"grad_norm": 1.3922182421272982,
|
|
"learning_rate": 9.165506942654468e-06,
|
|
"loss": 0.4816,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.21156046026512348,
|
|
"grad_norm": 1.6974151932118977,
|
|
"learning_rate": 9.16067640255275e-06,
|
|
"loss": 0.4812,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.21209878204696858,
|
|
"grad_norm": 1.4726854167474133,
|
|
"learning_rate": 9.155833202379798e-06,
|
|
"loss": 0.4717,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.21263710382881368,
|
|
"grad_norm": 1.8790922445419658,
|
|
"learning_rate": 9.150977356872456e-06,
|
|
"loss": 0.4885,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.21317542561065878,
|
|
"grad_norm": 1.9084443087840661,
|
|
"learning_rate": 9.146108880806056e-06,
|
|
"loss": 0.4633,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.21371374739250387,
|
|
"grad_norm": 1.6996601490386696,
|
|
"learning_rate": 9.141227788994348e-06,
|
|
"loss": 0.4453,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.21425206917434897,
|
|
"grad_norm": 1.7127514086857762,
|
|
"learning_rate": 9.136334096289485e-06,
|
|
"loss": 0.5144,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.21479039095619407,
|
|
"grad_norm": 1.4183339048304517,
|
|
"learning_rate": 9.131427817581953e-06,
|
|
"loss": 0.476,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.21532871273803916,
|
|
"grad_norm": 1.5688801517253075,
|
|
"learning_rate": 9.12650896780053e-06,
|
|
"loss": 0.4657,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.21586703451988426,
|
|
"grad_norm": 1.391080609496865,
|
|
"learning_rate": 9.121577561912256e-06,
|
|
"loss": 0.5043,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.21640535630172936,
|
|
"grad_norm": 3.302547702490585,
|
|
"learning_rate": 9.11663361492237e-06,
|
|
"loss": 0.497,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.21694367808357445,
|
|
"grad_norm": 1.7874988296563226,
|
|
"learning_rate": 9.111677141874273e-06,
|
|
"loss": 0.4465,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.21748199986541955,
|
|
"grad_norm": 1.830004021479594,
|
|
"learning_rate": 9.106708157849478e-06,
|
|
"loss": 0.5088,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.21802032164726465,
|
|
"grad_norm": 2.4236747379642267,
|
|
"learning_rate": 9.101726677967569e-06,
|
|
"loss": 0.4922,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.21855864342910974,
|
|
"grad_norm": 1.5488577176317244,
|
|
"learning_rate": 9.096732717386152e-06,
|
|
"loss": 0.497,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.21909696521095484,
|
|
"grad_norm": 2.3263014189367306,
|
|
"learning_rate": 9.091726291300806e-06,
|
|
"loss": 0.4791,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.21963528699279994,
|
|
"grad_norm": 1.7243223143837634,
|
|
"learning_rate": 9.086707414945044e-06,
|
|
"loss": 0.5192,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.22017360877464504,
|
|
"grad_norm": 1.3667216442420331,
|
|
"learning_rate": 9.08167610359026e-06,
|
|
"loss": 0.4816,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.22071193055649013,
|
|
"grad_norm": 1.4675898960533509,
|
|
"learning_rate": 9.076632372545688e-06,
|
|
"loss": 0.4694,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.22125025233833523,
|
|
"grad_norm": 1.725309532729321,
|
|
"learning_rate": 9.071576237158348e-06,
|
|
"loss": 0.5097,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.22178857412018033,
|
|
"grad_norm": 1.48659542538949,
|
|
"learning_rate": 9.066507712813009e-06,
|
|
"loss": 0.445,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.22232689590202542,
|
|
"grad_norm": 1.6287270540094485,
|
|
"learning_rate": 9.06142681493213e-06,
|
|
"loss": 0.4948,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.22286521768387052,
|
|
"grad_norm": 1.5275233090165254,
|
|
"learning_rate": 9.056333558975828e-06,
|
|
"loss": 0.4556,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.22340353946571564,
|
|
"grad_norm": 1.6620168630066545,
|
|
"learning_rate": 9.051227960441819e-06,
|
|
"loss": 0.4652,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.22394186124756074,
|
|
"grad_norm": 2.059601149156459,
|
|
"learning_rate": 9.046110034865374e-06,
|
|
"loss": 0.5085,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.22448018302940584,
|
|
"grad_norm": 1.762324556385875,
|
|
"learning_rate": 9.040979797819275e-06,
|
|
"loss": 0.4461,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.22501850481125094,
|
|
"grad_norm": 1.7567357923246754,
|
|
"learning_rate": 9.035837264913764e-06,
|
|
"loss": 0.4732,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.22555682659309603,
|
|
"grad_norm": 1.6696886078675257,
|
|
"learning_rate": 9.030682451796497e-06,
|
|
"loss": 0.4642,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.22609514837494113,
|
|
"grad_norm": 1.8175306322549967,
|
|
"learning_rate": 9.025515374152498e-06,
|
|
"loss": 0.4613,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.22609514837494113,
|
|
"eval_loss": 0.4776149392127991,
|
|
"eval_runtime": 1533.2316,
|
|
"eval_samples_per_second": 16.312,
|
|
"eval_steps_per_second": 0.51,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.22663347015678623,
|
|
"grad_norm": 1.7934239843519915,
|
|
"learning_rate": 9.020336047704105e-06,
|
|
"loss": 0.516,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.22717179193863132,
|
|
"grad_norm": 1.5310720805604554,
|
|
"learning_rate": 9.015144488210927e-06,
|
|
"loss": 0.489,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.22771011372047642,
|
|
"grad_norm": 1.48774951332565,
|
|
"learning_rate": 9.009940711469804e-06,
|
|
"loss": 0.5009,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.22824843550232152,
|
|
"grad_norm": 2.4756529462562145,
|
|
"learning_rate": 9.004724733314738e-06,
|
|
"loss": 0.4406,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.22878675728416661,
|
|
"grad_norm": 1.4505668733407078,
|
|
"learning_rate": 8.999496569616867e-06,
|
|
"loss": 0.4554,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.2293250790660117,
|
|
"grad_norm": 1.7945762191089136,
|
|
"learning_rate": 8.994256236284402e-06,
|
|
"loss": 0.4632,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.2298634008478568,
|
|
"grad_norm": 1.6376843185311614,
|
|
"learning_rate": 8.989003749262587e-06,
|
|
"loss": 0.4885,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.2304017226297019,
|
|
"grad_norm": 1.8830741232863908,
|
|
"learning_rate": 8.983739124533644e-06,
|
|
"loss": 0.5075,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.230940044411547,
|
|
"grad_norm": 1.3195150579928587,
|
|
"learning_rate": 8.978462378116729e-06,
|
|
"loss": 0.4708,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.2314783661933921,
|
|
"grad_norm": 3.7495214134368977,
|
|
"learning_rate": 8.973173526067883e-06,
|
|
"loss": 0.4286,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.2320166879752372,
|
|
"grad_norm": 2.359888838059791,
|
|
"learning_rate": 8.967872584479977e-06,
|
|
"loss": 0.5009,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.2325550097570823,
|
|
"grad_norm": 2.307039087438763,
|
|
"learning_rate": 8.962559569482677e-06,
|
|
"loss": 0.5676,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.2330933315389274,
|
|
"grad_norm": 1.6816015759212095,
|
|
"learning_rate": 8.957234497242378e-06,
|
|
"loss": 0.4741,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.2336316533207725,
|
|
"grad_norm": 1.322921614998224,
|
|
"learning_rate": 8.951897383962163e-06,
|
|
"loss": 0.4688,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.23416997510261758,
|
|
"grad_norm": 1.4430047272258668,
|
|
"learning_rate": 8.946548245881758e-06,
|
|
"loss": 0.4711,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.23470829688446268,
|
|
"grad_norm": 1.5731159349637571,
|
|
"learning_rate": 8.941187099277475e-06,
|
|
"loss": 0.5128,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.23524661866630778,
|
|
"grad_norm": 1.7731819377906834,
|
|
"learning_rate": 8.935813960462166e-06,
|
|
"loss": 0.4669,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.23578494044815287,
|
|
"grad_norm": 1.5736170200351274,
|
|
"learning_rate": 8.930428845785171e-06,
|
|
"loss": 0.5151,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.23632326222999797,
|
|
"grad_norm": 1.9488876650276103,
|
|
"learning_rate": 8.925031771632273e-06,
|
|
"loss": 0.449,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.23686158401184307,
|
|
"grad_norm": 1.8677275264654012,
|
|
"learning_rate": 8.919622754425645e-06,
|
|
"loss": 0.4758,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.23739990579368817,
|
|
"grad_norm": 1.6185523790901868,
|
|
"learning_rate": 8.914201810623796e-06,
|
|
"loss": 0.4539,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.23793822757553326,
|
|
"grad_norm": 1.7808483857096469,
|
|
"learning_rate": 8.908768956721535e-06,
|
|
"loss": 0.5022,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.2384765493573784,
|
|
"grad_norm": 1.5766134824810658,
|
|
"learning_rate": 8.903324209249895e-06,
|
|
"loss": 0.448,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.23901487113922348,
|
|
"grad_norm": 1.734675342226781,
|
|
"learning_rate": 8.897867584776114e-06,
|
|
"loss": 0.4646,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.23955319292106858,
|
|
"grad_norm": 1.5790149541067802,
|
|
"learning_rate": 8.892399099903564e-06,
|
|
"loss": 0.4786,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.24009151470291368,
|
|
"grad_norm": 1.4746994503206987,
|
|
"learning_rate": 8.8869187712717e-06,
|
|
"loss": 0.5055,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.24062983648475877,
|
|
"grad_norm": 1.629202002564735,
|
|
"learning_rate": 8.881426615556023e-06,
|
|
"loss": 0.4572,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.24116815826660387,
|
|
"grad_norm": 2.060742412650639,
|
|
"learning_rate": 8.875922649468019e-06,
|
|
"loss": 0.5032,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.24170648004844897,
|
|
"grad_norm": 1.5621749237333817,
|
|
"learning_rate": 8.87040688975511e-06,
|
|
"loss": 0.4654,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.24224480183029407,
|
|
"grad_norm": 1.4674899116105513,
|
|
"learning_rate": 8.864879353200599e-06,
|
|
"loss": 0.4747,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.24278312361213916,
|
|
"grad_norm": 1.5183875651941505,
|
|
"learning_rate": 8.859340056623632e-06,
|
|
"loss": 0.4982,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.24332144539398426,
|
|
"grad_norm": 1.5706370531453442,
|
|
"learning_rate": 8.853789016879134e-06,
|
|
"loss": 0.4667,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.24385976717582936,
|
|
"grad_norm": 1.6305623278282155,
|
|
"learning_rate": 8.84822625085776e-06,
|
|
"loss": 0.456,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.24439808895767445,
|
|
"grad_norm": 1.6523301690172285,
|
|
"learning_rate": 8.842651775485848e-06,
|
|
"loss": 0.5383,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.24493641073951955,
|
|
"grad_norm": 1.5998220743266833,
|
|
"learning_rate": 8.837065607725368e-06,
|
|
"loss": 0.4829,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.24547473252136465,
|
|
"grad_norm": 1.7862569885991761,
|
|
"learning_rate": 8.831467764573863e-06,
|
|
"loss": 0.5101,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.24601305430320974,
|
|
"grad_norm": 1.704691179868801,
|
|
"learning_rate": 8.8258582630644e-06,
|
|
"loss": 0.4627,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.24655137608505484,
|
|
"grad_norm": 1.7756811764982563,
|
|
"learning_rate": 8.820237120265526e-06,
|
|
"loss": 0.5079,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.24708969786689994,
|
|
"grad_norm": 1.3696742776597963,
|
|
"learning_rate": 8.814604353281206e-06,
|
|
"loss": 0.4393,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.24762801964874503,
|
|
"grad_norm": 2.7637461827933083,
|
|
"learning_rate": 8.80895997925078e-06,
|
|
"loss": 0.4548,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.24816634143059013,
|
|
"grad_norm": 1.9115795242982947,
|
|
"learning_rate": 8.803304015348894e-06,
|
|
"loss": 0.4805,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.24870466321243523,
|
|
"grad_norm": 1.6805506691737162,
|
|
"learning_rate": 8.797636478785475e-06,
|
|
"loss": 0.4786,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.24924298499428033,
|
|
"grad_norm": 1.865661091263274,
|
|
"learning_rate": 8.791957386805651e-06,
|
|
"loss": 0.4722,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.24978130677612542,
|
|
"grad_norm": 1.9405317358586787,
|
|
"learning_rate": 8.78626675668972e-06,
|
|
"loss": 0.4705,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.2503196285579705,
|
|
"grad_norm": 1.4415009315383829,
|
|
"learning_rate": 8.78056460575308e-06,
|
|
"loss": 0.4301,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.2508579503398156,
|
|
"grad_norm": 1.6060330602526178,
|
|
"learning_rate": 8.774850951346188e-06,
|
|
"loss": 0.4114,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.2513962721216607,
|
|
"grad_norm": 1.7567677906852937,
|
|
"learning_rate": 8.769125810854504e-06,
|
|
"loss": 0.4922,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.2519345939035058,
|
|
"grad_norm": 1.4281502602519498,
|
|
"learning_rate": 8.763389201698438e-06,
|
|
"loss": 0.4426,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.2524729156853509,
|
|
"grad_norm": 1.787920776798679,
|
|
"learning_rate": 8.757641141333296e-06,
|
|
"loss": 0.4451,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.253011237467196,
|
|
"grad_norm": 1.4246034781799948,
|
|
"learning_rate": 8.751881647249228e-06,
|
|
"loss": 0.4353,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.2535495592490411,
|
|
"grad_norm": 1.6679185342871934,
|
|
"learning_rate": 8.746110736971175e-06,
|
|
"loss": 0.4573,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.2540878810308862,
|
|
"grad_norm": 1.6765594656197593,
|
|
"learning_rate": 8.740328428058813e-06,
|
|
"loss": 0.4797,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.2546262028127313,
|
|
"grad_norm": 1.7826390062476167,
|
|
"learning_rate": 8.734534738106503e-06,
|
|
"loss": 0.473,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.2551645245945764,
|
|
"grad_norm": 2.195730177211015,
|
|
"learning_rate": 8.728729684743238e-06,
|
|
"loss": 0.4648,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.2557028463764215,
|
|
"grad_norm": 1.475566632306908,
|
|
"learning_rate": 8.722913285632584e-06,
|
|
"loss": 0.4845,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.2562411681582666,
|
|
"grad_norm": 1.7347583810505152,
|
|
"learning_rate": 8.717085558472631e-06,
|
|
"loss": 0.4708,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.2567794899401117,
|
|
"grad_norm": 1.6902146229456119,
|
|
"learning_rate": 8.71124652099594e-06,
|
|
"loss": 0.4817,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.2573178117219568,
|
|
"grad_norm": 1.7071042054828858,
|
|
"learning_rate": 8.705396190969484e-06,
|
|
"loss": 0.4712,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.2578561335038019,
|
|
"grad_norm": 1.729348975756144,
|
|
"learning_rate": 8.699534586194598e-06,
|
|
"loss": 0.4881,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.258394455285647,
|
|
"grad_norm": 1.4614872127177663,
|
|
"learning_rate": 8.693661724506924e-06,
|
|
"loss": 0.457,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.258394455285647,
|
|
"eval_loss": 0.4751787483692169,
|
|
"eval_runtime": 1539.7899,
|
|
"eval_samples_per_second": 16.242,
|
|
"eval_steps_per_second": 0.508,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.25893277706749207,
|
|
"grad_norm": 2.1154756500873977,
|
|
"learning_rate": 8.687777623776357e-06,
|
|
"loss": 0.4842,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.25947109884933717,
|
|
"grad_norm": 1.5862460419373354,
|
|
"learning_rate": 8.681882301906988e-06,
|
|
"loss": 0.4432,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.26000942063118226,
|
|
"grad_norm": 1.796404843665338,
|
|
"learning_rate": 8.675975776837053e-06,
|
|
"loss": 0.4759,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.26054774241302736,
|
|
"grad_norm": 1.5555927859924092,
|
|
"learning_rate": 8.67005806653888e-06,
|
|
"loss": 0.509,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.26108606419487246,
|
|
"grad_norm": 2.1699720622194354,
|
|
"learning_rate": 8.664129189018826e-06,
|
|
"loss": 0.5334,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.2616243859767176,
|
|
"grad_norm": 1.690073634180223,
|
|
"learning_rate": 8.658189162317226e-06,
|
|
"loss": 0.4356,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.2621627077585627,
|
|
"grad_norm": 1.8294975401345657,
|
|
"learning_rate": 8.65223800450835e-06,
|
|
"loss": 0.4387,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.2627010295404078,
|
|
"grad_norm": 2.5288130694594337,
|
|
"learning_rate": 8.646275733700327e-06,
|
|
"loss": 0.4567,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.2632393513222529,
|
|
"grad_norm": 1.957861459161194,
|
|
"learning_rate": 8.640302368035105e-06,
|
|
"loss": 0.4614,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.263777673104098,
|
|
"grad_norm": 1.5304950580333017,
|
|
"learning_rate": 8.634317925688392e-06,
|
|
"loss": 0.4655,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.2643159948859431,
|
|
"grad_norm": 1.667011172421826,
|
|
"learning_rate": 8.628322424869599e-06,
|
|
"loss": 0.4834,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.2648543166677882,
|
|
"grad_norm": 2.1636641173694464,
|
|
"learning_rate": 8.622315883821783e-06,
|
|
"loss": 0.4776,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.2653926384496333,
|
|
"grad_norm": 1.46798046973594,
|
|
"learning_rate": 8.616298320821601e-06,
|
|
"loss": 0.4272,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.2659309602314784,
|
|
"grad_norm": 1.861178177564276,
|
|
"learning_rate": 8.61026975417924e-06,
|
|
"loss": 0.4784,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.2664692820133235,
|
|
"grad_norm": 1.6268110739530368,
|
|
"learning_rate": 8.604230202238373e-06,
|
|
"loss": 0.5029,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.2670076037951686,
|
|
"grad_norm": 1.5680263307618678,
|
|
"learning_rate": 8.598179683376098e-06,
|
|
"loss": 0.4225,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.2675459255770137,
|
|
"grad_norm": 1.5774347517397593,
|
|
"learning_rate": 8.592118216002883e-06,
|
|
"loss": 0.4879,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.2680842473588588,
|
|
"grad_norm": 2.670832440569625,
|
|
"learning_rate": 8.586045818562508e-06,
|
|
"loss": 0.4667,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.26862256914070387,
|
|
"grad_norm": 2.2055704035459787,
|
|
"learning_rate": 8.579962509532016e-06,
|
|
"loss": 0.4331,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.26916089092254897,
|
|
"grad_norm": 1.4435727148058994,
|
|
"learning_rate": 8.573868307421648e-06,
|
|
"loss": 0.4894,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.26969921270439406,
|
|
"grad_norm": 1.6814136996880347,
|
|
"learning_rate": 8.567763230774789e-06,
|
|
"loss": 0.4697,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.27023753448623916,
|
|
"grad_norm": 1.5774141123551826,
|
|
"learning_rate": 8.561647298167918e-06,
|
|
"loss": 0.503,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.27077585626808426,
|
|
"grad_norm": 1.5778826165083357,
|
|
"learning_rate": 8.555520528210541e-06,
|
|
"loss": 0.4535,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.27131417804992936,
|
|
"grad_norm": 1.7129721491097367,
|
|
"learning_rate": 8.549382939545143e-06,
|
|
"loss": 0.4494,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.27185249983177445,
|
|
"grad_norm": 1.8943346844828264,
|
|
"learning_rate": 8.543234550847128e-06,
|
|
"loss": 0.5063,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.27239082161361955,
|
|
"grad_norm": 1.5886936361058726,
|
|
"learning_rate": 8.537075380824761e-06,
|
|
"loss": 0.4652,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.27292914339546465,
|
|
"grad_norm": 1.4831172032030655,
|
|
"learning_rate": 8.530905448219112e-06,
|
|
"loss": 0.4243,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.27346746517730974,
|
|
"grad_norm": 1.7919686995453996,
|
|
"learning_rate": 8.524724771804001e-06,
|
|
"loss": 0.5049,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.27400578695915484,
|
|
"grad_norm": 1.7505822684442558,
|
|
"learning_rate": 8.518533370385939e-06,
|
|
"loss": 0.4423,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.27454410874099994,
|
|
"grad_norm": 1.5798026347891434,
|
|
"learning_rate": 8.512331262804069e-06,
|
|
"loss": 0.4866,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.27508243052284503,
|
|
"grad_norm": 1.8464155171834333,
|
|
"learning_rate": 8.506118467930112e-06,
|
|
"loss": 0.4708,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.27562075230469013,
|
|
"grad_norm": 1.6897436623195476,
|
|
"learning_rate": 8.499895004668308e-06,
|
|
"loss": 0.4903,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.27615907408653523,
|
|
"grad_norm": 1.7863457448170967,
|
|
"learning_rate": 8.49366089195536e-06,
|
|
"loss": 0.5092,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.2766973958683803,
|
|
"grad_norm": 1.7320740104134424,
|
|
"learning_rate": 8.487416148760375e-06,
|
|
"loss": 0.48,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.2772357176502254,
|
|
"grad_norm": 1.7064456081649735,
|
|
"learning_rate": 8.481160794084799e-06,
|
|
"loss": 0.4754,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.2777740394320705,
|
|
"grad_norm": 1.7525756365837095,
|
|
"learning_rate": 8.47489484696238e-06,
|
|
"loss": 0.427,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.2783123612139156,
|
|
"grad_norm": 2.058946941055886,
|
|
"learning_rate": 8.468618326459086e-06,
|
|
"loss": 0.4847,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.2788506829957607,
|
|
"grad_norm": 2.0477477556261467,
|
|
"learning_rate": 8.46233125167306e-06,
|
|
"loss": 0.4579,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.2793890047776058,
|
|
"grad_norm": 1.783616738245662,
|
|
"learning_rate": 8.456033641734562e-06,
|
|
"loss": 0.4858,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.2799273265594509,
|
|
"grad_norm": 2.0513841896237444,
|
|
"learning_rate": 8.449725515805907e-06,
|
|
"loss": 0.5352,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.280465648341296,
|
|
"grad_norm": 1.6372025528727123,
|
|
"learning_rate": 8.443406893081406e-06,
|
|
"loss": 0.4618,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.2810039701231411,
|
|
"grad_norm": 1.5571805104955587,
|
|
"learning_rate": 8.437077792787314e-06,
|
|
"loss": 0.4038,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.2815422919049862,
|
|
"grad_norm": 1.75233105631481,
|
|
"learning_rate": 8.43073823418176e-06,
|
|
"loss": 0.4845,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.2820806136868313,
|
|
"grad_norm": 1.6881033261753147,
|
|
"learning_rate": 8.424388236554704e-06,
|
|
"loss": 0.4865,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.2826189354686764,
|
|
"grad_norm": 1.796069079351986,
|
|
"learning_rate": 8.418027819227861e-06,
|
|
"loss": 0.4538,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.2831572572505215,
|
|
"grad_norm": 1.24349614978993,
|
|
"learning_rate": 8.41165700155466e-06,
|
|
"loss": 0.4166,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.2836955790323666,
|
|
"grad_norm": 1.932274887854439,
|
|
"learning_rate": 8.405275802920168e-06,
|
|
"loss": 0.5061,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.2842339008142117,
|
|
"grad_norm": 1.5593268393001998,
|
|
"learning_rate": 8.398884242741045e-06,
|
|
"loss": 0.4894,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.2847722225960568,
|
|
"grad_norm": 1.7069043502360113,
|
|
"learning_rate": 8.392482340465475e-06,
|
|
"loss": 0.4485,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.2853105443779019,
|
|
"grad_norm": 1.5063144141336193,
|
|
"learning_rate": 8.386070115573115e-06,
|
|
"loss": 0.4175,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.285848866159747,
|
|
"grad_norm": 1.4364305869165457,
|
|
"learning_rate": 8.379647587575026e-06,
|
|
"loss": 0.4766,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.28638718794159207,
|
|
"grad_norm": 1.3932649525614649,
|
|
"learning_rate": 8.373214776013625e-06,
|
|
"loss": 0.406,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.28692550972343717,
|
|
"grad_norm": 1.5523357464392091,
|
|
"learning_rate": 8.366771700462615e-06,
|
|
"loss": 0.508,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.28746383150528226,
|
|
"grad_norm": 2.1213305217928613,
|
|
"learning_rate": 8.360318380526932e-06,
|
|
"loss": 0.4985,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.28800215328712736,
|
|
"grad_norm": 1.5873480547904262,
|
|
"learning_rate": 8.353854835842685e-06,
|
|
"loss": 0.4919,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.28854047506897246,
|
|
"grad_norm": 1.5670280821673355,
|
|
"learning_rate": 8.347381086077095e-06,
|
|
"loss": 0.4708,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.28907879685081755,
|
|
"grad_norm": 1.6763746949820768,
|
|
"learning_rate": 8.34089715092843e-06,
|
|
"loss": 0.4165,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.28961711863266265,
|
|
"grad_norm": 1.5717106133141925,
|
|
"learning_rate": 8.334403050125956e-06,
|
|
"loss": 0.4554,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.29015544041450775,
|
|
"grad_norm": 1.9743994746638458,
|
|
"learning_rate": 8.327898803429866e-06,
|
|
"loss": 0.4695,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.29069376219635285,
|
|
"grad_norm": 1.5473676266482859,
|
|
"learning_rate": 8.32138443063123e-06,
|
|
"loss": 0.4712,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.29069376219635285,
|
|
"eval_loss": 0.47182729840278625,
|
|
"eval_runtime": 1553.992,
|
|
"eval_samples_per_second": 16.094,
|
|
"eval_steps_per_second": 0.503,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.29123208397819794,
|
|
"grad_norm": 1.4425882953477511,
|
|
"learning_rate": 8.314859951551926e-06,
|
|
"loss": 0.4837,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 0.29177040576004304,
|
|
"grad_norm": 1.3326493426074462,
|
|
"learning_rate": 8.308325386044583e-06,
|
|
"loss": 0.4814,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 0.2923087275418882,
|
|
"grad_norm": 1.6128638362772016,
|
|
"learning_rate": 8.301780753992523e-06,
|
|
"loss": 0.4575,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 0.2928470493237333,
|
|
"grad_norm": 1.4423693981211698,
|
|
"learning_rate": 8.295226075309697e-06,
|
|
"loss": 0.4633,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 0.2933853711055784,
|
|
"grad_norm": 1.6198600771922913,
|
|
"learning_rate": 8.288661369940627e-06,
|
|
"loss": 0.4463,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.2939236928874235,
|
|
"grad_norm": 1.5249628074643904,
|
|
"learning_rate": 8.282086657860342e-06,
|
|
"loss": 0.4668,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 0.2944620146692686,
|
|
"grad_norm": 1.8125904384120293,
|
|
"learning_rate": 8.275501959074325e-06,
|
|
"loss": 0.4825,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 0.2950003364511137,
|
|
"grad_norm": 1.9606743516276068,
|
|
"learning_rate": 8.268907293618437e-06,
|
|
"loss": 0.4684,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 0.2955386582329588,
|
|
"grad_norm": 1.494990763192773,
|
|
"learning_rate": 8.262302681558872e-06,
|
|
"loss": 0.4664,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 0.29607698001480387,
|
|
"grad_norm": 1.8337579001893594,
|
|
"learning_rate": 8.255688142992089e-06,
|
|
"loss": 0.4699,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.29661530179664897,
|
|
"grad_norm": 1.779841389754219,
|
|
"learning_rate": 8.24906369804475e-06,
|
|
"loss": 0.4857,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 0.29715362357849406,
|
|
"grad_norm": 1.6593925240524081,
|
|
"learning_rate": 8.242429366873663e-06,
|
|
"loss": 0.5038,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 0.29769194536033916,
|
|
"grad_norm": 1.9956877344800352,
|
|
"learning_rate": 8.235785169665711e-06,
|
|
"loss": 0.4911,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 0.29823026714218426,
|
|
"grad_norm": 1.579568204329291,
|
|
"learning_rate": 8.229131126637804e-06,
|
|
"loss": 0.4552,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 0.29876858892402935,
|
|
"grad_norm": 1.5989428055850947,
|
|
"learning_rate": 8.222467258036808e-06,
|
|
"loss": 0.5177,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.29930691070587445,
|
|
"grad_norm": 2.349536199541145,
|
|
"learning_rate": 8.215793584139485e-06,
|
|
"loss": 0.4911,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 0.29984523248771955,
|
|
"grad_norm": 1.9403593317863332,
|
|
"learning_rate": 8.209110125252435e-06,
|
|
"loss": 0.5061,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 0.30038355426956465,
|
|
"grad_norm": 1.7346564666609186,
|
|
"learning_rate": 8.202416901712033e-06,
|
|
"loss": 0.4357,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 0.30092187605140974,
|
|
"grad_norm": 1.710471255918245,
|
|
"learning_rate": 8.195713933884359e-06,
|
|
"loss": 0.5015,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 0.30146019783325484,
|
|
"grad_norm": 2.207816727293276,
|
|
"learning_rate": 8.189001242165151e-06,
|
|
"loss": 0.527,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.30199851961509994,
|
|
"grad_norm": 1.428363458277829,
|
|
"learning_rate": 8.182278846979728e-06,
|
|
"loss": 0.4983,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 0.30253684139694503,
|
|
"grad_norm": 1.77069966551508,
|
|
"learning_rate": 8.175546768782938e-06,
|
|
"loss": 0.4996,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 0.30307516317879013,
|
|
"grad_norm": 1.631420375855133,
|
|
"learning_rate": 8.168805028059095e-06,
|
|
"loss": 0.4899,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 0.3036134849606352,
|
|
"grad_norm": 1.6234744365340297,
|
|
"learning_rate": 8.162053645321908e-06,
|
|
"loss": 0.4275,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 0.3041518067424803,
|
|
"grad_norm": 1.7151129037835051,
|
|
"learning_rate": 8.15529264111443e-06,
|
|
"loss": 0.4628,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.3046901285243254,
|
|
"grad_norm": 1.6757537025608307,
|
|
"learning_rate": 8.148522036008985e-06,
|
|
"loss": 0.4636,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 0.3052284503061705,
|
|
"grad_norm": 1.157809434742461,
|
|
"learning_rate": 8.141741850607117e-06,
|
|
"loss": 0.3868,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 0.3057667720880156,
|
|
"grad_norm": 1.4360027236144732,
|
|
"learning_rate": 8.134952105539515e-06,
|
|
"loss": 0.4725,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 0.3063050938698607,
|
|
"grad_norm": 1.6762158717929798,
|
|
"learning_rate": 8.128152821465957e-06,
|
|
"loss": 0.4818,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 0.3068434156517058,
|
|
"grad_norm": 1.6736535469921034,
|
|
"learning_rate": 8.121344019075253e-06,
|
|
"loss": 0.4805,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.3073817374335509,
|
|
"grad_norm": 1.5918931966460608,
|
|
"learning_rate": 8.114525719085163e-06,
|
|
"loss": 0.5152,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 0.307920059215396,
|
|
"grad_norm": 1.4169517878992852,
|
|
"learning_rate": 8.107697942242356e-06,
|
|
"loss": 0.4731,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 0.3084583809972411,
|
|
"grad_norm": 1.5959353428431666,
|
|
"learning_rate": 8.100860709322334e-06,
|
|
"loss": 0.4463,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 0.3089967027790862,
|
|
"grad_norm": 1.4569323564340282,
|
|
"learning_rate": 8.094014041129373e-06,
|
|
"loss": 0.4046,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 0.3095350245609313,
|
|
"grad_norm": 1.5558748525412556,
|
|
"learning_rate": 8.087157958496456e-06,
|
|
"loss": 0.4644,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.3100733463427764,
|
|
"grad_norm": 1.6641076139574378,
|
|
"learning_rate": 8.080292482285213e-06,
|
|
"loss": 0.5064,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 0.3106116681246215,
|
|
"grad_norm": 1.5793644667521578,
|
|
"learning_rate": 8.07341763338586e-06,
|
|
"loss": 0.515,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 0.3111499899064666,
|
|
"grad_norm": 1.895774618714942,
|
|
"learning_rate": 8.066533432717127e-06,
|
|
"loss": 0.4763,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 0.3116883116883117,
|
|
"grad_norm": 1.6689610869771314,
|
|
"learning_rate": 8.059639901226203e-06,
|
|
"loss": 0.4487,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 0.3122266334701568,
|
|
"grad_norm": 1.4289516860868958,
|
|
"learning_rate": 8.05273705988867e-06,
|
|
"loss": 0.426,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.3127649552520019,
|
|
"grad_norm": 1.448460429863824,
|
|
"learning_rate": 8.04582492970843e-06,
|
|
"loss": 0.4622,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 0.31330327703384697,
|
|
"grad_norm": 1.562340995796949,
|
|
"learning_rate": 8.038903531717662e-06,
|
|
"loss": 0.4644,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 0.31384159881569207,
|
|
"grad_norm": 1.4837986133941243,
|
|
"learning_rate": 8.031972886976731e-06,
|
|
"loss": 0.4845,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 0.31437992059753717,
|
|
"grad_norm": 1.696043847539263,
|
|
"learning_rate": 8.025033016574148e-06,
|
|
"loss": 0.4631,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 0.31491824237938226,
|
|
"grad_norm": 1.8636443570370922,
|
|
"learning_rate": 8.018083941626494e-06,
|
|
"loss": 0.4582,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.31545656416122736,
|
|
"grad_norm": 1.6588060343624296,
|
|
"learning_rate": 8.011125683278351e-06,
|
|
"loss": 0.4118,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 0.31599488594307246,
|
|
"grad_norm": 2.064927405044272,
|
|
"learning_rate": 8.004158262702253e-06,
|
|
"loss": 0.5307,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 0.31653320772491755,
|
|
"grad_norm": 1.7599540523459494,
|
|
"learning_rate": 7.997181701098608e-06,
|
|
"loss": 0.4542,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 0.31707152950676265,
|
|
"grad_norm": 1.679120614548226,
|
|
"learning_rate": 7.99019601969564e-06,
|
|
"loss": 0.4462,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 0.31760985128860775,
|
|
"grad_norm": 1.6748781594901945,
|
|
"learning_rate": 7.983201239749321e-06,
|
|
"loss": 0.4435,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.31814817307045284,
|
|
"grad_norm": 1.6895768411385892,
|
|
"learning_rate": 7.976197382543306e-06,
|
|
"loss": 0.5043,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 0.31868649485229794,
|
|
"grad_norm": 1.4551705590923076,
|
|
"learning_rate": 7.969184469388877e-06,
|
|
"loss": 0.4992,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 0.31922481663414304,
|
|
"grad_norm": 1.8224446520059305,
|
|
"learning_rate": 7.962162521624865e-06,
|
|
"loss": 0.5242,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 0.31976313841598814,
|
|
"grad_norm": 1.5471915857747345,
|
|
"learning_rate": 7.955131560617595e-06,
|
|
"loss": 0.4672,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 0.32030146019783323,
|
|
"grad_norm": 1.943277469873626,
|
|
"learning_rate": 7.948091607760815e-06,
|
|
"loss": 0.4817,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.32083978197967833,
|
|
"grad_norm": 1.361762394527565,
|
|
"learning_rate": 7.941042684475635e-06,
|
|
"loss": 0.4341,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 0.3213781037615234,
|
|
"grad_norm": 1.578768861245864,
|
|
"learning_rate": 7.933984812210459e-06,
|
|
"loss": 0.452,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 0.3219164255433685,
|
|
"grad_norm": 1.3732353872225034,
|
|
"learning_rate": 7.926918012440923e-06,
|
|
"loss": 0.4349,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 0.3224547473252136,
|
|
"grad_norm": 1.8064334973816905,
|
|
"learning_rate": 7.919842306669825e-06,
|
|
"loss": 0.4499,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 0.3229930691070587,
|
|
"grad_norm": 1.582853458222087,
|
|
"learning_rate": 7.912757716427062e-06,
|
|
"loss": 0.4865,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.3229930691070587,
|
|
"eval_loss": 0.4672350585460663,
|
|
"eval_runtime": 1563.3319,
|
|
"eval_samples_per_second": 15.998,
|
|
"eval_steps_per_second": 0.5,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.32353139088890387,
|
|
"grad_norm": 1.6009402167895466,
|
|
"learning_rate": 7.905664263269567e-06,
|
|
"loss": 0.4576,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 0.32406971267074897,
|
|
"grad_norm": 1.6832973254975117,
|
|
"learning_rate": 7.898561968781242e-06,
|
|
"loss": 0.457,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 0.32460803445259406,
|
|
"grad_norm": 4.046599916473538,
|
|
"learning_rate": 7.891450854572884e-06,
|
|
"loss": 0.49,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 0.32514635623443916,
|
|
"grad_norm": 1.5254137578843718,
|
|
"learning_rate": 7.884330942282136e-06,
|
|
"loss": 0.4533,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 0.32568467801628426,
|
|
"grad_norm": 1.5392402810831298,
|
|
"learning_rate": 7.877202253573404e-06,
|
|
"loss": 0.4566,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.32622299979812935,
|
|
"grad_norm": 1.5838863815714255,
|
|
"learning_rate": 7.870064810137806e-06,
|
|
"loss": 0.4224,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 0.32676132157997445,
|
|
"grad_norm": 1.5112598539099842,
|
|
"learning_rate": 7.862918633693091e-06,
|
|
"loss": 0.4537,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 0.32729964336181955,
|
|
"grad_norm": 1.7380984306062113,
|
|
"learning_rate": 7.855763745983588e-06,
|
|
"loss": 0.5168,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 0.32783796514366464,
|
|
"grad_norm": 1.3686616623355445,
|
|
"learning_rate": 7.848600168780127e-06,
|
|
"loss": 0.4774,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 0.32837628692550974,
|
|
"grad_norm": 1.8037345014596735,
|
|
"learning_rate": 7.841427923879982e-06,
|
|
"loss": 0.4841,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.32891460870735484,
|
|
"grad_norm": 1.5578093278723995,
|
|
"learning_rate": 7.834247033106798e-06,
|
|
"loss": 0.4494,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 0.32945293048919994,
|
|
"grad_norm": 1.7470526074648303,
|
|
"learning_rate": 7.827057518310532e-06,
|
|
"loss": 0.4316,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 0.32999125227104503,
|
|
"grad_norm": 1.344635684714144,
|
|
"learning_rate": 7.819859401367376e-06,
|
|
"loss": 0.4277,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 0.33052957405289013,
|
|
"grad_norm": 1.6142148463610868,
|
|
"learning_rate": 7.8126527041797e-06,
|
|
"loss": 0.4732,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 0.3310678958347352,
|
|
"grad_norm": 1.4894686294102883,
|
|
"learning_rate": 7.805437448675986e-06,
|
|
"loss": 0.4804,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.3316062176165803,
|
|
"grad_norm": 1.959553525810308,
|
|
"learning_rate": 7.798213656810747e-06,
|
|
"loss": 0.5052,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 0.3321445393984254,
|
|
"grad_norm": 1.5799236754205312,
|
|
"learning_rate": 7.790981350564482e-06,
|
|
"loss": 0.432,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 0.3326828611802705,
|
|
"grad_norm": 1.82490515289263,
|
|
"learning_rate": 7.783740551943586e-06,
|
|
"loss": 0.4394,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 0.3332211829621156,
|
|
"grad_norm": 1.5031228288941465,
|
|
"learning_rate": 7.776491282980305e-06,
|
|
"loss": 0.5064,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 0.3337595047439607,
|
|
"grad_norm": 1.4329349118783261,
|
|
"learning_rate": 7.76923356573265e-06,
|
|
"loss": 0.489,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.3342978265258058,
|
|
"grad_norm": 1.4961946186338742,
|
|
"learning_rate": 7.761967422284347e-06,
|
|
"loss": 0.4704,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 0.3348361483076509,
|
|
"grad_norm": 1.7319823672043928,
|
|
"learning_rate": 7.754692874744752e-06,
|
|
"loss": 0.4621,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 0.335374470089496,
|
|
"grad_norm": 2.0507693298974035,
|
|
"learning_rate": 7.747409945248797e-06,
|
|
"loss": 0.502,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 0.3359127918713411,
|
|
"grad_norm": 1.4817353671174234,
|
|
"learning_rate": 7.74011865595692e-06,
|
|
"loss": 0.4975,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 0.3364511136531862,
|
|
"grad_norm": 1.5154706925154366,
|
|
"learning_rate": 7.732819029054999e-06,
|
|
"loss": 0.4819,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.3369894354350313,
|
|
"grad_norm": 2.9866409096863507,
|
|
"learning_rate": 7.725511086754269e-06,
|
|
"loss": 0.4947,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 0.3375277572168764,
|
|
"grad_norm": 1.7699700957236326,
|
|
"learning_rate": 7.718194851291284e-06,
|
|
"loss": 0.4703,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 0.3380660789987215,
|
|
"grad_norm": 2.371528841529566,
|
|
"learning_rate": 7.710870344927817e-06,
|
|
"loss": 0.5458,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 0.3386044007805666,
|
|
"grad_norm": 1.5200234564971724,
|
|
"learning_rate": 7.703537589950819e-06,
|
|
"loss": 0.4562,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 0.3391427225624117,
|
|
"grad_norm": 1.371146036616362,
|
|
"learning_rate": 7.696196608672333e-06,
|
|
"loss": 0.4196,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.3396810443442568,
|
|
"grad_norm": 1.5627852767313657,
|
|
"learning_rate": 7.688847423429434e-06,
|
|
"loss": 0.505,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 0.3402193661261019,
|
|
"grad_norm": 1.3089486655111793,
|
|
"learning_rate": 7.68149005658417e-06,
|
|
"loss": 0.4532,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 0.34075768790794697,
|
|
"grad_norm": 1.72862210074593,
|
|
"learning_rate": 7.674124530523461e-06,
|
|
"loss": 0.5431,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 0.34129600968979207,
|
|
"grad_norm": 1.397330557638678,
|
|
"learning_rate": 7.666750867659078e-06,
|
|
"loss": 0.46,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 0.34183433147163717,
|
|
"grad_norm": 1.5822930242940645,
|
|
"learning_rate": 7.659369090427537e-06,
|
|
"loss": 0.5183,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.34237265325348226,
|
|
"grad_norm": 1.517257101602274,
|
|
"learning_rate": 7.651979221290049e-06,
|
|
"loss": 0.4847,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 0.34291097503532736,
|
|
"grad_norm": 1.569552765274582,
|
|
"learning_rate": 7.644581282732445e-06,
|
|
"loss": 0.5237,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 0.34344929681717246,
|
|
"grad_norm": 1.5173887839906304,
|
|
"learning_rate": 7.637175297265109e-06,
|
|
"loss": 0.444,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 0.34398761859901755,
|
|
"grad_norm": 2.6037607041595883,
|
|
"learning_rate": 7.629761287422915e-06,
|
|
"loss": 0.4271,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 0.34452594038086265,
|
|
"grad_norm": 1.6900192017878133,
|
|
"learning_rate": 7.622339275765147e-06,
|
|
"loss": 0.4631,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.34506426216270775,
|
|
"grad_norm": 1.6204089265699804,
|
|
"learning_rate": 7.61490928487544e-06,
|
|
"loss": 0.4798,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 0.34560258394455284,
|
|
"grad_norm": 2.072148397739707,
|
|
"learning_rate": 7.6074713373617094e-06,
|
|
"loss": 0.5169,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 0.34614090572639794,
|
|
"grad_norm": 1.4489303833679512,
|
|
"learning_rate": 7.600025455856078e-06,
|
|
"loss": 0.4477,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 0.34667922750824304,
|
|
"grad_norm": 1.808968142318587,
|
|
"learning_rate": 7.592571663014811e-06,
|
|
"loss": 0.4591,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 0.34721754929008813,
|
|
"grad_norm": 1.4861828747421941,
|
|
"learning_rate": 7.5851099815182505e-06,
|
|
"loss": 0.4792,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.34775587107193323,
|
|
"grad_norm": 1.6729126421729203,
|
|
"learning_rate": 7.577640434070734e-06,
|
|
"loss": 0.4832,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 0.34829419285377833,
|
|
"grad_norm": 1.871195222211602,
|
|
"learning_rate": 7.5701630434005405e-06,
|
|
"loss": 0.4417,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 0.3488325146356234,
|
|
"grad_norm": 1.51735945461571,
|
|
"learning_rate": 7.56267783225981e-06,
|
|
"loss": 0.4741,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 0.3493708364174685,
|
|
"grad_norm": 2.071142969866682,
|
|
"learning_rate": 7.555184823424479e-06,
|
|
"loss": 0.4127,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 0.3499091581993136,
|
|
"grad_norm": 1.910282433363155,
|
|
"learning_rate": 7.547684039694216e-06,
|
|
"loss": 0.4531,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.3504474799811587,
|
|
"grad_norm": 1.9652818314978835,
|
|
"learning_rate": 7.54017550389234e-06,
|
|
"loss": 0.5085,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 0.3509858017630038,
|
|
"grad_norm": 1.6117024086203307,
|
|
"learning_rate": 7.5326592388657605e-06,
|
|
"loss": 0.5148,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 0.3515241235448489,
|
|
"grad_norm": 1.4960314880258612,
|
|
"learning_rate": 7.525135267484906e-06,
|
|
"loss": 0.4629,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 0.352062445326694,
|
|
"grad_norm": 1.604228922752054,
|
|
"learning_rate": 7.517603612643653e-06,
|
|
"loss": 0.5117,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 0.3526007671085391,
|
|
"grad_norm": 2.136019956641433,
|
|
"learning_rate": 7.5100642972592606e-06,
|
|
"loss": 0.4629,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.3531390888903842,
|
|
"grad_norm": 1.4857693238664922,
|
|
"learning_rate": 7.50251734427229e-06,
|
|
"loss": 0.4671,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 0.3536774106722293,
|
|
"grad_norm": 1.4380772688023766,
|
|
"learning_rate": 7.494962776646549e-06,
|
|
"loss": 0.428,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 0.35421573245407445,
|
|
"grad_norm": 1.7510803552126726,
|
|
"learning_rate": 7.487400617369013e-06,
|
|
"loss": 0.4417,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 0.35475405423591955,
|
|
"grad_norm": 1.8718328199464012,
|
|
"learning_rate": 7.479830889449754e-06,
|
|
"loss": 0.4489,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 0.35529237601776464,
|
|
"grad_norm": 1.3987482870509058,
|
|
"learning_rate": 7.472253615921878e-06,
|
|
"loss": 0.5121,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.35529237601776464,
|
|
"eval_loss": 0.4641415774822235,
|
|
"eval_runtime": 1581.4987,
|
|
"eval_samples_per_second": 15.814,
|
|
"eval_steps_per_second": 0.494,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.35583069779960974,
|
|
"grad_norm": 1.5856953831241587,
|
|
"learning_rate": 7.464668819841453e-06,
|
|
"loss": 0.4429,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 0.35636901958145484,
|
|
"grad_norm": 1.648655956667231,
|
|
"learning_rate": 7.457076524287426e-06,
|
|
"loss": 0.4794,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 0.35690734136329993,
|
|
"grad_norm": 1.8056054836187343,
|
|
"learning_rate": 7.4494767523615754e-06,
|
|
"loss": 0.4488,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 0.35744566314514503,
|
|
"grad_norm": 1.7062432057396102,
|
|
"learning_rate": 7.441869527188421e-06,
|
|
"loss": 0.4506,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 0.35798398492699013,
|
|
"grad_norm": 1.4819375518870144,
|
|
"learning_rate": 7.434254871915166e-06,
|
|
"loss": 0.4135,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.3585223067088352,
|
|
"grad_norm": 1.734074823822691,
|
|
"learning_rate": 7.426632809711617e-06,
|
|
"loss": 0.4744,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 0.3590606284906803,
|
|
"grad_norm": 1.5235626105944915,
|
|
"learning_rate": 7.4190033637701216e-06,
|
|
"loss": 0.4646,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 0.3595989502725254,
|
|
"grad_norm": 1.9128329967338416,
|
|
"learning_rate": 7.411366557305495e-06,
|
|
"loss": 0.4626,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 0.3601372720543705,
|
|
"grad_norm": 2.5022708068016097,
|
|
"learning_rate": 7.403722413554947e-06,
|
|
"loss": 0.4959,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 0.3606755938362156,
|
|
"grad_norm": 1.8966801972869858,
|
|
"learning_rate": 7.396070955778013e-06,
|
|
"loss": 0.45,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.3612139156180607,
|
|
"grad_norm": 2.061313497940433,
|
|
"learning_rate": 7.388412207256486e-06,
|
|
"loss": 0.4961,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 0.3617522373999058,
|
|
"grad_norm": 1.6720715956995327,
|
|
"learning_rate": 7.380746191294341e-06,
|
|
"loss": 0.4667,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 0.3622905591817509,
|
|
"grad_norm": 1.5487990630837682,
|
|
"learning_rate": 7.373072931217669e-06,
|
|
"loss": 0.527,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 0.362828880963596,
|
|
"grad_norm": 1.4996736955806738,
|
|
"learning_rate": 7.365392450374598e-06,
|
|
"loss": 0.4353,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 0.3633672027454411,
|
|
"grad_norm": 1.6372189463929279,
|
|
"learning_rate": 7.357704772135231e-06,
|
|
"loss": 0.469,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.3639055245272862,
|
|
"grad_norm": 1.5447454253844684,
|
|
"learning_rate": 7.350009919891574e-06,
|
|
"loss": 0.4278,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 0.3644438463091313,
|
|
"grad_norm": 1.4107385578994651,
|
|
"learning_rate": 7.342307917057457e-06,
|
|
"loss": 0.44,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 0.3649821680909764,
|
|
"grad_norm": 1.4950963156286234,
|
|
"learning_rate": 7.334598787068469e-06,
|
|
"loss": 0.4529,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 0.3655204898728215,
|
|
"grad_norm": 2.047196931688194,
|
|
"learning_rate": 7.326882553381886e-06,
|
|
"loss": 0.4993,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 0.3660588116546666,
|
|
"grad_norm": 1.8078116478641435,
|
|
"learning_rate": 7.319159239476601e-06,
|
|
"loss": 0.4903,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.3665971334365117,
|
|
"grad_norm": 1.6585777335125267,
|
|
"learning_rate": 7.311428868853047e-06,
|
|
"loss": 0.449,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 0.3671354552183568,
|
|
"grad_norm": 1.644551492901717,
|
|
"learning_rate": 7.30369146503313e-06,
|
|
"loss": 0.4359,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 0.3676737770002019,
|
|
"grad_norm": 1.566051715226832,
|
|
"learning_rate": 7.29594705156016e-06,
|
|
"loss": 0.5171,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 0.36821209878204697,
|
|
"grad_norm": 1.860361723636211,
|
|
"learning_rate": 7.288195651998772e-06,
|
|
"loss": 0.5058,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 0.36875042056389207,
|
|
"grad_norm": 1.479824820585221,
|
|
"learning_rate": 7.280437289934858e-06,
|
|
"loss": 0.5082,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.36928874234573716,
|
|
"grad_norm": 1.5621912841951935,
|
|
"learning_rate": 7.272671988975499e-06,
|
|
"loss": 0.4861,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 0.36982706412758226,
|
|
"grad_norm": 1.6260728405178757,
|
|
"learning_rate": 7.264899772748889e-06,
|
|
"loss": 0.5003,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 0.37036538590942736,
|
|
"grad_norm": 1.5646367035382582,
|
|
"learning_rate": 7.2571206649042584e-06,
|
|
"loss": 0.4559,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 0.37090370769127246,
|
|
"grad_norm": 1.7472551729015091,
|
|
"learning_rate": 7.249334689111814e-06,
|
|
"loss": 0.4541,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 0.37144202947311755,
|
|
"grad_norm": 1.6362939723396042,
|
|
"learning_rate": 7.241541869062656e-06,
|
|
"loss": 0.4733,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.37198035125496265,
|
|
"grad_norm": 1.4710809281537391,
|
|
"learning_rate": 7.2337422284687135e-06,
|
|
"loss": 0.4523,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 0.37251867303680775,
|
|
"grad_norm": 1.6849371563467512,
|
|
"learning_rate": 7.225935791062665e-06,
|
|
"loss": 0.4976,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 0.37305699481865284,
|
|
"grad_norm": 1.7850003378424297,
|
|
"learning_rate": 7.2181225805978745e-06,
|
|
"loss": 0.4482,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 0.37359531660049794,
|
|
"grad_norm": 2.355398835881447,
|
|
"learning_rate": 7.210302620848315e-06,
|
|
"loss": 0.4599,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 0.37413363838234304,
|
|
"grad_norm": 1.617194741699657,
|
|
"learning_rate": 7.20247593560849e-06,
|
|
"loss": 0.4543,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.37467196016418813,
|
|
"grad_norm": 1.4733355105927,
|
|
"learning_rate": 7.1946425486933755e-06,
|
|
"loss": 0.4125,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 0.37521028194603323,
|
|
"grad_norm": 1.4512303803275823,
|
|
"learning_rate": 7.186802483938333e-06,
|
|
"loss": 0.4515,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 0.3757486037278783,
|
|
"grad_norm": 1.4829224037632613,
|
|
"learning_rate": 7.178955765199048e-06,
|
|
"loss": 0.475,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 0.3762869255097234,
|
|
"grad_norm": 1.4882203445110318,
|
|
"learning_rate": 7.171102416351448e-06,
|
|
"loss": 0.4485,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 0.3768252472915685,
|
|
"grad_norm": 1.6613200067557963,
|
|
"learning_rate": 7.163242461291639e-06,
|
|
"loss": 0.4402,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.3773635690734136,
|
|
"grad_norm": 1.7483634690103926,
|
|
"learning_rate": 7.155375923935826e-06,
|
|
"loss": 0.4936,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 0.3779018908552587,
|
|
"grad_norm": 1.6616671629226913,
|
|
"learning_rate": 7.14750282822024e-06,
|
|
"loss": 0.4644,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 0.3784402126371038,
|
|
"grad_norm": 1.5260208283942596,
|
|
"learning_rate": 7.139623198101073e-06,
|
|
"loss": 0.489,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 0.3789785344189489,
|
|
"grad_norm": 1.361965813750003,
|
|
"learning_rate": 7.131737057554399e-06,
|
|
"loss": 0.3901,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 0.379516856200794,
|
|
"grad_norm": 1.620874046214403,
|
|
"learning_rate": 7.1238444305760975e-06,
|
|
"loss": 0.458,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.3800551779826391,
|
|
"grad_norm": 1.7744718469804224,
|
|
"learning_rate": 7.115945341181789e-06,
|
|
"loss": 0.4585,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 0.3805934997644842,
|
|
"grad_norm": 1.4959797567409379,
|
|
"learning_rate": 7.108039813406755e-06,
|
|
"loss": 0.4497,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 0.3811318215463293,
|
|
"grad_norm": 1.645088668489625,
|
|
"learning_rate": 7.10012787130587e-06,
|
|
"loss": 0.4419,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 0.3816701433281744,
|
|
"grad_norm": 1.5908205648141605,
|
|
"learning_rate": 7.092209538953527e-06,
|
|
"loss": 0.4768,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 0.3822084651100195,
|
|
"grad_norm": 1.2865059891101038,
|
|
"learning_rate": 7.0842848404435574e-06,
|
|
"loss": 0.4432,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.3827467868918646,
|
|
"grad_norm": 1.438686585698748,
|
|
"learning_rate": 7.07635379988917e-06,
|
|
"loss": 0.463,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 0.3832851086737097,
|
|
"grad_norm": 1.5810030390346108,
|
|
"learning_rate": 7.068416441422867e-06,
|
|
"loss": 0.4324,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 0.3838234304555548,
|
|
"grad_norm": 1.8920886247581228,
|
|
"learning_rate": 7.060472789196378e-06,
|
|
"loss": 0.4513,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 0.3843617522373999,
|
|
"grad_norm": 1.4721512319324748,
|
|
"learning_rate": 7.052522867380578e-06,
|
|
"loss": 0.4794,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 0.38490007401924503,
|
|
"grad_norm": 1.8748283518664401,
|
|
"learning_rate": 7.044566700165426e-06,
|
|
"loss": 0.5359,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.38543839580109013,
|
|
"grad_norm": 2.1664339926414247,
|
|
"learning_rate": 7.036604311759879e-06,
|
|
"loss": 0.4696,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 0.3859767175829352,
|
|
"grad_norm": 1.599064767192068,
|
|
"learning_rate": 7.028635726391826e-06,
|
|
"loss": 0.5009,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 0.3865150393647803,
|
|
"grad_norm": 1.658951664965314,
|
|
"learning_rate": 7.020660968308011e-06,
|
|
"loss": 0.526,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 0.3870533611466254,
|
|
"grad_norm": 1.5566803387570707,
|
|
"learning_rate": 7.012680061773962e-06,
|
|
"loss": 0.4944,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 0.3875916829284705,
|
|
"grad_norm": 1.5561052872784167,
|
|
"learning_rate": 7.0046930310739145e-06,
|
|
"loss": 0.4023,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.3875916829284705,
|
|
"eval_loss": 0.4598337709903717,
|
|
"eval_runtime": 1512.3789,
|
|
"eval_samples_per_second": 16.537,
|
|
"eval_steps_per_second": 0.517,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.3881300047103156,
|
|
"grad_norm": 1.5343444055056177,
|
|
"learning_rate": 6.996699900510736e-06,
|
|
"loss": 0.4661,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 0.3886683264921607,
|
|
"grad_norm": 1.5835711750557553,
|
|
"learning_rate": 6.988700694405861e-06,
|
|
"loss": 0.5243,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 0.3892066482740058,
|
|
"grad_norm": 1.739458700941234,
|
|
"learning_rate": 6.980695437099203e-06,
|
|
"loss": 0.468,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 0.3897449700558509,
|
|
"grad_norm": 1.4597418259308022,
|
|
"learning_rate": 6.972684152949095e-06,
|
|
"loss": 0.4312,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 0.390283291837696,
|
|
"grad_norm": 1.4822140659700849,
|
|
"learning_rate": 6.964666866332202e-06,
|
|
"loss": 0.4171,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.3908216136195411,
|
|
"grad_norm": 2.219448742321713,
|
|
"learning_rate": 6.956643601643459e-06,
|
|
"loss": 0.4682,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 0.3913599354013862,
|
|
"grad_norm": 1.6249675680199915,
|
|
"learning_rate": 6.948614383295988e-06,
|
|
"loss": 0.467,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 0.3918982571832313,
|
|
"grad_norm": 2.5331886913847916,
|
|
"learning_rate": 6.940579235721027e-06,
|
|
"loss": 0.5046,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 0.3924365789650764,
|
|
"grad_norm": 1.651989792055275,
|
|
"learning_rate": 6.932538183367854e-06,
|
|
"loss": 0.4432,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 0.3929749007469215,
|
|
"grad_norm": 1.4451051204854284,
|
|
"learning_rate": 6.924491250703716e-06,
|
|
"loss": 0.436,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.3935132225287666,
|
|
"grad_norm": 1.6726948542569147,
|
|
"learning_rate": 6.916438462213756e-06,
|
|
"loss": 0.4701,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 0.3940515443106117,
|
|
"grad_norm": 1.3458270610890806,
|
|
"learning_rate": 6.908379842400926e-06,
|
|
"loss": 0.461,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 0.3945898660924568,
|
|
"grad_norm": 1.8671906958135296,
|
|
"learning_rate": 6.90031541578593e-06,
|
|
"loss": 0.4621,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 0.3951281878743019,
|
|
"grad_norm": 1.6937643401491398,
|
|
"learning_rate": 6.892245206907136e-06,
|
|
"loss": 0.4403,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 0.39566650965614697,
|
|
"grad_norm": 1.6011629978962008,
|
|
"learning_rate": 6.88416924032051e-06,
|
|
"loss": 0.4832,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.39620483143799207,
|
|
"grad_norm": 1.7023847640279732,
|
|
"learning_rate": 6.876087540599532e-06,
|
|
"loss": 0.4871,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 0.39674315321983716,
|
|
"grad_norm": 1.5639503808317925,
|
|
"learning_rate": 6.868000132335132e-06,
|
|
"loss": 0.504,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 0.39728147500168226,
|
|
"grad_norm": 1.6209519657967315,
|
|
"learning_rate": 6.859907040135609e-06,
|
|
"loss": 0.4947,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 0.39781979678352736,
|
|
"grad_norm": 1.4902231086791655,
|
|
"learning_rate": 6.851808288626554e-06,
|
|
"loss": 0.4329,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 0.39835811856537245,
|
|
"grad_norm": 1.4751989923406863,
|
|
"learning_rate": 6.843703902450781e-06,
|
|
"loss": 0.469,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.39889644034721755,
|
|
"grad_norm": 1.7318655949983495,
|
|
"learning_rate": 6.8355939062682485e-06,
|
|
"loss": 0.4646,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 0.39943476212906265,
|
|
"grad_norm": 2.0477062374958312,
|
|
"learning_rate": 6.827478324755986e-06,
|
|
"loss": 0.4527,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 0.39997308391090775,
|
|
"grad_norm": 1.5357049173396753,
|
|
"learning_rate": 6.819357182608014e-06,
|
|
"loss": 0.4119,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 0.40051140569275284,
|
|
"grad_norm": 1.6669074072618764,
|
|
"learning_rate": 6.811230504535276e-06,
|
|
"loss": 0.4123,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 0.40104972747459794,
|
|
"grad_norm": 2.0238793916536095,
|
|
"learning_rate": 6.803098315265563e-06,
|
|
"loss": 0.4607,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.40158804925644304,
|
|
"grad_norm": 1.7302550872159141,
|
|
"learning_rate": 6.7949606395434294e-06,
|
|
"loss": 0.5252,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 0.40212637103828813,
|
|
"grad_norm": 1.5575167275155066,
|
|
"learning_rate": 6.786817502130127e-06,
|
|
"loss": 0.4484,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 0.40266469282013323,
|
|
"grad_norm": 1.3960320100955355,
|
|
"learning_rate": 6.778668927803526e-06,
|
|
"loss": 0.444,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 0.4032030146019783,
|
|
"grad_norm": 1.5537207671933355,
|
|
"learning_rate": 6.770514941358041e-06,
|
|
"loss": 0.4522,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 0.4037413363838234,
|
|
"grad_norm": 1.6191186519608955,
|
|
"learning_rate": 6.762355567604553e-06,
|
|
"loss": 0.489,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.4042796581656685,
|
|
"grad_norm": 1.7320364851332162,
|
|
"learning_rate": 6.7541908313703355e-06,
|
|
"loss": 0.4746,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 0.4048179799475136,
|
|
"grad_norm": 1.5268044530623444,
|
|
"learning_rate": 6.746020757498979e-06,
|
|
"loss": 0.4138,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 0.4053563017293587,
|
|
"grad_norm": 1.522928297135606,
|
|
"learning_rate": 6.737845370850317e-06,
|
|
"loss": 0.4938,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 0.4058946235112038,
|
|
"grad_norm": 1.567608770456755,
|
|
"learning_rate": 6.729664696300347e-06,
|
|
"loss": 0.4745,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 0.4064329452930489,
|
|
"grad_norm": 1.5048680773669196,
|
|
"learning_rate": 6.721478758741155e-06,
|
|
"loss": 0.4714,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.406971267074894,
|
|
"grad_norm": 1.7508536934704277,
|
|
"learning_rate": 6.713287583080845e-06,
|
|
"loss": 0.4778,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 0.4075095888567391,
|
|
"grad_norm": 1.6217945250756625,
|
|
"learning_rate": 6.70509119424346e-06,
|
|
"loss": 0.4529,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 0.4080479106385842,
|
|
"grad_norm": 1.6092594479977214,
|
|
"learning_rate": 6.696889617168897e-06,
|
|
"loss": 0.4674,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 0.4085862324204293,
|
|
"grad_norm": 1.5153766468742507,
|
|
"learning_rate": 6.688682876812851e-06,
|
|
"loss": 0.4612,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 0.4091245542022744,
|
|
"grad_norm": 1.6200362705011053,
|
|
"learning_rate": 6.6804709981467195e-06,
|
|
"loss": 0.4812,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.4096628759841195,
|
|
"grad_norm": 1.6047382022765324,
|
|
"learning_rate": 6.672254006157541e-06,
|
|
"loss": 0.4758,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 0.4102011977659646,
|
|
"grad_norm": 1.8520426373676713,
|
|
"learning_rate": 6.664031925847908e-06,
|
|
"loss": 0.4184,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 0.4107395195478097,
|
|
"grad_norm": 2.2658987317474195,
|
|
"learning_rate": 6.6558047822358975e-06,
|
|
"loss": 0.5178,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 0.4112778413296548,
|
|
"grad_norm": 1.580321228406977,
|
|
"learning_rate": 6.6475726003549934e-06,
|
|
"loss": 0.4249,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 0.4118161631114999,
|
|
"grad_norm": 1.4077736219835957,
|
|
"learning_rate": 6.639335405254008e-06,
|
|
"loss": 0.4586,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.412354484893345,
|
|
"grad_norm": 1.5112139801178681,
|
|
"learning_rate": 6.631093221997012e-06,
|
|
"loss": 0.4316,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 0.41289280667519007,
|
|
"grad_norm": 1.4529648200398257,
|
|
"learning_rate": 6.6228460756632496e-06,
|
|
"loss": 0.4571,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 0.41343112845703517,
|
|
"grad_norm": 1.826148495373045,
|
|
"learning_rate": 6.61459399134707e-06,
|
|
"loss": 0.4278,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 0.41396945023888027,
|
|
"grad_norm": 1.5179851185666227,
|
|
"learning_rate": 6.6063369941578445e-06,
|
|
"loss": 0.4622,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 0.41450777202072536,
|
|
"grad_norm": 1.3529363726674315,
|
|
"learning_rate": 6.5980751092198955e-06,
|
|
"loss": 0.4215,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.41504609380257046,
|
|
"grad_norm": 2.0731132539429944,
|
|
"learning_rate": 6.589808361672417e-06,
|
|
"loss": 0.484,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 0.4155844155844156,
|
|
"grad_norm": 1.4870501106627148,
|
|
"learning_rate": 6.581536776669402e-06,
|
|
"loss": 0.4863,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 0.4161227373662607,
|
|
"grad_norm": 1.9062099501037697,
|
|
"learning_rate": 6.5732603793795535e-06,
|
|
"loss": 0.4238,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 0.4166610591481058,
|
|
"grad_norm": 1.5565227999579219,
|
|
"learning_rate": 6.564979194986229e-06,
|
|
"loss": 0.4524,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 0.4171993809299509,
|
|
"grad_norm": 2.306172957615922,
|
|
"learning_rate": 6.5566932486873455e-06,
|
|
"loss": 0.4964,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.417737702711796,
|
|
"grad_norm": 1.401583156601946,
|
|
"learning_rate": 6.54840256569531e-06,
|
|
"loss": 0.4304,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 0.4182760244936411,
|
|
"grad_norm": 1.749412909981746,
|
|
"learning_rate": 6.540107171236943e-06,
|
|
"loss": 0.4844,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 0.4188143462754862,
|
|
"grad_norm": 1.6322807652870075,
|
|
"learning_rate": 6.531807090553402e-06,
|
|
"loss": 0.4853,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 0.4193526680573313,
|
|
"grad_norm": 1.2479234535295218,
|
|
"learning_rate": 6.5235023489001046e-06,
|
|
"loss": 0.4491,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 0.4198909898391764,
|
|
"grad_norm": 1.5833625576839316,
|
|
"learning_rate": 6.515192971546645e-06,
|
|
"loss": 0.4171,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.4198909898391764,
|
|
"eval_loss": 0.4564184546470642,
|
|
"eval_runtime": 1517.3821,
|
|
"eval_samples_per_second": 16.482,
|
|
"eval_steps_per_second": 0.515,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.4204293116210215,
|
|
"grad_norm": 1.5809122747897906,
|
|
"learning_rate": 6.50687898377673e-06,
|
|
"loss": 0.4087,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 0.4209676334028666,
|
|
"grad_norm": 1.5387429096209948,
|
|
"learning_rate": 6.49856041088809e-06,
|
|
"loss": 0.4414,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 0.4215059551847117,
|
|
"grad_norm": 1.6020701369523538,
|
|
"learning_rate": 6.49023727819241e-06,
|
|
"loss": 0.4237,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 0.4220442769665568,
|
|
"grad_norm": 1.6896383664306511,
|
|
"learning_rate": 6.481909611015249e-06,
|
|
"loss": 0.5049,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 0.42258259874840187,
|
|
"grad_norm": 1.4623261927757227,
|
|
"learning_rate": 6.47357743469596e-06,
|
|
"loss": 0.4513,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.42312092053024697,
|
|
"grad_norm": 1.8063028002015338,
|
|
"learning_rate": 6.465240774587623e-06,
|
|
"loss": 0.4917,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 0.42365924231209207,
|
|
"grad_norm": 1.639390083578586,
|
|
"learning_rate": 6.4568996560569515e-06,
|
|
"loss": 0.4578,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 0.42419756409393716,
|
|
"grad_norm": 1.337761070121856,
|
|
"learning_rate": 6.448554104484236e-06,
|
|
"loss": 0.4523,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 0.42473588587578226,
|
|
"grad_norm": 1.518872556678575,
|
|
"learning_rate": 6.44020414526325e-06,
|
|
"loss": 0.4384,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 0.42527420765762736,
|
|
"grad_norm": 1.491028002743192,
|
|
"learning_rate": 6.431849803801179e-06,
|
|
"loss": 0.451,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.42581252943947245,
|
|
"grad_norm": 2.093042650030991,
|
|
"learning_rate": 6.423491105518542e-06,
|
|
"loss": 0.4656,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 0.42635085122131755,
|
|
"grad_norm": 1.9063256309499805,
|
|
"learning_rate": 6.415128075849118e-06,
|
|
"loss": 0.4848,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 0.42688917300316265,
|
|
"grad_norm": 1.7660120890204227,
|
|
"learning_rate": 6.4067607402398625e-06,
|
|
"loss": 0.4451,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 0.42742749478500774,
|
|
"grad_norm": 1.577961253859089,
|
|
"learning_rate": 6.398389124150832e-06,
|
|
"loss": 0.485,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 0.42796581656685284,
|
|
"grad_norm": 1.6746798086361996,
|
|
"learning_rate": 6.3900132530551125e-06,
|
|
"loss": 0.4521,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.42850413834869794,
|
|
"grad_norm": 1.696615006593536,
|
|
"learning_rate": 6.381633152438733e-06,
|
|
"loss": 0.4406,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 0.42904246013054304,
|
|
"grad_norm": 3.213801364228645,
|
|
"learning_rate": 6.373248847800595e-06,
|
|
"loss": 0.5115,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 0.42958078191238813,
|
|
"grad_norm": 1.719986070739237,
|
|
"learning_rate": 6.364860364652388e-06,
|
|
"loss": 0.4237,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 0.43011910369423323,
|
|
"grad_norm": 1.778509802687885,
|
|
"learning_rate": 6.3564677285185196e-06,
|
|
"loss": 0.4568,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 0.4306574254760783,
|
|
"grad_norm": 1.5260126863179546,
|
|
"learning_rate": 6.348070964936032e-06,
|
|
"loss": 0.4337,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.4311957472579234,
|
|
"grad_norm": 1.5937231247097972,
|
|
"learning_rate": 6.339670099454526e-06,
|
|
"loss": 0.4642,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 0.4317340690397685,
|
|
"grad_norm": 2.9535392042792465,
|
|
"learning_rate": 6.3312651576360866e-06,
|
|
"loss": 0.4434,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 0.4322723908216136,
|
|
"grad_norm": 1.49472223900728,
|
|
"learning_rate": 6.322856165055198e-06,
|
|
"loss": 0.4125,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 0.4328107126034587,
|
|
"grad_norm": 2.242176131558003,
|
|
"learning_rate": 6.314443147298675e-06,
|
|
"loss": 0.49,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 0.4333490343853038,
|
|
"grad_norm": 1.681655235385771,
|
|
"learning_rate": 6.306026129965573e-06,
|
|
"loss": 0.4245,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.4338873561671489,
|
|
"grad_norm": 1.5909295811480582,
|
|
"learning_rate": 6.297605138667127e-06,
|
|
"loss": 0.4748,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 0.434425677948994,
|
|
"grad_norm": 1.5145278838582474,
|
|
"learning_rate": 6.289180199026654e-06,
|
|
"loss": 0.4578,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 0.4349639997308391,
|
|
"grad_norm": 1.459737051246134,
|
|
"learning_rate": 6.280751336679495e-06,
|
|
"loss": 0.4637,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 0.4355023215126842,
|
|
"grad_norm": 1.6191142290587295,
|
|
"learning_rate": 6.2723185772729166e-06,
|
|
"loss": 0.4582,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 0.4360406432945293,
|
|
"grad_norm": 2.0040844342157422,
|
|
"learning_rate": 6.263881946466049e-06,
|
|
"loss": 0.4783,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.4365789650763744,
|
|
"grad_norm": 1.7322826082498741,
|
|
"learning_rate": 6.255441469929804e-06,
|
|
"loss": 0.5002,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 0.4371172868582195,
|
|
"grad_norm": 1.4894619670010198,
|
|
"learning_rate": 6.2469971733467925e-06,
|
|
"loss": 0.4253,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 0.4376556086400646,
|
|
"grad_norm": 1.6488111913669299,
|
|
"learning_rate": 6.238549082411247e-06,
|
|
"loss": 0.4539,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 0.4381939304219097,
|
|
"grad_norm": 1.3488898562178637,
|
|
"learning_rate": 6.230097222828949e-06,
|
|
"loss": 0.4623,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 0.4387322522037548,
|
|
"grad_norm": 1.6423043283763479,
|
|
"learning_rate": 6.221641620317147e-06,
|
|
"loss": 0.4921,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.4392705739855999,
|
|
"grad_norm": 1.9335639612379423,
|
|
"learning_rate": 6.2131823006044756e-06,
|
|
"loss": 0.4453,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 0.439808895767445,
|
|
"grad_norm": 1.389152591337612,
|
|
"learning_rate": 6.2047192894308815e-06,
|
|
"loss": 0.4413,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 0.44034721754929007,
|
|
"grad_norm": 1.983305422880984,
|
|
"learning_rate": 6.196252612547545e-06,
|
|
"loss": 0.5093,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 0.44088553933113517,
|
|
"grad_norm": 2.053814295705837,
|
|
"learning_rate": 6.187782295716802e-06,
|
|
"loss": 0.4381,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 0.44142386111298026,
|
|
"grad_norm": 1.547864349515979,
|
|
"learning_rate": 6.179308364712056e-06,
|
|
"loss": 0.4932,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.44196218289482536,
|
|
"grad_norm": 1.4111506897228125,
|
|
"learning_rate": 6.170830845317717e-06,
|
|
"loss": 0.4695,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 0.44250050467667046,
|
|
"grad_norm": 2.5994615269947485,
|
|
"learning_rate": 6.162349763329109e-06,
|
|
"loss": 0.5318,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 0.44303882645851556,
|
|
"grad_norm": 1.5802737203663468,
|
|
"learning_rate": 6.153865144552398e-06,
|
|
"loss": 0.4676,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 0.44357714824036065,
|
|
"grad_norm": 1.4711770748421387,
|
|
"learning_rate": 6.145377014804509e-06,
|
|
"loss": 0.4687,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 0.44411547002220575,
|
|
"grad_norm": 1.3383114582462243,
|
|
"learning_rate": 6.136885399913052e-06,
|
|
"loss": 0.4514,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.44465379180405085,
|
|
"grad_norm": 1.375700143244168,
|
|
"learning_rate": 6.1283903257162434e-06,
|
|
"loss": 0.4581,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 0.44519211358589594,
|
|
"grad_norm": 1.6933351988143874,
|
|
"learning_rate": 6.119891818062822e-06,
|
|
"loss": 0.4399,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 0.44573043536774104,
|
|
"grad_norm": 1.4137670063234855,
|
|
"learning_rate": 6.1113899028119764e-06,
|
|
"loss": 0.4298,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 0.4462687571495862,
|
|
"grad_norm": 1.8781325581931287,
|
|
"learning_rate": 6.102884605833262e-06,
|
|
"loss": 0.4921,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 0.4468070789314313,
|
|
"grad_norm": 1.5329498351981126,
|
|
"learning_rate": 6.094375953006527e-06,
|
|
"loss": 0.4518,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.4473454007132764,
|
|
"grad_norm": 1.6692806133274172,
|
|
"learning_rate": 6.085863970221827e-06,
|
|
"loss": 0.5337,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 0.4478837224951215,
|
|
"grad_norm": 1.5092683621943173,
|
|
"learning_rate": 6.077348683379351e-06,
|
|
"loss": 0.4578,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 0.4484220442769666,
|
|
"grad_norm": 1.6510945855973929,
|
|
"learning_rate": 6.068830118389345e-06,
|
|
"loss": 0.479,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 0.4489603660588117,
|
|
"grad_norm": 2.639396623007194,
|
|
"learning_rate": 6.060308301172026e-06,
|
|
"loss": 0.451,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 0.4494986878406568,
|
|
"grad_norm": 1.8709014826106682,
|
|
"learning_rate": 6.051783257657508e-06,
|
|
"loss": 0.5109,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.45003700962250187,
|
|
"grad_norm": 2.1325245569205284,
|
|
"learning_rate": 6.04325501378572e-06,
|
|
"loss": 0.4874,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 0.45057533140434697,
|
|
"grad_norm": 1.4972184191802396,
|
|
"learning_rate": 6.034723595506334e-06,
|
|
"loss": 0.4671,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 0.45111365318619207,
|
|
"grad_norm": 1.3179174814289414,
|
|
"learning_rate": 6.026189028778675e-06,
|
|
"loss": 0.4078,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 0.45165197496803716,
|
|
"grad_norm": 1.521198968359238,
|
|
"learning_rate": 6.017651339571652e-06,
|
|
"loss": 0.4456,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 0.45219029674988226,
|
|
"grad_norm": 1.4836797423023151,
|
|
"learning_rate": 6.009110553863674e-06,
|
|
"loss": 0.4497,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.45219029674988226,
|
|
"eval_loss": 0.4534289836883545,
|
|
"eval_runtime": 1525.9354,
|
|
"eval_samples_per_second": 16.39,
|
|
"eval_steps_per_second": 0.512,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.45272861853172736,
|
|
"grad_norm": 1.808617433298175,
|
|
"learning_rate": 6.000566697642575e-06,
|
|
"loss": 0.435,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 0.45326694031357245,
|
|
"grad_norm": 2.008290454012663,
|
|
"learning_rate": 5.992019796905524e-06,
|
|
"loss": 0.4626,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 0.45380526209541755,
|
|
"grad_norm": 1.7710157949578111,
|
|
"learning_rate": 5.9834698776589614e-06,
|
|
"loss": 0.4311,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 0.45434358387726265,
|
|
"grad_norm": 1.6230775011015806,
|
|
"learning_rate": 5.9749169659185104e-06,
|
|
"loss": 0.4693,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 0.45488190565910774,
|
|
"grad_norm": 1.3639464284433171,
|
|
"learning_rate": 5.966361087708898e-06,
|
|
"loss": 0.4658,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.45542022744095284,
|
|
"grad_norm": 1.8137146027163404,
|
|
"learning_rate": 5.957802269063878e-06,
|
|
"loss": 0.4567,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 0.45595854922279794,
|
|
"grad_norm": 1.6758956331351547,
|
|
"learning_rate": 5.949240536026153e-06,
|
|
"loss": 0.467,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 0.45649687100464303,
|
|
"grad_norm": 1.5131926980070547,
|
|
"learning_rate": 5.940675914647293e-06,
|
|
"loss": 0.4106,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 0.45703519278648813,
|
|
"grad_norm": 1.5046633719884865,
|
|
"learning_rate": 5.9321084309876555e-06,
|
|
"loss": 0.4282,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 0.45757351456833323,
|
|
"grad_norm": 1.6481158877878923,
|
|
"learning_rate": 5.923538111116307e-06,
|
|
"loss": 0.4414,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.4581118363501783,
|
|
"grad_norm": 2.175705374474076,
|
|
"learning_rate": 5.914964981110944e-06,
|
|
"loss": 0.5038,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 0.4586501581320234,
|
|
"grad_norm": 1.748850851161863,
|
|
"learning_rate": 5.906389067057819e-06,
|
|
"loss": 0.4603,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 0.4591884799138685,
|
|
"grad_norm": 1.5440809581743327,
|
|
"learning_rate": 5.897810395051646e-06,
|
|
"loss": 0.4697,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 0.4597268016957136,
|
|
"grad_norm": 1.5332714275032744,
|
|
"learning_rate": 5.889228991195539e-06,
|
|
"loss": 0.4549,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 0.4602651234775587,
|
|
"grad_norm": 1.6246537267152152,
|
|
"learning_rate": 5.880644881600921e-06,
|
|
"loss": 0.4413,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.4608034452594038,
|
|
"grad_norm": 1.7384003721983572,
|
|
"learning_rate": 5.872058092387449e-06,
|
|
"loss": 0.5178,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 0.4613417670412489,
|
|
"grad_norm": 1.4306474231507047,
|
|
"learning_rate": 5.863468649682933e-06,
|
|
"loss": 0.4584,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 0.461880088823094,
|
|
"grad_norm": 1.7487008875581123,
|
|
"learning_rate": 5.8548765796232565e-06,
|
|
"loss": 0.4775,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 0.4624184106049391,
|
|
"grad_norm": 1.6200058585564832,
|
|
"learning_rate": 5.846281908352299e-06,
|
|
"loss": 0.4718,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 0.4629567323867842,
|
|
"grad_norm": 1.4993582658806037,
|
|
"learning_rate": 5.837684662021856e-06,
|
|
"loss": 0.4367,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.4634950541686293,
|
|
"grad_norm": 1.6215871681690963,
|
|
"learning_rate": 5.829084866791551e-06,
|
|
"loss": 0.4891,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 0.4640333759504744,
|
|
"grad_norm": 1.6479378578126422,
|
|
"learning_rate": 5.820482548828773e-06,
|
|
"loss": 0.4701,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 0.4645716977323195,
|
|
"grad_norm": 1.709497613352161,
|
|
"learning_rate": 5.811877734308583e-06,
|
|
"loss": 0.4314,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 0.4651100195141646,
|
|
"grad_norm": 1.850585526202356,
|
|
"learning_rate": 5.803270449413636e-06,
|
|
"loss": 0.4399,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 0.4656483412960097,
|
|
"grad_norm": 1.4300437023045451,
|
|
"learning_rate": 5.7946607203341075e-06,
|
|
"loss": 0.4434,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.4661866630778548,
|
|
"grad_norm": 1.4799373263095972,
|
|
"learning_rate": 5.786048573267608e-06,
|
|
"loss": 0.4065,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 0.4667249848596999,
|
|
"grad_norm": 1.8869037850434587,
|
|
"learning_rate": 5.777434034419111e-06,
|
|
"loss": 0.4823,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 0.467263306641545,
|
|
"grad_norm": 1.720619241457494,
|
|
"learning_rate": 5.768817130000857e-06,
|
|
"loss": 0.4444,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 0.46780162842339007,
|
|
"grad_norm": 1.3809501342652182,
|
|
"learning_rate": 5.760197886232292e-06,
|
|
"loss": 0.4058,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 0.46833995020523517,
|
|
"grad_norm": 1.6474446895806825,
|
|
"learning_rate": 5.75157632933998e-06,
|
|
"loss": 0.4244,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.46887827198708026,
|
|
"grad_norm": 1.3347455312904397,
|
|
"learning_rate": 5.7429524855575216e-06,
|
|
"loss": 0.4509,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 0.46941659376892536,
|
|
"grad_norm": 2.4700574740497583,
|
|
"learning_rate": 5.7343263811254746e-06,
|
|
"loss": 0.4078,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 0.46995491555077046,
|
|
"grad_norm": 1.6808144924631037,
|
|
"learning_rate": 5.725698042291279e-06,
|
|
"loss": 0.445,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 0.47049323733261555,
|
|
"grad_norm": 1.6561338534624221,
|
|
"learning_rate": 5.717067495309172e-06,
|
|
"loss": 0.4626,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 0.47103155911446065,
|
|
"grad_norm": 1.4357104359447126,
|
|
"learning_rate": 5.708434766440109e-06,
|
|
"loss": 0.4253,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.47156988089630575,
|
|
"grad_norm": 1.5584705980730198,
|
|
"learning_rate": 5.699799881951684e-06,
|
|
"loss": 0.4326,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 0.47210820267815085,
|
|
"grad_norm": 1.6134096232268902,
|
|
"learning_rate": 5.691162868118052e-06,
|
|
"loss": 0.4361,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 0.47264652445999594,
|
|
"grad_norm": 1.4597620039500387,
|
|
"learning_rate": 5.682523751219846e-06,
|
|
"loss": 0.4009,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 0.47318484624184104,
|
|
"grad_norm": 1.6065681327100592,
|
|
"learning_rate": 5.673882557544098e-06,
|
|
"loss": 0.4859,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 0.47372316802368614,
|
|
"grad_norm": 1.5207533993363942,
|
|
"learning_rate": 5.665239313384161e-06,
|
|
"loss": 0.4281,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.47426148980553123,
|
|
"grad_norm": 1.4714029139534557,
|
|
"learning_rate": 5.656594045039623e-06,
|
|
"loss": 0.4364,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 0.47479981158737633,
|
|
"grad_norm": 1.7055967072229654,
|
|
"learning_rate": 5.647946778816238e-06,
|
|
"loss": 0.5044,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 0.4753381333692214,
|
|
"grad_norm": 1.7261543220071143,
|
|
"learning_rate": 5.639297541025831e-06,
|
|
"loss": 0.486,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 0.4758764551510665,
|
|
"grad_norm": 1.6626927738024924,
|
|
"learning_rate": 5.630646357986232e-06,
|
|
"loss": 0.5142,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 0.4764147769329116,
|
|
"grad_norm": 1.5653946306822688,
|
|
"learning_rate": 5.621993256021188e-06,
|
|
"loss": 0.4364,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.4769530987147568,
|
|
"grad_norm": 1.8026208698346797,
|
|
"learning_rate": 5.613338261460287e-06,
|
|
"loss": 0.4538,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 0.47749142049660187,
|
|
"grad_norm": 1.6799784860946594,
|
|
"learning_rate": 5.6046814006388705e-06,
|
|
"loss": 0.4644,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 0.47802974227844697,
|
|
"grad_norm": 1.4364276865950356,
|
|
"learning_rate": 5.596022699897963e-06,
|
|
"loss": 0.4051,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 0.47856806406029206,
|
|
"grad_norm": 1.6914469502870713,
|
|
"learning_rate": 5.587362185584189e-06,
|
|
"loss": 0.4871,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 0.47910638584213716,
|
|
"grad_norm": 1.4415518156055118,
|
|
"learning_rate": 5.578699884049683e-06,
|
|
"loss": 0.4429,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.47964470762398226,
|
|
"grad_norm": 1.4674935937695475,
|
|
"learning_rate": 5.570035821652029e-06,
|
|
"loss": 0.426,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 0.48018302940582736,
|
|
"grad_norm": 2.1147351198112982,
|
|
"learning_rate": 5.561370024754161e-06,
|
|
"loss": 0.4789,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 0.48072135118767245,
|
|
"grad_norm": 1.4253127193278772,
|
|
"learning_rate": 5.552702519724294e-06,
|
|
"loss": 0.4346,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 0.48125967296951755,
|
|
"grad_norm": 3.7503200169998676,
|
|
"learning_rate": 5.544033332935838e-06,
|
|
"loss": 0.4393,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 0.48179799475136265,
|
|
"grad_norm": 2.1079137772003818,
|
|
"learning_rate": 5.535362490767323e-06,
|
|
"loss": 0.5118,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.48233631653320774,
|
|
"grad_norm": 2.2185325950005477,
|
|
"learning_rate": 5.526690019602315e-06,
|
|
"loss": 0.3894,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 0.48287463831505284,
|
|
"grad_norm": 1.5274617672885367,
|
|
"learning_rate": 5.518015945829337e-06,
|
|
"loss": 0.42,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 0.48341296009689794,
|
|
"grad_norm": 1.622273471984762,
|
|
"learning_rate": 5.509340295841785e-06,
|
|
"loss": 0.5112,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 0.48395128187874303,
|
|
"grad_norm": 1.5776105686627353,
|
|
"learning_rate": 5.500663096037856e-06,
|
|
"loss": 0.4577,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 0.48448960366058813,
|
|
"grad_norm": 1.4494216604414056,
|
|
"learning_rate": 5.491984372820461e-06,
|
|
"loss": 0.4585,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.48448960366058813,
|
|
"eval_loss": 0.4497644305229187,
|
|
"eval_runtime": 1526.5252,
|
|
"eval_samples_per_second": 16.384,
|
|
"eval_steps_per_second": 0.512,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.4850279254424332,
|
|
"grad_norm": 1.5164622603897875,
|
|
"learning_rate": 5.483304152597145e-06,
|
|
"loss": 0.4488,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 0.4855662472242783,
|
|
"grad_norm": 1.5363015107046971,
|
|
"learning_rate": 5.474622461780011e-06,
|
|
"loss": 0.424,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 0.4861045690061234,
|
|
"grad_norm": 1.5955517741757022,
|
|
"learning_rate": 5.465939326785634e-06,
|
|
"loss": 0.4544,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 0.4866428907879685,
|
|
"grad_norm": 1.879614888686265,
|
|
"learning_rate": 5.457254774034983e-06,
|
|
"loss": 0.5032,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 0.4871812125698136,
|
|
"grad_norm": 1.5621620080191398,
|
|
"learning_rate": 5.448568829953344e-06,
|
|
"loss": 0.4675,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.4877195343516587,
|
|
"grad_norm": 1.463009731317384,
|
|
"learning_rate": 5.439881520970234e-06,
|
|
"loss": 0.5112,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 0.4882578561335038,
|
|
"grad_norm": 1.4309448662315376,
|
|
"learning_rate": 5.431192873519326e-06,
|
|
"loss": 0.4532,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 0.4887961779153489,
|
|
"grad_norm": 1.8077348129923718,
|
|
"learning_rate": 5.422502914038359e-06,
|
|
"loss": 0.4498,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 0.489334499697194,
|
|
"grad_norm": 1.770786349097794,
|
|
"learning_rate": 5.413811668969072e-06,
|
|
"loss": 0.5081,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 0.4898728214790391,
|
|
"grad_norm": 1.911624959064584,
|
|
"learning_rate": 5.4051191647571126e-06,
|
|
"loss": 0.4297,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.4904111432608842,
|
|
"grad_norm": 2.238598280094612,
|
|
"learning_rate": 5.396425427851958e-06,
|
|
"loss": 0.4722,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 0.4909494650427293,
|
|
"grad_norm": 1.7184560772593453,
|
|
"learning_rate": 5.387730484706839e-06,
|
|
"loss": 0.4778,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 0.4914877868245744,
|
|
"grad_norm": 1.452205930174256,
|
|
"learning_rate": 5.3790343617786555e-06,
|
|
"loss": 0.4233,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 0.4920261086064195,
|
|
"grad_norm": 1.6315132839706739,
|
|
"learning_rate": 5.3703370855278995e-06,
|
|
"loss": 0.4429,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 0.4925644303882646,
|
|
"grad_norm": 2.1202501474227984,
|
|
"learning_rate": 5.361638682418565e-06,
|
|
"loss": 0.461,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.4931027521701097,
|
|
"grad_norm": 1.4850726589476337,
|
|
"learning_rate": 5.352939178918084e-06,
|
|
"loss": 0.5053,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 0.4936410739519548,
|
|
"grad_norm": 2.5715760460764505,
|
|
"learning_rate": 5.344238601497231e-06,
|
|
"loss": 0.523,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 0.4941793957337999,
|
|
"grad_norm": 1.6641597075498922,
|
|
"learning_rate": 5.335536976630052e-06,
|
|
"loss": 0.4452,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 0.494717717515645,
|
|
"grad_norm": 1.579954501546705,
|
|
"learning_rate": 5.326834330793775e-06,
|
|
"loss": 0.4365,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 0.49525603929749007,
|
|
"grad_norm": 1.8639771696751175,
|
|
"learning_rate": 5.318130690468741e-06,
|
|
"loss": 0.4956,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.49579436107933517,
|
|
"grad_norm": 1.6264721082016091,
|
|
"learning_rate": 5.309426082138311e-06,
|
|
"loss": 0.4592,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 0.49633268286118026,
|
|
"grad_norm": 1.624012882860616,
|
|
"learning_rate": 5.300720532288798e-06,
|
|
"loss": 0.437,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 0.49687100464302536,
|
|
"grad_norm": 1.6131788103239653,
|
|
"learning_rate": 5.29201406740937e-06,
|
|
"loss": 0.4335,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 0.49740932642487046,
|
|
"grad_norm": 1.4350753111666732,
|
|
"learning_rate": 5.28330671399199e-06,
|
|
"loss": 0.4462,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 0.49794764820671555,
|
|
"grad_norm": 1.9075044926150524,
|
|
"learning_rate": 5.274598498531318e-06,
|
|
"loss": 0.5123,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.49848596998856065,
|
|
"grad_norm": 2.2955162228107233,
|
|
"learning_rate": 5.265889447524641e-06,
|
|
"loss": 0.4649,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 0.49902429177040575,
|
|
"grad_norm": 1.8752294916309997,
|
|
"learning_rate": 5.257179587471784e-06,
|
|
"loss": 0.4339,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 0.49956261355225084,
|
|
"grad_norm": 1.776206864828494,
|
|
"learning_rate": 5.248468944875036e-06,
|
|
"loss": 0.4047,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 0.5001009353340959,
|
|
"grad_norm": 1.6863520776370677,
|
|
"learning_rate": 5.239757546239069e-06,
|
|
"loss": 0.4041,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 0.500639257115941,
|
|
"grad_norm": 1.6004117617835396,
|
|
"learning_rate": 5.231045418070852e-06,
|
|
"loss": 0.4026,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.5011775788977861,
|
|
"grad_norm": 1.6497898215404967,
|
|
"learning_rate": 5.222332586879576e-06,
|
|
"loss": 0.4953,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 0.5017159006796312,
|
|
"grad_norm": 1.6264336562152901,
|
|
"learning_rate": 5.2136190791765714e-06,
|
|
"loss": 0.4697,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 0.5022542224614763,
|
|
"grad_norm": 1.4687648507656423,
|
|
"learning_rate": 5.204904921475226e-06,
|
|
"loss": 0.4608,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 0.5027925442433214,
|
|
"grad_norm": 1.555407852307028,
|
|
"learning_rate": 5.196190140290905e-06,
|
|
"loss": 0.4191,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 0.5033308660251665,
|
|
"grad_norm": 1.6926089059266405,
|
|
"learning_rate": 5.1874747621408705e-06,
|
|
"loss": 0.4034,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.5038691878070116,
|
|
"grad_norm": 1.5853166612648868,
|
|
"learning_rate": 5.178758813544203e-06,
|
|
"loss": 0.4288,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 0.5044075095888567,
|
|
"grad_norm": 1.5462488708677307,
|
|
"learning_rate": 5.170042321021721e-06,
|
|
"loss": 0.5049,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 0.5049458313707018,
|
|
"grad_norm": 1.6860561151031408,
|
|
"learning_rate": 5.161325311095889e-06,
|
|
"loss": 0.4673,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 0.5054841531525469,
|
|
"grad_norm": 1.603506680608381,
|
|
"learning_rate": 5.1526078102907565e-06,
|
|
"loss": 0.4613,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 0.506022474934392,
|
|
"grad_norm": 1.7493626988274396,
|
|
"learning_rate": 5.143889845131859e-06,
|
|
"loss": 0.4563,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.5065607967162371,
|
|
"grad_norm": 1.7677497007408356,
|
|
"learning_rate": 5.135171442146147e-06,
|
|
"loss": 0.4389,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 0.5070991184980822,
|
|
"grad_norm": 1.7686507376112643,
|
|
"learning_rate": 5.126452627861906e-06,
|
|
"loss": 0.469,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 0.5076374402799273,
|
|
"grad_norm": 2.03881052798833,
|
|
"learning_rate": 5.117733428808671e-06,
|
|
"loss": 0.473,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 0.5081757620617724,
|
|
"grad_norm": 1.5924723958151055,
|
|
"learning_rate": 5.109013871517148e-06,
|
|
"loss": 0.4449,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 0.5087140838436175,
|
|
"grad_norm": 1.787982594535362,
|
|
"learning_rate": 5.10029398251913e-06,
|
|
"loss": 0.4575,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.5092524056254626,
|
|
"grad_norm": 1.8443122029947836,
|
|
"learning_rate": 5.091573788347424e-06,
|
|
"loss": 0.4825,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 0.5097907274073077,
|
|
"grad_norm": 1.5660114035251782,
|
|
"learning_rate": 5.082853315535764e-06,
|
|
"loss": 0.4705,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 0.5103290491891528,
|
|
"grad_norm": 1.4015195298555256,
|
|
"learning_rate": 5.074132590618731e-06,
|
|
"loss": 0.4222,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 0.5108673709709979,
|
|
"grad_norm": 1.6261999654731143,
|
|
"learning_rate": 5.065411640131672e-06,
|
|
"loss": 0.4172,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 0.511405692752843,
|
|
"grad_norm": 1.6580955314247148,
|
|
"learning_rate": 5.0566904906106254e-06,
|
|
"loss": 0.4803,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.5119440145346881,
|
|
"grad_norm": 1.6882580545035042,
|
|
"learning_rate": 5.047969168592229e-06,
|
|
"loss": 0.4959,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 0.5124823363165332,
|
|
"grad_norm": 1.2734853203083423,
|
|
"learning_rate": 5.039247700613649e-06,
|
|
"loss": 0.4532,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 0.5130206580983783,
|
|
"grad_norm": 1.6598696282615735,
|
|
"learning_rate": 5.030526113212494e-06,
|
|
"loss": 0.4443,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 0.5135589798802234,
|
|
"grad_norm": 1.555381309193185,
|
|
"learning_rate": 5.021804432926739e-06,
|
|
"loss": 0.4704,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 0.5140973016620685,
|
|
"grad_norm": 1.5525351037863324,
|
|
"learning_rate": 5.013082686294639e-06,
|
|
"loss": 0.4373,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.5146356234439136,
|
|
"grad_norm": 1.5575470355469987,
|
|
"learning_rate": 5.00436089985465e-06,
|
|
"loss": 0.4242,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 0.5151739452257587,
|
|
"grad_norm": 1.7457061624641392,
|
|
"learning_rate": 4.995639100145352e-06,
|
|
"loss": 0.4685,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 0.5157122670076038,
|
|
"grad_norm": 1.6284837184280405,
|
|
"learning_rate": 4.9869173137053625e-06,
|
|
"loss": 0.4702,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 0.5162505887894488,
|
|
"grad_norm": 2.191085743474062,
|
|
"learning_rate": 4.978195567073262e-06,
|
|
"loss": 0.5185,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 0.516788910571294,
|
|
"grad_norm": 1.5407588424547343,
|
|
"learning_rate": 4.969473886787507e-06,
|
|
"loss": 0.505,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.516788910571294,
|
|
"eval_loss": 0.44528621435165405,
|
|
"eval_runtime": 1532.2971,
|
|
"eval_samples_per_second": 16.322,
|
|
"eval_steps_per_second": 0.51,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.517327232353139,
|
|
"grad_norm": 1.7214959560480187,
|
|
"learning_rate": 4.960752299386353e-06,
|
|
"loss": 0.4826,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 0.5178655541349841,
|
|
"grad_norm": 1.5649628360297678,
|
|
"learning_rate": 4.9520308314077726e-06,
|
|
"loss": 0.4224,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 0.5184038759168292,
|
|
"grad_norm": 1.6424636557347856,
|
|
"learning_rate": 4.943309509389377e-06,
|
|
"loss": 0.4148,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 0.5189421976986743,
|
|
"grad_norm": 1.98993484637264,
|
|
"learning_rate": 4.934588359868329e-06,
|
|
"loss": 0.4307,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 0.5194805194805194,
|
|
"grad_norm": 2.0804456077787123,
|
|
"learning_rate": 4.92586740938127e-06,
|
|
"loss": 0.4108,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.5200188412623645,
|
|
"grad_norm": 1.748710199317067,
|
|
"learning_rate": 4.917146684464238e-06,
|
|
"loss": 0.4567,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 0.5205571630442096,
|
|
"grad_norm": 1.4755067360374794,
|
|
"learning_rate": 4.908426211652577e-06,
|
|
"loss": 0.4523,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 0.5210954848260547,
|
|
"grad_norm": 1.6340640272431366,
|
|
"learning_rate": 4.899706017480872e-06,
|
|
"loss": 0.4697,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 0.5216338066078998,
|
|
"grad_norm": 1.5338487326156454,
|
|
"learning_rate": 4.890986128482854e-06,
|
|
"loss": 0.4108,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 0.5221721283897449,
|
|
"grad_norm": 1.4204187507894679,
|
|
"learning_rate": 4.88226657119133e-06,
|
|
"loss": 0.4175,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.52271045017159,
|
|
"grad_norm": 1.4916766712552136,
|
|
"learning_rate": 4.873547372138095e-06,
|
|
"loss": 0.4274,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 0.5232487719534352,
|
|
"grad_norm": 1.514306526603469,
|
|
"learning_rate": 4.864828557853854e-06,
|
|
"loss": 0.4745,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 0.5237870937352803,
|
|
"grad_norm": 1.774262113242822,
|
|
"learning_rate": 4.856110154868143e-06,
|
|
"loss": 0.4172,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 0.5243254155171254,
|
|
"grad_norm": 1.4311594537408503,
|
|
"learning_rate": 4.847392189709246e-06,
|
|
"loss": 0.4499,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 0.5248637372989705,
|
|
"grad_norm": 2.045966100772589,
|
|
"learning_rate": 4.8386746889041116e-06,
|
|
"loss": 0.496,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.5254020590808156,
|
|
"grad_norm": 1.3914439869095196,
|
|
"learning_rate": 4.82995767897828e-06,
|
|
"loss": 0.4068,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 0.5259403808626607,
|
|
"grad_norm": 1.3260222946498679,
|
|
"learning_rate": 4.8212411864557975e-06,
|
|
"loss": 0.4344,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 0.5264787026445058,
|
|
"grad_norm": 1.7672350290368148,
|
|
"learning_rate": 4.812525237859131e-06,
|
|
"loss": 0.4647,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 0.5270170244263509,
|
|
"grad_norm": 1.5287264304361414,
|
|
"learning_rate": 4.803809859709097e-06,
|
|
"loss": 0.4406,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 0.527555346208196,
|
|
"grad_norm": 1.5180822455976997,
|
|
"learning_rate": 4.795095078524775e-06,
|
|
"loss": 0.4462,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.5280936679900411,
|
|
"grad_norm": 1.5390017294524125,
|
|
"learning_rate": 4.78638092082343e-06,
|
|
"loss": 0.4427,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 0.5286319897718862,
|
|
"grad_norm": 1.8490518419390272,
|
|
"learning_rate": 4.777667413120425e-06,
|
|
"loss": 0.4716,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 0.5291703115537313,
|
|
"grad_norm": 1.9241747880139426,
|
|
"learning_rate": 4.7689545819291484e-06,
|
|
"loss": 0.4471,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 0.5297086333355764,
|
|
"grad_norm": 1.5723366516079713,
|
|
"learning_rate": 4.760242453760932e-06,
|
|
"loss": 0.3616,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 0.5302469551174215,
|
|
"grad_norm": 2.125474240340618,
|
|
"learning_rate": 4.751531055124965e-06,
|
|
"loss": 0.4567,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.5307852768992666,
|
|
"grad_norm": 1.5872857045985345,
|
|
"learning_rate": 4.742820412528217e-06,
|
|
"loss": 0.4311,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 0.5313235986811117,
|
|
"grad_norm": 1.5991351116825514,
|
|
"learning_rate": 4.73411055247536e-06,
|
|
"loss": 0.4572,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 0.5318619204629568,
|
|
"grad_norm": 1.5620726404348677,
|
|
"learning_rate": 4.725401501468683e-06,
|
|
"loss": 0.4299,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 0.5324002422448019,
|
|
"grad_norm": 1.6599112973852914,
|
|
"learning_rate": 4.716693286008011e-06,
|
|
"loss": 0.4444,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 0.532938564026647,
|
|
"grad_norm": 1.7825302359359856,
|
|
"learning_rate": 4.707985932590631e-06,
|
|
"loss": 0.4321,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.5334768858084921,
|
|
"grad_norm": 1.5739707930921258,
|
|
"learning_rate": 4.699279467711204e-06,
|
|
"loss": 0.4567,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 0.5340152075903372,
|
|
"grad_norm": 1.5857670482566744,
|
|
"learning_rate": 4.69057391786169e-06,
|
|
"loss": 0.4312,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 0.5345535293721823,
|
|
"grad_norm": 1.3615110605746865,
|
|
"learning_rate": 4.68186930953126e-06,
|
|
"loss": 0.376,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 0.5350918511540274,
|
|
"grad_norm": 1.4263273424189502,
|
|
"learning_rate": 4.673165669206226e-06,
|
|
"loss": 0.4424,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 0.5356301729358725,
|
|
"grad_norm": 2.8748098476059933,
|
|
"learning_rate": 4.6644630233699495e-06,
|
|
"loss": 0.4828,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.5361684947177175,
|
|
"grad_norm": 1.7530111025052908,
|
|
"learning_rate": 4.65576139850277e-06,
|
|
"loss": 0.4565,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 0.5367068164995626,
|
|
"grad_norm": 1.625700838321751,
|
|
"learning_rate": 4.647060821081918e-06,
|
|
"loss": 0.4397,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 0.5372451382814077,
|
|
"grad_norm": 1.7382100638812064,
|
|
"learning_rate": 4.638361317581437e-06,
|
|
"loss": 0.4701,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 0.5377834600632528,
|
|
"grad_norm": 2.153555864190946,
|
|
"learning_rate": 4.629662914472103e-06,
|
|
"loss": 0.45,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 0.5383217818450979,
|
|
"grad_norm": 1.6756544006397587,
|
|
"learning_rate": 4.620965638221346e-06,
|
|
"loss": 0.4373,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.538860103626943,
|
|
"grad_norm": 2.115872641463188,
|
|
"learning_rate": 4.612269515293162e-06,
|
|
"loss": 0.4807,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 0.5393984254087881,
|
|
"grad_norm": 1.7162266935661588,
|
|
"learning_rate": 4.603574572148043e-06,
|
|
"loss": 0.4231,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 0.5399367471906332,
|
|
"grad_norm": 1.828685276454168,
|
|
"learning_rate": 4.59488083524289e-06,
|
|
"loss": 0.4405,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 0.5404750689724783,
|
|
"grad_norm": 1.6864896839159536,
|
|
"learning_rate": 4.58618833103093e-06,
|
|
"loss": 0.4144,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 0.5410133907543234,
|
|
"grad_norm": 1.4876643937775926,
|
|
"learning_rate": 4.5774970859616426e-06,
|
|
"loss": 0.4628,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.5415517125361685,
|
|
"grad_norm": 1.5038750034441302,
|
|
"learning_rate": 4.568807126480676e-06,
|
|
"loss": 0.4595,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 0.5420900343180136,
|
|
"grad_norm": 1.3366252716503892,
|
|
"learning_rate": 4.560118479029768e-06,
|
|
"loss": 0.4447,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 0.5426283560998587,
|
|
"grad_norm": 1.5955474786951926,
|
|
"learning_rate": 4.5514311700466575e-06,
|
|
"loss": 0.4731,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 0.5431666778817038,
|
|
"grad_norm": 1.415371321661975,
|
|
"learning_rate": 4.5427452259650185e-06,
|
|
"loss": 0.4565,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 0.5437049996635489,
|
|
"grad_norm": 1.414837591715847,
|
|
"learning_rate": 4.534060673214367e-06,
|
|
"loss": 0.439,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.544243321445394,
|
|
"grad_norm": 1.6390543819341332,
|
|
"learning_rate": 4.525377538219991e-06,
|
|
"loss": 0.4434,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 0.5447816432272391,
|
|
"grad_norm": 1.9027726313032218,
|
|
"learning_rate": 4.516695847402857e-06,
|
|
"loss": 0.4841,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 0.5453199650090842,
|
|
"grad_norm": 1.6549184700101718,
|
|
"learning_rate": 4.50801562717954e-06,
|
|
"loss": 0.4187,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 0.5458582867909293,
|
|
"grad_norm": 1.672495923944031,
|
|
"learning_rate": 4.499336903962146e-06,
|
|
"loss": 0.461,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 0.5463966085727744,
|
|
"grad_norm": 1.9002456572131434,
|
|
"learning_rate": 4.490659704158218e-06,
|
|
"loss": 0.4305,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.5469349303546195,
|
|
"grad_norm": 1.3438622389285284,
|
|
"learning_rate": 4.481984054170666e-06,
|
|
"loss": 0.4569,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 0.5474732521364646,
|
|
"grad_norm": 1.6738782134152472,
|
|
"learning_rate": 4.473309980397686e-06,
|
|
"loss": 0.4574,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 0.5480115739183097,
|
|
"grad_norm": 1.410079098904291,
|
|
"learning_rate": 4.464637509232679e-06,
|
|
"loss": 0.4616,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 0.5485498957001548,
|
|
"grad_norm": 1.5059024241541985,
|
|
"learning_rate": 4.455966667064164e-06,
|
|
"loss": 0.4257,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 0.5490882174819999,
|
|
"grad_norm": 1.8743979543800648,
|
|
"learning_rate": 4.447297480275708e-06,
|
|
"loss": 0.4468,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.5490882174819999,
|
|
"eval_loss": 0.44231292605400085,
|
|
"eval_runtime": 1542.3429,
|
|
"eval_samples_per_second": 16.216,
|
|
"eval_steps_per_second": 0.507,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.549626539263845,
|
|
"grad_norm": 2.326652305551719,
|
|
"learning_rate": 4.4386299752458405e-06,
|
|
"loss": 0.5123,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 0.5501648610456901,
|
|
"grad_norm": 1.5214313173590028,
|
|
"learning_rate": 4.429964178347973e-06,
|
|
"loss": 0.4525,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 0.5507031828275352,
|
|
"grad_norm": 1.578588355929213,
|
|
"learning_rate": 4.4213001159503185e-06,
|
|
"loss": 0.4511,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 0.5512415046093803,
|
|
"grad_norm": 1.5736153928065848,
|
|
"learning_rate": 4.4126378144158145e-06,
|
|
"loss": 0.402,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 0.5517798263912254,
|
|
"grad_norm": 1.4881049360513776,
|
|
"learning_rate": 4.4039773001020394e-06,
|
|
"loss": 0.4312,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.5523181481730705,
|
|
"grad_norm": 1.5453517436989277,
|
|
"learning_rate": 4.395318599361133e-06,
|
|
"loss": 0.4297,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 0.5528564699549156,
|
|
"grad_norm": 1.7401645944762647,
|
|
"learning_rate": 4.386661738539716e-06,
|
|
"loss": 0.4021,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 0.5533947917367606,
|
|
"grad_norm": 1.6594295806955806,
|
|
"learning_rate": 4.3780067439788125e-06,
|
|
"loss": 0.3936,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 0.5539331135186057,
|
|
"grad_norm": 1.4018911995650016,
|
|
"learning_rate": 4.3693536420137704e-06,
|
|
"loss": 0.4208,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 0.5544714353004508,
|
|
"grad_norm": 1.554369257290078,
|
|
"learning_rate": 4.360702458974172e-06,
|
|
"loss": 0.3869,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.5550097570822959,
|
|
"grad_norm": 1.7013778785431986,
|
|
"learning_rate": 4.3520532211837645e-06,
|
|
"loss": 0.4557,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 0.555548078864141,
|
|
"grad_norm": 1.5141795112180816,
|
|
"learning_rate": 4.343405954960378e-06,
|
|
"loss": 0.437,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 0.5560864006459861,
|
|
"grad_norm": 1.6876343830074998,
|
|
"learning_rate": 4.334760686615842e-06,
|
|
"loss": 0.4632,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 0.5566247224278312,
|
|
"grad_norm": 1.7137409506750598,
|
|
"learning_rate": 4.326117442455904e-06,
|
|
"loss": 0.451,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 0.5571630442096763,
|
|
"grad_norm": 2.2054388725094993,
|
|
"learning_rate": 4.3174762487801554e-06,
|
|
"loss": 0.4845,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.5577013659915214,
|
|
"grad_norm": 1.4514781472802996,
|
|
"learning_rate": 4.30883713188195e-06,
|
|
"loss": 0.4713,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 0.5582396877733665,
|
|
"grad_norm": 1.3155208362445518,
|
|
"learning_rate": 4.300200118048318e-06,
|
|
"loss": 0.4048,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 0.5587780095552116,
|
|
"grad_norm": 1.7594624250292574,
|
|
"learning_rate": 4.291565233559893e-06,
|
|
"loss": 0.4719,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 0.5593163313370567,
|
|
"grad_norm": 1.5899320924503517,
|
|
"learning_rate": 4.282932504690829e-06,
|
|
"loss": 0.4889,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 0.5598546531189018,
|
|
"grad_norm": 1.5400899090595648,
|
|
"learning_rate": 4.274301957708723e-06,
|
|
"loss": 0.48,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.5603929749007469,
|
|
"grad_norm": 1.9340975529821163,
|
|
"learning_rate": 4.265673618874527e-06,
|
|
"loss": 0.4558,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 0.560931296682592,
|
|
"grad_norm": 1.1875057467361612,
|
|
"learning_rate": 4.257047514442481e-06,
|
|
"loss": 0.4308,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 0.5614696184644371,
|
|
"grad_norm": 1.7255919834039524,
|
|
"learning_rate": 4.248423670660022e-06,
|
|
"loss": 0.4637,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 0.5620079402462822,
|
|
"grad_norm": 1.552937296818888,
|
|
"learning_rate": 4.239802113767711e-06,
|
|
"loss": 0.5167,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 0.5625462620281273,
|
|
"grad_norm": 1.4241418668403774,
|
|
"learning_rate": 4.231182869999146e-06,
|
|
"loss": 0.4262,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.5630845838099724,
|
|
"grad_norm": 1.4079020132555902,
|
|
"learning_rate": 4.222565965580892e-06,
|
|
"loss": 0.4527,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 0.5636229055918175,
|
|
"grad_norm": 1.3617602268653886,
|
|
"learning_rate": 4.2139514267323925e-06,
|
|
"loss": 0.4546,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 0.5641612273736626,
|
|
"grad_norm": 1.5838734348735288,
|
|
"learning_rate": 4.205339279665895e-06,
|
|
"loss": 0.3903,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 0.5646995491555077,
|
|
"grad_norm": 1.451984176062728,
|
|
"learning_rate": 4.196729550586367e-06,
|
|
"loss": 0.4211,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 0.5652378709373528,
|
|
"grad_norm": 1.5454288468811321,
|
|
"learning_rate": 4.18812226569142e-06,
|
|
"loss": 0.3856,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.5657761927191979,
|
|
"grad_norm": 1.6143068691418476,
|
|
"learning_rate": 4.17951745117123e-06,
|
|
"loss": 0.4137,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 0.566314514501043,
|
|
"grad_norm": 1.5780823976901985,
|
|
"learning_rate": 4.170915133208452e-06,
|
|
"loss": 0.4402,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 0.5668528362828881,
|
|
"grad_norm": 1.4482990847613153,
|
|
"learning_rate": 4.162315337978148e-06,
|
|
"loss": 0.5056,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 0.5673911580647332,
|
|
"grad_norm": 1.534829858260644,
|
|
"learning_rate": 4.153718091647702e-06,
|
|
"loss": 0.4212,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 0.5679294798465783,
|
|
"grad_norm": 1.6872941151721794,
|
|
"learning_rate": 4.145123420376745e-06,
|
|
"loss": 0.4604,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.5684678016284234,
|
|
"grad_norm": 1.3923901318290877,
|
|
"learning_rate": 4.136531350317069e-06,
|
|
"loss": 0.4608,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 0.5690061234102685,
|
|
"grad_norm": 1.7627677860939457,
|
|
"learning_rate": 4.127941907612553e-06,
|
|
"loss": 0.4345,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 0.5695444451921136,
|
|
"grad_norm": 1.6236383393521263,
|
|
"learning_rate": 4.11935511839908e-06,
|
|
"loss": 0.4599,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 0.5700827669739587,
|
|
"grad_norm": 1.5390392661613181,
|
|
"learning_rate": 4.110771008804463e-06,
|
|
"loss": 0.4822,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 0.5706210887558038,
|
|
"grad_norm": 1.6460116304075034,
|
|
"learning_rate": 4.102189604948356e-06,
|
|
"loss": 0.4277,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.5711594105376488,
|
|
"grad_norm": 1.4089445870425645,
|
|
"learning_rate": 4.093610932942184e-06,
|
|
"loss": 0.4055,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 0.571697732319494,
|
|
"grad_norm": 1.4912945610802475,
|
|
"learning_rate": 4.085035018889058e-06,
|
|
"loss": 0.4081,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 0.572236054101339,
|
|
"grad_norm": 1.7313554326427134,
|
|
"learning_rate": 4.076461888883696e-06,
|
|
"loss": 0.4516,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 0.5727743758831841,
|
|
"grad_norm": 1.438398770463997,
|
|
"learning_rate": 4.067891569012347e-06,
|
|
"loss": 0.4591,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 0.5733126976650292,
|
|
"grad_norm": 1.2911877198700585,
|
|
"learning_rate": 4.059324085352709e-06,
|
|
"loss": 0.3877,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.5738510194468743,
|
|
"grad_norm": 1.4799665950387828,
|
|
"learning_rate": 4.050759463973849e-06,
|
|
"loss": 0.4027,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 0.5743893412287194,
|
|
"grad_norm": 1.31856553741587,
|
|
"learning_rate": 4.042197730936124e-06,
|
|
"loss": 0.4385,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 0.5749276630105645,
|
|
"grad_norm": 1.4681673368671948,
|
|
"learning_rate": 4.033638912291104e-06,
|
|
"loss": 0.4699,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 0.5754659847924096,
|
|
"grad_norm": 1.8186933987892613,
|
|
"learning_rate": 4.025083034081492e-06,
|
|
"loss": 0.474,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 0.5760043065742547,
|
|
"grad_norm": 1.7243406009536202,
|
|
"learning_rate": 4.016530122341039e-06,
|
|
"loss": 0.4664,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.5765426283560998,
|
|
"grad_norm": 1.7574219154990909,
|
|
"learning_rate": 4.007980203094476e-06,
|
|
"loss": 0.412,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 0.5770809501379449,
|
|
"grad_norm": 3.3723520725361325,
|
|
"learning_rate": 3.999433302357427e-06,
|
|
"loss": 0.3745,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 0.57761927191979,
|
|
"grad_norm": 1.470644839329035,
|
|
"learning_rate": 3.990889446136326e-06,
|
|
"loss": 0.4192,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 0.5781575937016351,
|
|
"grad_norm": 1.8064402874305607,
|
|
"learning_rate": 3.982348660428349e-06,
|
|
"loss": 0.4633,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 0.5786959154834802,
|
|
"grad_norm": 1.5560108586108519,
|
|
"learning_rate": 3.9738109712213255e-06,
|
|
"loss": 0.4554,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.5792342372653253,
|
|
"grad_norm": 1.390022072661602,
|
|
"learning_rate": 3.965276404493667e-06,
|
|
"loss": 0.4468,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 0.5797725590471704,
|
|
"grad_norm": 1.5485174930428875,
|
|
"learning_rate": 3.956744986214281e-06,
|
|
"loss": 0.4406,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 0.5803108808290155,
|
|
"grad_norm": 1.377328803064819,
|
|
"learning_rate": 3.948216742342492e-06,
|
|
"loss": 0.3914,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 0.5808492026108606,
|
|
"grad_norm": 1.7377815121930535,
|
|
"learning_rate": 3.939691698827975e-06,
|
|
"loss": 0.4409,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 0.5813875243927057,
|
|
"grad_norm": 1.584949416405362,
|
|
"learning_rate": 3.931169881610655e-06,
|
|
"loss": 0.4909,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.5813875243927057,
|
|
"eval_loss": 0.43915173411369324,
|
|
"eval_runtime": 1551.2876,
|
|
"eval_samples_per_second": 16.122,
|
|
"eval_steps_per_second": 0.504,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.5819258461745508,
|
|
"grad_norm": 1.4259479318176305,
|
|
"learning_rate": 3.922651316620648e-06,
|
|
"loss": 0.419,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 0.5824641679563959,
|
|
"grad_norm": 1.883836889268125,
|
|
"learning_rate": 3.914136029778173e-06,
|
|
"loss": 0.4847,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 0.583002489738241,
|
|
"grad_norm": 1.5440830790183266,
|
|
"learning_rate": 3.905624046993474e-06,
|
|
"loss": 0.4484,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 0.5835408115200861,
|
|
"grad_norm": 1.711059696428319,
|
|
"learning_rate": 3.897115394166738e-06,
|
|
"loss": 0.4682,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 0.5840791333019312,
|
|
"grad_norm": 1.8908190002251042,
|
|
"learning_rate": 3.8886100971880235e-06,
|
|
"loss": 0.4325,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.5846174550837764,
|
|
"grad_norm": 1.5374015806352503,
|
|
"learning_rate": 3.880108181937178e-06,
|
|
"loss": 0.4434,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 0.5851557768656215,
|
|
"grad_norm": 1.864521131460447,
|
|
"learning_rate": 3.871609674283757e-06,
|
|
"loss": 0.4649,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 0.5856940986474666,
|
|
"grad_norm": 1.9214802187823141,
|
|
"learning_rate": 3.863114600086948e-06,
|
|
"loss": 0.452,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 0.5862324204293117,
|
|
"grad_norm": 1.3598584887277212,
|
|
"learning_rate": 3.854622985195492e-06,
|
|
"loss": 0.466,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 0.5867707422111568,
|
|
"grad_norm": 1.6127091744766286,
|
|
"learning_rate": 3.846134855447602e-06,
|
|
"loss": 0.4627,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.5873090639930019,
|
|
"grad_norm": 1.4648349504902127,
|
|
"learning_rate": 3.837650236670892e-06,
|
|
"loss": 0.3967,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 0.587847385774847,
|
|
"grad_norm": 1.8146408700451369,
|
|
"learning_rate": 3.829169154682283e-06,
|
|
"loss": 0.4271,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 0.5883857075566921,
|
|
"grad_norm": 1.7751846942753446,
|
|
"learning_rate": 3.8206916352879446e-06,
|
|
"loss": 0.4464,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 0.5889240293385372,
|
|
"grad_norm": 1.6612024138612147,
|
|
"learning_rate": 3.8122177042832e-06,
|
|
"loss": 0.4107,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 0.5894623511203823,
|
|
"grad_norm": 2.812616379162355,
|
|
"learning_rate": 3.8037473874524542e-06,
|
|
"loss": 0.4584,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.5900006729022274,
|
|
"grad_norm": 1.3709537212409602,
|
|
"learning_rate": 3.7952807105691185e-06,
|
|
"loss": 0.4356,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 0.5905389946840724,
|
|
"grad_norm": 1.2984038273503478,
|
|
"learning_rate": 3.7868176993955253e-06,
|
|
"loss": 0.426,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 0.5910773164659175,
|
|
"grad_norm": 1.6589883894837865,
|
|
"learning_rate": 3.7783583796828543e-06,
|
|
"loss": 0.4449,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 0.5916156382477626,
|
|
"grad_norm": 1.66006556219293,
|
|
"learning_rate": 3.769902777171051e-06,
|
|
"loss": 0.493,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 0.5921539600296077,
|
|
"grad_norm": 1.5937225644555308,
|
|
"learning_rate": 3.761450917588753e-06,
|
|
"loss": 0.4723,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.5926922818114528,
|
|
"grad_norm": 1.3456146090228862,
|
|
"learning_rate": 3.7530028266532074e-06,
|
|
"loss": 0.4137,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 0.5932306035932979,
|
|
"grad_norm": 1.679198037724048,
|
|
"learning_rate": 3.744558530070196e-06,
|
|
"loss": 0.4261,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 0.593768925375143,
|
|
"grad_norm": 1.581894355411804,
|
|
"learning_rate": 3.7361180535339504e-06,
|
|
"loss": 0.4612,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 0.5943072471569881,
|
|
"grad_norm": 1.4999393803804146,
|
|
"learning_rate": 3.7276814227270842e-06,
|
|
"loss": 0.4242,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 0.5948455689388332,
|
|
"grad_norm": 1.6700110113661726,
|
|
"learning_rate": 3.719248663320506e-06,
|
|
"loss": 0.4536,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.5953838907206783,
|
|
"grad_norm": 1.4628534581538355,
|
|
"learning_rate": 3.7108198009733454e-06,
|
|
"loss": 0.3885,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 0.5959222125025234,
|
|
"grad_norm": 1.5174908060004981,
|
|
"learning_rate": 3.7023948613328736e-06,
|
|
"loss": 0.4688,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 0.5964605342843685,
|
|
"grad_norm": 1.6277090494975097,
|
|
"learning_rate": 3.6939738700344264e-06,
|
|
"loss": 0.4404,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 0.5969988560662136,
|
|
"grad_norm": 2.5097831655290954,
|
|
"learning_rate": 3.6855568527013273e-06,
|
|
"loss": 0.4608,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 0.5975371778480587,
|
|
"grad_norm": 1.4992012722834578,
|
|
"learning_rate": 3.677143834944803e-06,
|
|
"loss": 0.4446,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.5980754996299038,
|
|
"grad_norm": 1.4139401580995998,
|
|
"learning_rate": 3.6687348423639147e-06,
|
|
"loss": 0.4098,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 0.5986138214117489,
|
|
"grad_norm": 2.0752058550686585,
|
|
"learning_rate": 3.6603299005454744e-06,
|
|
"loss": 0.4234,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 0.599152143193594,
|
|
"grad_norm": 1.6967487088214965,
|
|
"learning_rate": 3.6519290350639697e-06,
|
|
"loss": 0.4348,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 0.5996904649754391,
|
|
"grad_norm": 1.7094622508466781,
|
|
"learning_rate": 3.6435322714814813e-06,
|
|
"loss": 0.4584,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 0.6002287867572842,
|
|
"grad_norm": 1.5333043053128887,
|
|
"learning_rate": 3.635139635347612e-06,
|
|
"loss": 0.4211,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.6007671085391293,
|
|
"grad_norm": 1.447440380533825,
|
|
"learning_rate": 3.626751152199406e-06,
|
|
"loss": 0.4392,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 0.6013054303209744,
|
|
"grad_norm": 1.558545230893266,
|
|
"learning_rate": 3.6183668475612665e-06,
|
|
"loss": 0.4553,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 0.6018437521028195,
|
|
"grad_norm": 1.7341397982742823,
|
|
"learning_rate": 3.6099867469448874e-06,
|
|
"loss": 0.4521,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 0.6023820738846646,
|
|
"grad_norm": 3.5577384559068075,
|
|
"learning_rate": 3.601610875849168e-06,
|
|
"loss": 0.4999,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 0.6029203956665097,
|
|
"grad_norm": 1.3499033786926813,
|
|
"learning_rate": 3.5932392597601396e-06,
|
|
"loss": 0.4273,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.6034587174483548,
|
|
"grad_norm": 1.49775810523526,
|
|
"learning_rate": 3.584871924150883e-06,
|
|
"loss": 0.4275,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 0.6039970392301999,
|
|
"grad_norm": 1.4867216376875734,
|
|
"learning_rate": 3.576508894481458e-06,
|
|
"loss": 0.443,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 0.604535361012045,
|
|
"grad_norm": 1.8077118144262816,
|
|
"learning_rate": 3.5681501961988212e-06,
|
|
"loss": 0.408,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 0.6050736827938901,
|
|
"grad_norm": 2.0530433441295535,
|
|
"learning_rate": 3.5597958547367507e-06,
|
|
"loss": 0.3988,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 0.6056120045757352,
|
|
"grad_norm": 1.4118492293118154,
|
|
"learning_rate": 3.551445895515765e-06,
|
|
"loss": 0.477,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.6061503263575803,
|
|
"grad_norm": 1.7018214299556869,
|
|
"learning_rate": 3.5431003439430493e-06,
|
|
"loss": 0.4441,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 0.6066886481394254,
|
|
"grad_norm": 1.434018580532193,
|
|
"learning_rate": 3.5347592254123795e-06,
|
|
"loss": 0.4539,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 0.6072269699212705,
|
|
"grad_norm": 1.4867130289511963,
|
|
"learning_rate": 3.526422565304042e-06,
|
|
"loss": 0.4158,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 0.6077652917031156,
|
|
"grad_norm": 1.4715457603229556,
|
|
"learning_rate": 3.518090388984753e-06,
|
|
"loss": 0.425,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 0.6083036134849606,
|
|
"grad_norm": 1.4891631829297116,
|
|
"learning_rate": 3.5097627218075905e-06,
|
|
"loss": 0.4551,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.6088419352668057,
|
|
"grad_norm": 1.38559309859237,
|
|
"learning_rate": 3.5014395891119112e-06,
|
|
"loss": 0.3903,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 0.6093802570486508,
|
|
"grad_norm": 1.5211311736282844,
|
|
"learning_rate": 3.4931210162232716e-06,
|
|
"loss": 0.474,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 0.6099185788304959,
|
|
"grad_norm": 3.910273590345733,
|
|
"learning_rate": 3.484807028453356e-06,
|
|
"loss": 0.4386,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 0.610456900612341,
|
|
"grad_norm": 1.21915593287012,
|
|
"learning_rate": 3.476497651099897e-06,
|
|
"loss": 0.4214,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 0.6109952223941861,
|
|
"grad_norm": 7.218438211629208,
|
|
"learning_rate": 3.4681929094465987e-06,
|
|
"loss": 0.4368,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.6115335441760312,
|
|
"grad_norm": 1.5885679173464573,
|
|
"learning_rate": 3.4598928287630585e-06,
|
|
"loss": 0.4304,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 0.6120718659578763,
|
|
"grad_norm": 1.6276966755475062,
|
|
"learning_rate": 3.451597434304692e-06,
|
|
"loss": 0.4303,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 0.6126101877397214,
|
|
"grad_norm": 2.4974771072637227,
|
|
"learning_rate": 3.443306751312656e-06,
|
|
"loss": 0.4812,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 0.6131485095215665,
|
|
"grad_norm": 1.8523418655749138,
|
|
"learning_rate": 3.435020805013773e-06,
|
|
"loss": 0.4464,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 0.6136868313034116,
|
|
"grad_norm": 1.6153961476534389,
|
|
"learning_rate": 3.4267396206204477e-06,
|
|
"loss": 0.4258,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.6136868313034116,
|
|
"eval_loss": 0.4358210265636444,
|
|
"eval_runtime": 1559.0889,
|
|
"eval_samples_per_second": 16.041,
|
|
"eval_steps_per_second": 0.502,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.6142251530852567,
|
|
"grad_norm": 1.5200314946583775,
|
|
"learning_rate": 3.4184632233306004e-06,
|
|
"loss": 0.4328,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 0.6147634748671018,
|
|
"grad_norm": 1.753239287330404,
|
|
"learning_rate": 3.4101916383275836e-06,
|
|
"loss": 0.4164,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 0.6153017966489469,
|
|
"grad_norm": 1.3784614615536817,
|
|
"learning_rate": 3.4019248907801058e-06,
|
|
"loss": 0.407,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 0.615840118430792,
|
|
"grad_norm": 1.4916546024442217,
|
|
"learning_rate": 3.3936630058421567e-06,
|
|
"loss": 0.4449,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 0.6163784402126371,
|
|
"grad_norm": 1.411016335795447,
|
|
"learning_rate": 3.385406008652931e-06,
|
|
"loss": 0.4137,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.6169167619944822,
|
|
"grad_norm": 1.969929829038151,
|
|
"learning_rate": 3.3771539243367517e-06,
|
|
"loss": 0.4569,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 0.6174550837763273,
|
|
"grad_norm": 1.4268646662770854,
|
|
"learning_rate": 3.3689067780029895e-06,
|
|
"loss": 0.4399,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 0.6179934055581724,
|
|
"grad_norm": 1.4858645297475759,
|
|
"learning_rate": 3.3606645947459933e-06,
|
|
"loss": 0.4318,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 0.6185317273400175,
|
|
"grad_norm": 2.07970165108201,
|
|
"learning_rate": 3.3524273996450087e-06,
|
|
"loss": 0.4804,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 0.6190700491218626,
|
|
"grad_norm": 1.5524399522642343,
|
|
"learning_rate": 3.3441952177641046e-06,
|
|
"loss": 0.448,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.6196083709037077,
|
|
"grad_norm": 1.5025047668730835,
|
|
"learning_rate": 3.335968074152094e-06,
|
|
"loss": 0.4229,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 0.6201466926855528,
|
|
"grad_norm": 1.51932290948172,
|
|
"learning_rate": 3.32774599384246e-06,
|
|
"loss": 0.4238,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 0.6206850144673979,
|
|
"grad_norm": 1.4003637291864899,
|
|
"learning_rate": 3.319529001853282e-06,
|
|
"loss": 0.4618,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 0.621223336249243,
|
|
"grad_norm": 1.3792399628540106,
|
|
"learning_rate": 3.311317123187151e-06,
|
|
"loss": 0.4052,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 0.6217616580310881,
|
|
"grad_norm": 1.4341824487711958,
|
|
"learning_rate": 3.3031103828311044e-06,
|
|
"loss": 0.4452,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.6222999798129332,
|
|
"grad_norm": 1.8890388921678993,
|
|
"learning_rate": 3.294908805756543e-06,
|
|
"loss": 0.4311,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 0.6228383015947783,
|
|
"grad_norm": 1.6873174271659632,
|
|
"learning_rate": 3.286712416919156e-06,
|
|
"loss": 0.465,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 0.6233766233766234,
|
|
"grad_norm": 2.113957712483436,
|
|
"learning_rate": 3.2785212412588464e-06,
|
|
"loss": 0.4103,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 0.6239149451584685,
|
|
"grad_norm": 1.6169473829408894,
|
|
"learning_rate": 3.2703353036996553e-06,
|
|
"loss": 0.4042,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 0.6244532669403136,
|
|
"grad_norm": 1.6678579140480474,
|
|
"learning_rate": 3.262154629149684e-06,
|
|
"loss": 0.4849,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.6249915887221587,
|
|
"grad_norm": 1.5133551741537392,
|
|
"learning_rate": 3.253979242501023e-06,
|
|
"loss": 0.4479,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 0.6255299105040037,
|
|
"grad_norm": 1.5463516633606489,
|
|
"learning_rate": 3.2458091686296666e-06,
|
|
"loss": 0.4589,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 0.6260682322858488,
|
|
"grad_norm": 1.3908513399535982,
|
|
"learning_rate": 3.2376444323954487e-06,
|
|
"loss": 0.407,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 0.6266065540676939,
|
|
"grad_norm": 1.4911824388993882,
|
|
"learning_rate": 3.2294850586419603e-06,
|
|
"loss": 0.4016,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 0.627144875849539,
|
|
"grad_norm": 1.4342504928355473,
|
|
"learning_rate": 3.2213310721964753e-06,
|
|
"loss": 0.4269,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.6276831976313841,
|
|
"grad_norm": 1.5982636474188436,
|
|
"learning_rate": 3.2131824978698744e-06,
|
|
"loss": 0.4532,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 0.6282215194132292,
|
|
"grad_norm": 1.3672342575621805,
|
|
"learning_rate": 3.2050393604565722e-06,
|
|
"loss": 0.3972,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 0.6287598411950743,
|
|
"grad_norm": 1.6874817093257244,
|
|
"learning_rate": 3.196901684734439e-06,
|
|
"loss": 0.457,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 0.6292981629769194,
|
|
"grad_norm": 1.5723777384143767,
|
|
"learning_rate": 3.188769495464725e-06,
|
|
"loss": 0.3892,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 0.6298364847587645,
|
|
"grad_norm": 1.601524939347794,
|
|
"learning_rate": 3.180642817391988e-06,
|
|
"loss": 0.4433,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.6303748065406096,
|
|
"grad_norm": 2.25805654454037,
|
|
"learning_rate": 3.172521675244016e-06,
|
|
"loss": 0.4322,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 0.6309131283224547,
|
|
"grad_norm": 1.5555079250741115,
|
|
"learning_rate": 3.1644060937317523e-06,
|
|
"loss": 0.391,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 0.6314514501042998,
|
|
"grad_norm": 1.4992699551350894,
|
|
"learning_rate": 3.1562960975492194e-06,
|
|
"loss": 0.4044,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 0.6319897718861449,
|
|
"grad_norm": 1.5799132322735037,
|
|
"learning_rate": 3.1481917113734474e-06,
|
|
"loss": 0.3812,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 0.63252809366799,
|
|
"grad_norm": 1.7698333563655604,
|
|
"learning_rate": 3.140092959864392e-06,
|
|
"loss": 0.4353,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.6330664154498351,
|
|
"grad_norm": 1.568455528145148,
|
|
"learning_rate": 3.1319998676648695e-06,
|
|
"loss": 0.4307,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 0.6336047372316802,
|
|
"grad_norm": 1.6539679705814518,
|
|
"learning_rate": 3.12391245940047e-06,
|
|
"loss": 0.4269,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 0.6341430590135253,
|
|
"grad_norm": 1.7204853297231233,
|
|
"learning_rate": 3.115830759679492e-06,
|
|
"loss": 0.4857,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 0.6346813807953704,
|
|
"grad_norm": 1.6626863719528417,
|
|
"learning_rate": 3.1077547930928652e-06,
|
|
"loss": 0.4681,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 0.6352197025772155,
|
|
"grad_norm": 1.6842711637823262,
|
|
"learning_rate": 3.0996845842140716e-06,
|
|
"loss": 0.4312,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.6357580243590606,
|
|
"grad_norm": 1.7431784823037149,
|
|
"learning_rate": 3.091620157599075e-06,
|
|
"loss": 0.4206,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 0.6362963461409057,
|
|
"grad_norm": 1.7565059915579697,
|
|
"learning_rate": 3.0835615377862453e-06,
|
|
"loss": 0.4787,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 0.6368346679227508,
|
|
"grad_norm": 1.5940508036600212,
|
|
"learning_rate": 3.0755087492962844e-06,
|
|
"loss": 0.3977,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 0.6373729897045959,
|
|
"grad_norm": 1.4265440236436624,
|
|
"learning_rate": 3.0674618166321477e-06,
|
|
"loss": 0.4455,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 0.637911311486441,
|
|
"grad_norm": 1.5203806820148102,
|
|
"learning_rate": 3.059420764278975e-06,
|
|
"loss": 0.4421,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.6384496332682861,
|
|
"grad_norm": 1.7485388075672719,
|
|
"learning_rate": 3.0513856167040123e-06,
|
|
"loss": 0.4337,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 0.6389879550501312,
|
|
"grad_norm": 1.5758916072812403,
|
|
"learning_rate": 3.0433563983565415e-06,
|
|
"loss": 0.483,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 0.6395262768319763,
|
|
"grad_norm": 1.7757740619316615,
|
|
"learning_rate": 3.0353331336677984e-06,
|
|
"loss": 0.402,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 0.6400645986138214,
|
|
"grad_norm": 1.5639356203741708,
|
|
"learning_rate": 3.027315847050906e-06,
|
|
"loss": 0.4588,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 0.6406029203956665,
|
|
"grad_norm": 1.900913903628273,
|
|
"learning_rate": 3.0193045629007982e-06,
|
|
"loss": 0.4318,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.6411412421775116,
|
|
"grad_norm": 1.7813979669008324,
|
|
"learning_rate": 3.011299305594141e-06,
|
|
"loss": 0.4444,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 0.6416795639593567,
|
|
"grad_norm": 1.4267787696799576,
|
|
"learning_rate": 3.0033000994892646e-06,
|
|
"loss": 0.4394,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 0.6422178857412018,
|
|
"grad_norm": 1.425734282167891,
|
|
"learning_rate": 2.995306968926087e-06,
|
|
"loss": 0.4729,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 0.6427562075230469,
|
|
"grad_norm": 1.6415657973276232,
|
|
"learning_rate": 2.98731993822604e-06,
|
|
"loss": 0.4644,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 0.643294529304892,
|
|
"grad_norm": 1.8314597950910743,
|
|
"learning_rate": 2.97933903169199e-06,
|
|
"loss": 0.5308,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.643832851086737,
|
|
"grad_norm": 1.5314208582263587,
|
|
"learning_rate": 2.9713642736081755e-06,
|
|
"loss": 0.4539,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 0.6443711728685821,
|
|
"grad_norm": 1.7043966331574372,
|
|
"learning_rate": 2.9633956882401215e-06,
|
|
"loss": 0.4478,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"epoch": 0.6449094946504272,
|
|
"grad_norm": 1.3896380014466228,
|
|
"learning_rate": 2.955433299834576e-06,
|
|
"loss": 0.4274,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"epoch": 0.6454478164322723,
|
|
"grad_norm": 1.328466975562685,
|
|
"learning_rate": 2.947477132619423e-06,
|
|
"loss": 0.4151,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"epoch": 0.6459861382141174,
|
|
"grad_norm": 1.4947495053829816,
|
|
"learning_rate": 2.939527210803624e-06,
|
|
"loss": 0.4225,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.6459861382141174,
|
|
"eval_loss": 0.43335118889808655,
|
|
"eval_runtime": 1568.1591,
|
|
"eval_samples_per_second": 15.949,
|
|
"eval_steps_per_second": 0.499,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.6465244599959626,
|
|
"grad_norm": 1.7770419353679783,
|
|
"learning_rate": 2.9315835585771334e-06,
|
|
"loss": 0.4443,
|
|
"step": 1201
|
|
},
|
|
{
|
|
"epoch": 0.6470627817778077,
|
|
"grad_norm": 1.509257884926516,
|
|
"learning_rate": 2.923646200110832e-06,
|
|
"loss": 0.403,
|
|
"step": 1202
|
|
},
|
|
{
|
|
"epoch": 0.6476011035596528,
|
|
"grad_norm": 1.413359799607147,
|
|
"learning_rate": 2.915715159556444e-06,
|
|
"loss": 0.3995,
|
|
"step": 1203
|
|
},
|
|
{
|
|
"epoch": 0.6481394253414979,
|
|
"grad_norm": 1.4051405846579907,
|
|
"learning_rate": 2.9077904610464745e-06,
|
|
"loss": 0.3597,
|
|
"step": 1204
|
|
},
|
|
{
|
|
"epoch": 0.648677747123343,
|
|
"grad_norm": 1.5857210618229394,
|
|
"learning_rate": 2.89987212869413e-06,
|
|
"loss": 0.448,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.6492160689051881,
|
|
"grad_norm": 1.3723187404527468,
|
|
"learning_rate": 2.8919601865932456e-06,
|
|
"loss": 0.4522,
|
|
"step": 1206
|
|
},
|
|
{
|
|
"epoch": 0.6497543906870332,
|
|
"grad_norm": 1.3511061410304184,
|
|
"learning_rate": 2.884054658818214e-06,
|
|
"loss": 0.3792,
|
|
"step": 1207
|
|
},
|
|
{
|
|
"epoch": 0.6502927124688783,
|
|
"grad_norm": 1.387760091675675,
|
|
"learning_rate": 2.8761555694239046e-06,
|
|
"loss": 0.4515,
|
|
"step": 1208
|
|
},
|
|
{
|
|
"epoch": 0.6508310342507234,
|
|
"grad_norm": 1.4247593593472396,
|
|
"learning_rate": 2.868262942445603e-06,
|
|
"loss": 0.4489,
|
|
"step": 1209
|
|
},
|
|
{
|
|
"epoch": 0.6513693560325685,
|
|
"grad_norm": 1.600671347691334,
|
|
"learning_rate": 2.8603768018989275e-06,
|
|
"loss": 0.3944,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.6519076778144136,
|
|
"grad_norm": 1.4284428882228806,
|
|
"learning_rate": 2.852497171779761e-06,
|
|
"loss": 0.432,
|
|
"step": 1211
|
|
},
|
|
{
|
|
"epoch": 0.6524459995962587,
|
|
"grad_norm": 1.8170320001458748,
|
|
"learning_rate": 2.8446240760641762e-06,
|
|
"loss": 0.483,
|
|
"step": 1212
|
|
},
|
|
{
|
|
"epoch": 0.6529843213781038,
|
|
"grad_norm": 1.872300633931277,
|
|
"learning_rate": 2.836757538708362e-06,
|
|
"loss": 0.4226,
|
|
"step": 1213
|
|
},
|
|
{
|
|
"epoch": 0.6535226431599489,
|
|
"grad_norm": 1.5545253276420463,
|
|
"learning_rate": 2.8288975836485523e-06,
|
|
"loss": 0.4452,
|
|
"step": 1214
|
|
},
|
|
{
|
|
"epoch": 0.654060964941794,
|
|
"grad_norm": 1.4689119979210103,
|
|
"learning_rate": 2.8210442348009543e-06,
|
|
"loss": 0.4206,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.6545992867236391,
|
|
"grad_norm": 1.495722266239985,
|
|
"learning_rate": 2.8131975160616686e-06,
|
|
"loss": 0.4555,
|
|
"step": 1216
|
|
},
|
|
{
|
|
"epoch": 0.6551376085054842,
|
|
"grad_norm": 1.4286754464458904,
|
|
"learning_rate": 2.805357451306626e-06,
|
|
"loss": 0.4531,
|
|
"step": 1217
|
|
},
|
|
{
|
|
"epoch": 0.6556759302873293,
|
|
"grad_norm": 1.6604089854519999,
|
|
"learning_rate": 2.797524064391511e-06,
|
|
"loss": 0.4351,
|
|
"step": 1218
|
|
},
|
|
{
|
|
"epoch": 0.6562142520691744,
|
|
"grad_norm": 1.677727217993553,
|
|
"learning_rate": 2.7896973791516867e-06,
|
|
"loss": 0.4797,
|
|
"step": 1219
|
|
},
|
|
{
|
|
"epoch": 0.6567525738510195,
|
|
"grad_norm": 1.8188528752490087,
|
|
"learning_rate": 2.781877419402126e-06,
|
|
"loss": 0.3942,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.6572908956328646,
|
|
"grad_norm": 1.518304729497582,
|
|
"learning_rate": 2.7740642089373356e-06,
|
|
"loss": 0.4567,
|
|
"step": 1221
|
|
},
|
|
{
|
|
"epoch": 0.6578292174147097,
|
|
"grad_norm": 1.9076520179847476,
|
|
"learning_rate": 2.76625777153129e-06,
|
|
"loss": 0.4761,
|
|
"step": 1222
|
|
},
|
|
{
|
|
"epoch": 0.6583675391965548,
|
|
"grad_norm": 1.6501027454283104,
|
|
"learning_rate": 2.758458130937346e-06,
|
|
"loss": 0.4568,
|
|
"step": 1223
|
|
},
|
|
{
|
|
"epoch": 0.6589058609783999,
|
|
"grad_norm": 1.4971909664683323,
|
|
"learning_rate": 2.7506653108881885e-06,
|
|
"loss": 0.4534,
|
|
"step": 1224
|
|
},
|
|
{
|
|
"epoch": 0.659444182760245,
|
|
"grad_norm": 1.8216935826384455,
|
|
"learning_rate": 2.742879335095743e-06,
|
|
"loss": 0.4872,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.6599825045420901,
|
|
"grad_norm": 1.441369836777809,
|
|
"learning_rate": 2.735100227251113e-06,
|
|
"loss": 0.3857,
|
|
"step": 1226
|
|
},
|
|
{
|
|
"epoch": 0.6605208263239352,
|
|
"grad_norm": 1.3907320663098741,
|
|
"learning_rate": 2.7273280110245e-06,
|
|
"loss": 0.4055,
|
|
"step": 1227
|
|
},
|
|
{
|
|
"epoch": 0.6610591481057803,
|
|
"grad_norm": 1.3629302314750185,
|
|
"learning_rate": 2.719562710065142e-06,
|
|
"loss": 0.4059,
|
|
"step": 1228
|
|
},
|
|
{
|
|
"epoch": 0.6615974698876254,
|
|
"grad_norm": 1.5181251515722511,
|
|
"learning_rate": 2.711804348001231e-06,
|
|
"loss": 0.4927,
|
|
"step": 1229
|
|
},
|
|
{
|
|
"epoch": 0.6621357916694705,
|
|
"grad_norm": 1.583461554714453,
|
|
"learning_rate": 2.704052948439842e-06,
|
|
"loss": 0.4139,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.6626741134513155,
|
|
"grad_norm": 1.597683792644596,
|
|
"learning_rate": 2.6963085349668718e-06,
|
|
"loss": 0.4299,
|
|
"step": 1231
|
|
},
|
|
{
|
|
"epoch": 0.6632124352331606,
|
|
"grad_norm": 1.4538764746820028,
|
|
"learning_rate": 2.6885711311469547e-06,
|
|
"loss": 0.4238,
|
|
"step": 1232
|
|
},
|
|
{
|
|
"epoch": 0.6637507570150057,
|
|
"grad_norm": 1.5760098860778269,
|
|
"learning_rate": 2.6808407605234006e-06,
|
|
"loss": 0.4605,
|
|
"step": 1233
|
|
},
|
|
{
|
|
"epoch": 0.6642890787968508,
|
|
"grad_norm": 1.8819638022647283,
|
|
"learning_rate": 2.673117446618114e-06,
|
|
"loss": 0.4176,
|
|
"step": 1234
|
|
},
|
|
{
|
|
"epoch": 0.6648274005786959,
|
|
"grad_norm": 1.7467867886896942,
|
|
"learning_rate": 2.665401212931532e-06,
|
|
"loss": 0.4284,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.665365722360541,
|
|
"grad_norm": 1.3582161008888671,
|
|
"learning_rate": 2.6576920829425434e-06,
|
|
"loss": 0.449,
|
|
"step": 1236
|
|
},
|
|
{
|
|
"epoch": 0.6659040441423861,
|
|
"grad_norm": 1.7112669988534182,
|
|
"learning_rate": 2.6499900801084283e-06,
|
|
"loss": 0.4702,
|
|
"step": 1237
|
|
},
|
|
{
|
|
"epoch": 0.6664423659242312,
|
|
"grad_norm": 2.099925951296545,
|
|
"learning_rate": 2.6422952278647705e-06,
|
|
"loss": 0.4592,
|
|
"step": 1238
|
|
},
|
|
{
|
|
"epoch": 0.6669806877060763,
|
|
"grad_norm": 1.4352705146813356,
|
|
"learning_rate": 2.6346075496254054e-06,
|
|
"loss": 0.384,
|
|
"step": 1239
|
|
},
|
|
{
|
|
"epoch": 0.6675190094879214,
|
|
"grad_norm": 1.89895053480487,
|
|
"learning_rate": 2.6269270687823337e-06,
|
|
"loss": 0.4632,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.6680573312697665,
|
|
"grad_norm": 1.527126991788229,
|
|
"learning_rate": 2.619253808705661e-06,
|
|
"loss": 0.4304,
|
|
"step": 1241
|
|
},
|
|
{
|
|
"epoch": 0.6685956530516116,
|
|
"grad_norm": 1.9088122860113825,
|
|
"learning_rate": 2.6115877927435152e-06,
|
|
"loss": 0.4615,
|
|
"step": 1242
|
|
},
|
|
{
|
|
"epoch": 0.6691339748334567,
|
|
"grad_norm": 1.5152814714510374,
|
|
"learning_rate": 2.6039290442219884e-06,
|
|
"loss": 0.4019,
|
|
"step": 1243
|
|
},
|
|
{
|
|
"epoch": 0.6696722966153018,
|
|
"grad_norm": 1.490222426325067,
|
|
"learning_rate": 2.5962775864450563e-06,
|
|
"loss": 0.425,
|
|
"step": 1244
|
|
},
|
|
{
|
|
"epoch": 0.6702106183971469,
|
|
"grad_norm": 1.5269175130136061,
|
|
"learning_rate": 2.588633442694508e-06,
|
|
"loss": 0.3988,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.670748940178992,
|
|
"grad_norm": 1.4416954872355545,
|
|
"learning_rate": 2.5809966362298805e-06,
|
|
"loss": 0.4603,
|
|
"step": 1246
|
|
},
|
|
{
|
|
"epoch": 0.6712872619608371,
|
|
"grad_norm": 2.6364873275752014,
|
|
"learning_rate": 2.573367190288385e-06,
|
|
"loss": 0.4648,
|
|
"step": 1247
|
|
},
|
|
{
|
|
"epoch": 0.6718255837426822,
|
|
"grad_norm": 1.788546820645697,
|
|
"learning_rate": 2.5657451280848355e-06,
|
|
"loss": 0.4635,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"epoch": 0.6723639055245273,
|
|
"grad_norm": 1.3806063124644692,
|
|
"learning_rate": 2.5581304728115797e-06,
|
|
"loss": 0.4943,
|
|
"step": 1249
|
|
},
|
|
{
|
|
"epoch": 0.6729022273063724,
|
|
"grad_norm": 1.402487270939909,
|
|
"learning_rate": 2.550523247638426e-06,
|
|
"loss": 0.4006,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.6734405490882175,
|
|
"grad_norm": 1.910681275697032,
|
|
"learning_rate": 2.542923475712574e-06,
|
|
"loss": 0.4609,
|
|
"step": 1251
|
|
},
|
|
{
|
|
"epoch": 0.6739788708700626,
|
|
"grad_norm": 1.446121535462886,
|
|
"learning_rate": 2.5353311801585507e-06,
|
|
"loss": 0.4092,
|
|
"step": 1252
|
|
},
|
|
{
|
|
"epoch": 0.6745171926519077,
|
|
"grad_norm": 1.6008122915794563,
|
|
"learning_rate": 2.5277463840781236e-06,
|
|
"loss": 0.4648,
|
|
"step": 1253
|
|
},
|
|
{
|
|
"epoch": 0.6750555144337528,
|
|
"grad_norm": 1.8052193116478468,
|
|
"learning_rate": 2.520169110550248e-06,
|
|
"loss": 0.4325,
|
|
"step": 1254
|
|
},
|
|
{
|
|
"epoch": 0.6755938362155979,
|
|
"grad_norm": 2.0544496666589245,
|
|
"learning_rate": 2.5125993826309904e-06,
|
|
"loss": 0.4102,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.676132157997443,
|
|
"grad_norm": 1.5511129757696938,
|
|
"learning_rate": 2.5050372233534526e-06,
|
|
"loss": 0.4443,
|
|
"step": 1256
|
|
},
|
|
{
|
|
"epoch": 0.6766704797792881,
|
|
"grad_norm": 1.8672906417068529,
|
|
"learning_rate": 2.4974826557277115e-06,
|
|
"loss": 0.4516,
|
|
"step": 1257
|
|
},
|
|
{
|
|
"epoch": 0.6772088015611332,
|
|
"grad_norm": 1.4831806217941237,
|
|
"learning_rate": 2.489935702740741e-06,
|
|
"loss": 0.4347,
|
|
"step": 1258
|
|
},
|
|
{
|
|
"epoch": 0.6777471233429783,
|
|
"grad_norm": 1.5986607931002996,
|
|
"learning_rate": 2.4823963873563487e-06,
|
|
"loss": 0.427,
|
|
"step": 1259
|
|
},
|
|
{
|
|
"epoch": 0.6782854451248234,
|
|
"grad_norm": 1.481767434298922,
|
|
"learning_rate": 2.4748647325150966e-06,
|
|
"loss": 0.4135,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.6782854451248234,
|
|
"eval_loss": 0.43108630180358887,
|
|
"eval_runtime": 1581.7954,
|
|
"eval_samples_per_second": 15.811,
|
|
"eval_steps_per_second": 0.494,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.6788237669066685,
|
|
"grad_norm": 1.491812080960543,
|
|
"learning_rate": 2.467340761134242e-06,
|
|
"loss": 0.4392,
|
|
"step": 1261
|
|
},
|
|
{
|
|
"epoch": 0.6793620886885136,
|
|
"grad_norm": 1.5403059882131847,
|
|
"learning_rate": 2.459824496107662e-06,
|
|
"loss": 0.4631,
|
|
"step": 1262
|
|
},
|
|
{
|
|
"epoch": 0.6799004104703587,
|
|
"grad_norm": 1.4488066174399352,
|
|
"learning_rate": 2.4523159603057858e-06,
|
|
"loss": 0.4401,
|
|
"step": 1263
|
|
},
|
|
{
|
|
"epoch": 0.6804387322522037,
|
|
"grad_norm": 1.6997928715987718,
|
|
"learning_rate": 2.444815176575521e-06,
|
|
"loss": 0.4671,
|
|
"step": 1264
|
|
},
|
|
{
|
|
"epoch": 0.6809770540340488,
|
|
"grad_norm": 1.6242395825984155,
|
|
"learning_rate": 2.4373221677401916e-06,
|
|
"loss": 0.4227,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.6815153758158939,
|
|
"grad_norm": 1.3272959133305353,
|
|
"learning_rate": 2.429836956599463e-06,
|
|
"loss": 0.3586,
|
|
"step": 1266
|
|
},
|
|
{
|
|
"epoch": 0.682053697597739,
|
|
"grad_norm": 1.723455688742321,
|
|
"learning_rate": 2.422359565929268e-06,
|
|
"loss": 0.4275,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"epoch": 0.6825920193795841,
|
|
"grad_norm": 1.3911086482449566,
|
|
"learning_rate": 2.414890018481752e-06,
|
|
"loss": 0.4383,
|
|
"step": 1268
|
|
},
|
|
{
|
|
"epoch": 0.6831303411614292,
|
|
"grad_norm": 1.515918050738459,
|
|
"learning_rate": 2.40742833698519e-06,
|
|
"loss": 0.4342,
|
|
"step": 1269
|
|
},
|
|
{
|
|
"epoch": 0.6836686629432743,
|
|
"grad_norm": 1.6928322026664087,
|
|
"learning_rate": 2.3999745441439243e-06,
|
|
"loss": 0.4156,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.6842069847251194,
|
|
"grad_norm": 1.3632558682947689,
|
|
"learning_rate": 2.3925286626382926e-06,
|
|
"loss": 0.3914,
|
|
"step": 1271
|
|
},
|
|
{
|
|
"epoch": 0.6847453065069645,
|
|
"grad_norm": 3.139130094162036,
|
|
"learning_rate": 2.385090715124562e-06,
|
|
"loss": 0.4637,
|
|
"step": 1272
|
|
},
|
|
{
|
|
"epoch": 0.6852836282888096,
|
|
"grad_norm": 1.434440598705869,
|
|
"learning_rate": 2.3776607242348547e-06,
|
|
"loss": 0.437,
|
|
"step": 1273
|
|
},
|
|
{
|
|
"epoch": 0.6858219500706547,
|
|
"grad_norm": 1.5144260531076574,
|
|
"learning_rate": 2.3702387125770882e-06,
|
|
"loss": 0.4234,
|
|
"step": 1274
|
|
},
|
|
{
|
|
"epoch": 0.6863602718524998,
|
|
"grad_norm": 1.693660818176695,
|
|
"learning_rate": 2.362824702734893e-06,
|
|
"loss": 0.4164,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.6868985936343449,
|
|
"grad_norm": 1.3894626651308215,
|
|
"learning_rate": 2.355418717267558e-06,
|
|
"loss": 0.4221,
|
|
"step": 1276
|
|
},
|
|
{
|
|
"epoch": 0.68743691541619,
|
|
"grad_norm": 1.697033782203384,
|
|
"learning_rate": 2.3480207787099534e-06,
|
|
"loss": 0.4383,
|
|
"step": 1277
|
|
},
|
|
{
|
|
"epoch": 0.6879752371980351,
|
|
"grad_norm": 1.4858347246883488,
|
|
"learning_rate": 2.340630909572465e-06,
|
|
"loss": 0.4265,
|
|
"step": 1278
|
|
},
|
|
{
|
|
"epoch": 0.6885135589798802,
|
|
"grad_norm": 1.500359176091357,
|
|
"learning_rate": 2.3332491323409234e-06,
|
|
"loss": 0.4481,
|
|
"step": 1279
|
|
},
|
|
{
|
|
"epoch": 0.6890518807617253,
|
|
"grad_norm": 1.5297356725220441,
|
|
"learning_rate": 2.32587546947654e-06,
|
|
"loss": 0.4348,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.6895902025435704,
|
|
"grad_norm": 2.508398158502729,
|
|
"learning_rate": 2.3185099434158352e-06,
|
|
"loss": 0.4437,
|
|
"step": 1281
|
|
},
|
|
{
|
|
"epoch": 0.6901285243254155,
|
|
"grad_norm": 1.523641981004582,
|
|
"learning_rate": 2.311152576570566e-06,
|
|
"loss": 0.4575,
|
|
"step": 1282
|
|
},
|
|
{
|
|
"epoch": 0.6906668461072606,
|
|
"grad_norm": 1.6114434265747755,
|
|
"learning_rate": 2.303803391327669e-06,
|
|
"loss": 0.4378,
|
|
"step": 1283
|
|
},
|
|
{
|
|
"epoch": 0.6912051678891057,
|
|
"grad_norm": 1.4928444150803868,
|
|
"learning_rate": 2.296462410049183e-06,
|
|
"loss": 0.4411,
|
|
"step": 1284
|
|
},
|
|
{
|
|
"epoch": 0.6917434896709508,
|
|
"grad_norm": 1.5345549032626111,
|
|
"learning_rate": 2.289129655072185e-06,
|
|
"loss": 0.4324,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.6922818114527959,
|
|
"grad_norm": 1.4298368477097725,
|
|
"learning_rate": 2.2818051487087183e-06,
|
|
"loss": 0.426,
|
|
"step": 1286
|
|
},
|
|
{
|
|
"epoch": 0.692820133234641,
|
|
"grad_norm": 1.8725369506254443,
|
|
"learning_rate": 2.2744889132457314e-06,
|
|
"loss": 0.4541,
|
|
"step": 1287
|
|
},
|
|
{
|
|
"epoch": 0.6933584550164861,
|
|
"grad_norm": 1.77702449875276,
|
|
"learning_rate": 2.267180970945003e-06,
|
|
"loss": 0.432,
|
|
"step": 1288
|
|
},
|
|
{
|
|
"epoch": 0.6938967767983312,
|
|
"grad_norm": 1.4563290123647166,
|
|
"learning_rate": 2.259881344043081e-06,
|
|
"loss": 0.3832,
|
|
"step": 1289
|
|
},
|
|
{
|
|
"epoch": 0.6944350985801763,
|
|
"grad_norm": 1.3449801230990073,
|
|
"learning_rate": 2.252590054751205e-06,
|
|
"loss": 0.3962,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.6949734203620214,
|
|
"grad_norm": 1.8854534900995603,
|
|
"learning_rate": 2.2453071252552515e-06,
|
|
"loss": 0.4807,
|
|
"step": 1291
|
|
},
|
|
{
|
|
"epoch": 0.6955117421438665,
|
|
"grad_norm": 1.762423954535133,
|
|
"learning_rate": 2.238032577715656e-06,
|
|
"loss": 0.384,
|
|
"step": 1292
|
|
},
|
|
{
|
|
"epoch": 0.6960500639257116,
|
|
"grad_norm": 1.476803369543656,
|
|
"learning_rate": 2.2307664342673506e-06,
|
|
"loss": 0.4539,
|
|
"step": 1293
|
|
},
|
|
{
|
|
"epoch": 0.6965883857075567,
|
|
"grad_norm": 1.4854619250041479,
|
|
"learning_rate": 2.2235087170196966e-06,
|
|
"loss": 0.4396,
|
|
"step": 1294
|
|
},
|
|
{
|
|
"epoch": 0.6971267074894018,
|
|
"grad_norm": 1.41098403179678,
|
|
"learning_rate": 2.2162594480564155e-06,
|
|
"loss": 0.4005,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.6976650292712469,
|
|
"grad_norm": 1.2989632950912373,
|
|
"learning_rate": 2.2090186494355203e-06,
|
|
"loss": 0.4151,
|
|
"step": 1296
|
|
},
|
|
{
|
|
"epoch": 0.698203351053092,
|
|
"grad_norm": 1.6133874577700047,
|
|
"learning_rate": 2.2017863431892534e-06,
|
|
"loss": 0.4285,
|
|
"step": 1297
|
|
},
|
|
{
|
|
"epoch": 0.698741672834937,
|
|
"grad_norm": 1.333799397613619,
|
|
"learning_rate": 2.1945625513240154e-06,
|
|
"loss": 0.4041,
|
|
"step": 1298
|
|
},
|
|
{
|
|
"epoch": 0.6992799946167821,
|
|
"grad_norm": 1.4390186504294415,
|
|
"learning_rate": 2.1873472958202997e-06,
|
|
"loss": 0.4365,
|
|
"step": 1299
|
|
},
|
|
{
|
|
"epoch": 0.6998183163986272,
|
|
"grad_norm": 1.2866738586576456,
|
|
"learning_rate": 2.1801405986326245e-06,
|
|
"loss": 0.4665,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.7003566381804723,
|
|
"grad_norm": 2.2273828713275865,
|
|
"learning_rate": 2.1729424816894685e-06,
|
|
"loss": 0.4564,
|
|
"step": 1301
|
|
},
|
|
{
|
|
"epoch": 0.7008949599623174,
|
|
"grad_norm": 1.4546138888578992,
|
|
"learning_rate": 2.165752966893203e-06,
|
|
"loss": 0.4051,
|
|
"step": 1302
|
|
},
|
|
{
|
|
"epoch": 0.7014332817441625,
|
|
"grad_norm": 1.3514329197218915,
|
|
"learning_rate": 2.158572076120019e-06,
|
|
"loss": 0.4154,
|
|
"step": 1303
|
|
},
|
|
{
|
|
"epoch": 0.7019716035260076,
|
|
"grad_norm": 1.3870510485604055,
|
|
"learning_rate": 2.1513998312198734e-06,
|
|
"loss": 0.4269,
|
|
"step": 1304
|
|
},
|
|
{
|
|
"epoch": 0.7025099253078527,
|
|
"grad_norm": 1.6439661727082362,
|
|
"learning_rate": 2.1442362540164123e-06,
|
|
"loss": 0.4472,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.7030482470896978,
|
|
"grad_norm": 2.036208978375709,
|
|
"learning_rate": 2.1370813663069086e-06,
|
|
"loss": 0.4952,
|
|
"step": 1306
|
|
},
|
|
{
|
|
"epoch": 0.7035865688715429,
|
|
"grad_norm": 1.4306434260587932,
|
|
"learning_rate": 2.1299351898621938e-06,
|
|
"loss": 0.3815,
|
|
"step": 1307
|
|
},
|
|
{
|
|
"epoch": 0.704124890653388,
|
|
"grad_norm": 1.5518498802370642,
|
|
"learning_rate": 2.122797746426595e-06,
|
|
"loss": 0.4656,
|
|
"step": 1308
|
|
},
|
|
{
|
|
"epoch": 0.7046632124352331,
|
|
"grad_norm": 1.353149193018473,
|
|
"learning_rate": 2.1156690577178657e-06,
|
|
"loss": 0.4414,
|
|
"step": 1309
|
|
},
|
|
{
|
|
"epoch": 0.7052015342170782,
|
|
"grad_norm": 1.3081505827837419,
|
|
"learning_rate": 2.108549145427117e-06,
|
|
"loss": 0.4355,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.7057398559989233,
|
|
"grad_norm": 1.5741831120177514,
|
|
"learning_rate": 2.1014380312187593e-06,
|
|
"loss": 0.4396,
|
|
"step": 1311
|
|
},
|
|
{
|
|
"epoch": 0.7062781777807684,
|
|
"grad_norm": 1.5628460516936316,
|
|
"learning_rate": 2.094335736730433e-06,
|
|
"loss": 0.3687,
|
|
"step": 1312
|
|
},
|
|
{
|
|
"epoch": 0.7068164995626135,
|
|
"grad_norm": 3.0284027392779986,
|
|
"learning_rate": 2.0872422835729384e-06,
|
|
"loss": 0.4463,
|
|
"step": 1313
|
|
},
|
|
{
|
|
"epoch": 0.7073548213444586,
|
|
"grad_norm": 1.3447501399327724,
|
|
"learning_rate": 2.0801576933301757e-06,
|
|
"loss": 0.4371,
|
|
"step": 1314
|
|
},
|
|
{
|
|
"epoch": 0.7078931431263038,
|
|
"grad_norm": 1.8116776445346612,
|
|
"learning_rate": 2.073081987559077e-06,
|
|
"loss": 0.4109,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.7084314649081489,
|
|
"grad_norm": 1.571648134209876,
|
|
"learning_rate": 2.06601518778954e-06,
|
|
"loss": 0.432,
|
|
"step": 1316
|
|
},
|
|
{
|
|
"epoch": 0.708969786689994,
|
|
"grad_norm": 1.596166756734421,
|
|
"learning_rate": 2.0589573155243663e-06,
|
|
"loss": 0.4291,
|
|
"step": 1317
|
|
},
|
|
{
|
|
"epoch": 0.7095081084718391,
|
|
"grad_norm": 1.4446289087866433,
|
|
"learning_rate": 2.051908392239186e-06,
|
|
"loss": 0.4094,
|
|
"step": 1318
|
|
},
|
|
{
|
|
"epoch": 0.7100464302536842,
|
|
"grad_norm": 1.377063116073787,
|
|
"learning_rate": 2.044868439382406e-06,
|
|
"loss": 0.4696,
|
|
"step": 1319
|
|
},
|
|
{
|
|
"epoch": 0.7105847520355293,
|
|
"grad_norm": 1.3694098512093758,
|
|
"learning_rate": 2.0378374783751352e-06,
|
|
"loss": 0.402,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.7105847520355293,
|
|
"eval_loss": 0.4282020330429077,
|
|
"eval_runtime": 1515.7705,
|
|
"eval_samples_per_second": 16.5,
|
|
"eval_steps_per_second": 0.516,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.7111230738173744,
|
|
"grad_norm": 1.929826065439873,
|
|
"learning_rate": 2.030815530611123e-06,
|
|
"loss": 0.4159,
|
|
"step": 1321
|
|
},
|
|
{
|
|
"epoch": 0.7116613955992195,
|
|
"grad_norm": 1.4082500795847726,
|
|
"learning_rate": 2.023802617456694e-06,
|
|
"loss": 0.3941,
|
|
"step": 1322
|
|
},
|
|
{
|
|
"epoch": 0.7121997173810646,
|
|
"grad_norm": 1.8816103595399847,
|
|
"learning_rate": 2.01679876025068e-06,
|
|
"loss": 0.4244,
|
|
"step": 1323
|
|
},
|
|
{
|
|
"epoch": 0.7127380391629097,
|
|
"grad_norm": 1.5683369901785116,
|
|
"learning_rate": 2.0098039803043612e-06,
|
|
"loss": 0.4332,
|
|
"step": 1324
|
|
},
|
|
{
|
|
"epoch": 0.7132763609447548,
|
|
"grad_norm": 1.4453103994083734,
|
|
"learning_rate": 2.0028182989013923e-06,
|
|
"loss": 0.3945,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.7138146827265999,
|
|
"grad_norm": 1.6267798252157584,
|
|
"learning_rate": 1.9958417372977474e-06,
|
|
"loss": 0.4528,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"epoch": 0.714353004508445,
|
|
"grad_norm": 1.6214655041789812,
|
|
"learning_rate": 1.9888743167216493e-06,
|
|
"loss": 0.4074,
|
|
"step": 1327
|
|
},
|
|
{
|
|
"epoch": 0.7148913262902901,
|
|
"grad_norm": 1.8595682807437428,
|
|
"learning_rate": 1.9819160583735077e-06,
|
|
"loss": 0.4494,
|
|
"step": 1328
|
|
},
|
|
{
|
|
"epoch": 0.7154296480721352,
|
|
"grad_norm": 1.4662467013475076,
|
|
"learning_rate": 1.974966983425852e-06,
|
|
"loss": 0.4066,
|
|
"step": 1329
|
|
},
|
|
{
|
|
"epoch": 0.7159679698539803,
|
|
"grad_norm": 2.5261174973160716,
|
|
"learning_rate": 1.9680271130232693e-06,
|
|
"loss": 0.4394,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.7165062916358254,
|
|
"grad_norm": 1.8084272539130577,
|
|
"learning_rate": 1.9610964682823407e-06,
|
|
"loss": 0.4601,
|
|
"step": 1331
|
|
},
|
|
{
|
|
"epoch": 0.7170446134176705,
|
|
"grad_norm": 1.820018846201368,
|
|
"learning_rate": 1.9541750702915706e-06,
|
|
"loss": 0.4446,
|
|
"step": 1332
|
|
},
|
|
{
|
|
"epoch": 0.7175829351995155,
|
|
"grad_norm": 1.3923517314522877,
|
|
"learning_rate": 1.9472629401113325e-06,
|
|
"loss": 0.3857,
|
|
"step": 1333
|
|
},
|
|
{
|
|
"epoch": 0.7181212569813606,
|
|
"grad_norm": 1.527238991242769,
|
|
"learning_rate": 1.9403600987737976e-06,
|
|
"loss": 0.4381,
|
|
"step": 1334
|
|
},
|
|
{
|
|
"epoch": 0.7186595787632057,
|
|
"grad_norm": 1.4006251254778943,
|
|
"learning_rate": 1.9334665672828736e-06,
|
|
"loss": 0.4332,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.7191979005450508,
|
|
"grad_norm": 2.1367769390904,
|
|
"learning_rate": 1.926582366614141e-06,
|
|
"loss": 0.4331,
|
|
"step": 1336
|
|
},
|
|
{
|
|
"epoch": 0.7197362223268959,
|
|
"grad_norm": 1.661348731930383,
|
|
"learning_rate": 1.9197075177147866e-06,
|
|
"loss": 0.4877,
|
|
"step": 1337
|
|
},
|
|
{
|
|
"epoch": 0.720274544108741,
|
|
"grad_norm": 1.4928525414429736,
|
|
"learning_rate": 1.9128420415035442e-06,
|
|
"loss": 0.4239,
|
|
"step": 1338
|
|
},
|
|
{
|
|
"epoch": 0.7208128658905861,
|
|
"grad_norm": 1.533499882863047,
|
|
"learning_rate": 1.9059859588706287e-06,
|
|
"loss": 0.3951,
|
|
"step": 1339
|
|
},
|
|
{
|
|
"epoch": 0.7213511876724312,
|
|
"grad_norm": 1.8392687775713348,
|
|
"learning_rate": 1.8991392906776668e-06,
|
|
"loss": 0.4395,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.7218895094542763,
|
|
"grad_norm": 1.573889490157054,
|
|
"learning_rate": 1.8923020577576452e-06,
|
|
"loss": 0.4162,
|
|
"step": 1341
|
|
},
|
|
{
|
|
"epoch": 0.7224278312361214,
|
|
"grad_norm": 1.5526149616819422,
|
|
"learning_rate": 1.885474280914838e-06,
|
|
"loss": 0.4579,
|
|
"step": 1342
|
|
},
|
|
{
|
|
"epoch": 0.7229661530179665,
|
|
"grad_norm": 1.5191810245344743,
|
|
"learning_rate": 1.8786559809247485e-06,
|
|
"loss": 0.4216,
|
|
"step": 1343
|
|
},
|
|
{
|
|
"epoch": 0.7235044747998116,
|
|
"grad_norm": 1.5555786435185341,
|
|
"learning_rate": 1.8718471785340414e-06,
|
|
"loss": 0.4122,
|
|
"step": 1344
|
|
},
|
|
{
|
|
"epoch": 0.7240427965816567,
|
|
"grad_norm": 1.3557551585285899,
|
|
"learning_rate": 1.8650478944604844e-06,
|
|
"loss": 0.3932,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.7245811183635018,
|
|
"grad_norm": 1.4728885839955113,
|
|
"learning_rate": 1.8582581493928837e-06,
|
|
"loss": 0.4934,
|
|
"step": 1346
|
|
},
|
|
{
|
|
"epoch": 0.7251194401453469,
|
|
"grad_norm": 1.5560703862712066,
|
|
"learning_rate": 1.8514779639910152e-06,
|
|
"loss": 0.4565,
|
|
"step": 1347
|
|
},
|
|
{
|
|
"epoch": 0.725657761927192,
|
|
"grad_norm": 1.4005810948444959,
|
|
"learning_rate": 1.8447073588855707e-06,
|
|
"loss": 0.45,
|
|
"step": 1348
|
|
},
|
|
{
|
|
"epoch": 0.7261960837090371,
|
|
"grad_norm": 1.4372886671511238,
|
|
"learning_rate": 1.8379463546780923e-06,
|
|
"loss": 0.4076,
|
|
"step": 1349
|
|
},
|
|
{
|
|
"epoch": 0.7267344054908822,
|
|
"grad_norm": 1.3561213817272149,
|
|
"learning_rate": 1.8311949719409056e-06,
|
|
"loss": 0.3991,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.7272727272727273,
|
|
"grad_norm": 1.592180627183088,
|
|
"learning_rate": 1.824453231217062e-06,
|
|
"loss": 0.4395,
|
|
"step": 1351
|
|
},
|
|
{
|
|
"epoch": 0.7278110490545724,
|
|
"grad_norm": 1.674234401633556,
|
|
"learning_rate": 1.8177211530202733e-06,
|
|
"loss": 0.5076,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"epoch": 0.7283493708364175,
|
|
"grad_norm": 1.3869830990008478,
|
|
"learning_rate": 1.8109987578348504e-06,
|
|
"loss": 0.3823,
|
|
"step": 1353
|
|
},
|
|
{
|
|
"epoch": 0.7288876926182626,
|
|
"grad_norm": 1.8958736579636137,
|
|
"learning_rate": 1.8042860661156425e-06,
|
|
"loss": 0.4283,
|
|
"step": 1354
|
|
},
|
|
{
|
|
"epoch": 0.7294260144001077,
|
|
"grad_norm": 2.277391563720137,
|
|
"learning_rate": 1.7975830982879688e-06,
|
|
"loss": 0.4344,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.7299643361819528,
|
|
"grad_norm": 1.3788436987213148,
|
|
"learning_rate": 1.7908898747475656e-06,
|
|
"loss": 0.42,
|
|
"step": 1356
|
|
},
|
|
{
|
|
"epoch": 0.7305026579637979,
|
|
"grad_norm": 1.472584181988221,
|
|
"learning_rate": 1.784206415860516e-06,
|
|
"loss": 0.4554,
|
|
"step": 1357
|
|
},
|
|
{
|
|
"epoch": 0.731040979745643,
|
|
"grad_norm": 1.441497867695086,
|
|
"learning_rate": 1.7775327419631938e-06,
|
|
"loss": 0.3914,
|
|
"step": 1358
|
|
},
|
|
{
|
|
"epoch": 0.7315793015274881,
|
|
"grad_norm": 1.413962400530734,
|
|
"learning_rate": 1.7708688733621971e-06,
|
|
"loss": 0.4271,
|
|
"step": 1359
|
|
},
|
|
{
|
|
"epoch": 0.7321176233093332,
|
|
"grad_norm": 1.467777866704718,
|
|
"learning_rate": 1.7642148303342894e-06,
|
|
"loss": 0.4613,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.7326559450911783,
|
|
"grad_norm": 1.4588809601870538,
|
|
"learning_rate": 1.7575706331263392e-06,
|
|
"loss": 0.3732,
|
|
"step": 1361
|
|
},
|
|
{
|
|
"epoch": 0.7331942668730234,
|
|
"grad_norm": 1.9984141502445067,
|
|
"learning_rate": 1.7509363019552506e-06,
|
|
"loss": 0.4337,
|
|
"step": 1362
|
|
},
|
|
{
|
|
"epoch": 0.7337325886548685,
|
|
"grad_norm": 1.7211596185425657,
|
|
"learning_rate": 1.744311857007912e-06,
|
|
"loss": 0.4237,
|
|
"step": 1363
|
|
},
|
|
{
|
|
"epoch": 0.7342709104367136,
|
|
"grad_norm": 1.3275340316554045,
|
|
"learning_rate": 1.7376973184411294e-06,
|
|
"loss": 0.4026,
|
|
"step": 1364
|
|
},
|
|
{
|
|
"epoch": 0.7348092322185587,
|
|
"grad_norm": 1.3704150312314805,
|
|
"learning_rate": 1.7310927063815647e-06,
|
|
"loss": 0.4221,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.7353475540004037,
|
|
"grad_norm": 1.6240778919766734,
|
|
"learning_rate": 1.7244980409256768e-06,
|
|
"loss": 0.3956,
|
|
"step": 1366
|
|
},
|
|
{
|
|
"epoch": 0.7358858757822488,
|
|
"grad_norm": 1.5916150137066967,
|
|
"learning_rate": 1.7179133421396571e-06,
|
|
"loss": 0.449,
|
|
"step": 1367
|
|
},
|
|
{
|
|
"epoch": 0.7364241975640939,
|
|
"grad_norm": 1.3674325981426028,
|
|
"learning_rate": 1.7113386300593749e-06,
|
|
"loss": 0.469,
|
|
"step": 1368
|
|
},
|
|
{
|
|
"epoch": 0.736962519345939,
|
|
"grad_norm": 1.823579935483228,
|
|
"learning_rate": 1.7047739246903044e-06,
|
|
"loss": 0.4256,
|
|
"step": 1369
|
|
},
|
|
{
|
|
"epoch": 0.7375008411277841,
|
|
"grad_norm": 1.5992570631473233,
|
|
"learning_rate": 1.6982192460074787e-06,
|
|
"loss": 0.4364,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.7380391629096292,
|
|
"grad_norm": 1.83556587779534,
|
|
"learning_rate": 1.6916746139554186e-06,
|
|
"loss": 0.462,
|
|
"step": 1371
|
|
},
|
|
{
|
|
"epoch": 0.7385774846914743,
|
|
"grad_norm": 1.63962319033326,
|
|
"learning_rate": 1.6851400484480757e-06,
|
|
"loss": 0.4647,
|
|
"step": 1372
|
|
},
|
|
{
|
|
"epoch": 0.7391158064733194,
|
|
"grad_norm": 1.489565256988372,
|
|
"learning_rate": 1.6786155693687712e-06,
|
|
"loss": 0.4391,
|
|
"step": 1373
|
|
},
|
|
{
|
|
"epoch": 0.7396541282551645,
|
|
"grad_norm": 1.8781762497357959,
|
|
"learning_rate": 1.6721011965701344e-06,
|
|
"loss": 0.4429,
|
|
"step": 1374
|
|
},
|
|
{
|
|
"epoch": 0.7401924500370096,
|
|
"grad_norm": 1.394724821422672,
|
|
"learning_rate": 1.6655969498740455e-06,
|
|
"loss": 0.3781,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.7407307718188547,
|
|
"grad_norm": 1.7954529740174663,
|
|
"learning_rate": 1.6591028490715722e-06,
|
|
"loss": 0.4437,
|
|
"step": 1376
|
|
},
|
|
{
|
|
"epoch": 0.7412690936006998,
|
|
"grad_norm": 1.5625366322113399,
|
|
"learning_rate": 1.6526189139229072e-06,
|
|
"loss": 0.4221,
|
|
"step": 1377
|
|
},
|
|
{
|
|
"epoch": 0.7418074153825449,
|
|
"grad_norm": 1.49000718617141,
|
|
"learning_rate": 1.6461451641573156e-06,
|
|
"loss": 0.3824,
|
|
"step": 1378
|
|
},
|
|
{
|
|
"epoch": 0.74234573716439,
|
|
"grad_norm": 1.5501486593751905,
|
|
"learning_rate": 1.639681619473069e-06,
|
|
"loss": 0.4316,
|
|
"step": 1379
|
|
},
|
|
{
|
|
"epoch": 0.7428840589462351,
|
|
"grad_norm": 1.6012264627466746,
|
|
"learning_rate": 1.6332282995373867e-06,
|
|
"loss": 0.4414,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.7428840589462351,
|
|
"eval_loss": 0.4260067939758301,
|
|
"eval_runtime": 1520.5135,
|
|
"eval_samples_per_second": 16.448,
|
|
"eval_steps_per_second": 0.514,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.7434223807280802,
|
|
"grad_norm": 1.3868379821786618,
|
|
"learning_rate": 1.6267852239863763e-06,
|
|
"loss": 0.3962,
|
|
"step": 1381
|
|
},
|
|
{
|
|
"epoch": 0.7439607025099253,
|
|
"grad_norm": 1.563201406467786,
|
|
"learning_rate": 1.6203524124249742e-06,
|
|
"loss": 0.4359,
|
|
"step": 1382
|
|
},
|
|
{
|
|
"epoch": 0.7444990242917704,
|
|
"grad_norm": 2.0744885451879895,
|
|
"learning_rate": 1.613929884426887e-06,
|
|
"loss": 0.472,
|
|
"step": 1383
|
|
},
|
|
{
|
|
"epoch": 0.7450373460736155,
|
|
"grad_norm": 1.7165383734256863,
|
|
"learning_rate": 1.607517659534526e-06,
|
|
"loss": 0.4449,
|
|
"step": 1384
|
|
},
|
|
{
|
|
"epoch": 0.7455756678554606,
|
|
"grad_norm": 1.420966932605389,
|
|
"learning_rate": 1.6011157572589565e-06,
|
|
"loss": 0.4594,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.7461139896373057,
|
|
"grad_norm": 1.3843843466818937,
|
|
"learning_rate": 1.5947241970798332e-06,
|
|
"loss": 0.4021,
|
|
"step": 1386
|
|
},
|
|
{
|
|
"epoch": 0.7466523114191508,
|
|
"grad_norm": 2.021869994898455,
|
|
"learning_rate": 1.588342998445342e-06,
|
|
"loss": 0.4973,
|
|
"step": 1387
|
|
},
|
|
{
|
|
"epoch": 0.7471906332009959,
|
|
"grad_norm": 1.6308202289723368,
|
|
"learning_rate": 1.58197218077214e-06,
|
|
"loss": 0.4448,
|
|
"step": 1388
|
|
},
|
|
{
|
|
"epoch": 0.747728954982841,
|
|
"grad_norm": 1.5609319044422376,
|
|
"learning_rate": 1.5756117634452977e-06,
|
|
"loss": 0.4512,
|
|
"step": 1389
|
|
},
|
|
{
|
|
"epoch": 0.7482672767646861,
|
|
"grad_norm": 1.3798571945954525,
|
|
"learning_rate": 1.5692617658182402e-06,
|
|
"loss": 0.4332,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.7488055985465312,
|
|
"grad_norm": 1.5464889993436788,
|
|
"learning_rate": 1.5629222072126888e-06,
|
|
"loss": 0.4716,
|
|
"step": 1391
|
|
},
|
|
{
|
|
"epoch": 0.7493439203283763,
|
|
"grad_norm": 1.7517747662085987,
|
|
"learning_rate": 1.5565931069185946e-06,
|
|
"loss": 0.4305,
|
|
"step": 1392
|
|
},
|
|
{
|
|
"epoch": 0.7498822421102214,
|
|
"grad_norm": 1.5029346054542445,
|
|
"learning_rate": 1.5502744841940936e-06,
|
|
"loss": 0.4657,
|
|
"step": 1393
|
|
},
|
|
{
|
|
"epoch": 0.7504205638920665,
|
|
"grad_norm": 1.3544718143048395,
|
|
"learning_rate": 1.543966358265438e-06,
|
|
"loss": 0.418,
|
|
"step": 1394
|
|
},
|
|
{
|
|
"epoch": 0.7509588856739116,
|
|
"grad_norm": 1.52275975192662,
|
|
"learning_rate": 1.5376687483269404e-06,
|
|
"loss": 0.3732,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.7514972074557567,
|
|
"grad_norm": 1.691512607761959,
|
|
"learning_rate": 1.5313816735409148e-06,
|
|
"loss": 0.4606,
|
|
"step": 1396
|
|
},
|
|
{
|
|
"epoch": 0.7520355292376018,
|
|
"grad_norm": 1.6421517222533963,
|
|
"learning_rate": 1.5251051530376199e-06,
|
|
"loss": 0.413,
|
|
"step": 1397
|
|
},
|
|
{
|
|
"epoch": 0.7525738510194468,
|
|
"grad_norm": 1.7994036447279773,
|
|
"learning_rate": 1.518839205915202e-06,
|
|
"loss": 0.4167,
|
|
"step": 1398
|
|
},
|
|
{
|
|
"epoch": 0.753112172801292,
|
|
"grad_norm": 1.4116743542426848,
|
|
"learning_rate": 1.5125838512396278e-06,
|
|
"loss": 0.4502,
|
|
"step": 1399
|
|
},
|
|
{
|
|
"epoch": 0.753650494583137,
|
|
"grad_norm": 2.9318193198163414,
|
|
"learning_rate": 1.5063391080446404e-06,
|
|
"loss": 0.4523,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.7541888163649821,
|
|
"grad_norm": 1.3582596783082035,
|
|
"learning_rate": 1.500104995331692e-06,
|
|
"loss": 0.3758,
|
|
"step": 1401
|
|
},
|
|
{
|
|
"epoch": 0.7547271381468272,
|
|
"grad_norm": 2.1921211591651435,
|
|
"learning_rate": 1.493881532069889e-06,
|
|
"loss": 0.4725,
|
|
"step": 1402
|
|
},
|
|
{
|
|
"epoch": 0.7552654599286723,
|
|
"grad_norm": 1.5078767590789557,
|
|
"learning_rate": 1.487668737195932e-06,
|
|
"loss": 0.4137,
|
|
"step": 1403
|
|
},
|
|
{
|
|
"epoch": 0.7558037817105174,
|
|
"grad_norm": 1.7747344554372293,
|
|
"learning_rate": 1.4814666296140617e-06,
|
|
"loss": 0.4519,
|
|
"step": 1404
|
|
},
|
|
{
|
|
"epoch": 0.7563421034923625,
|
|
"grad_norm": 1.4869616706516326,
|
|
"learning_rate": 1.4752752281960003e-06,
|
|
"loss": 0.3805,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.7568804252742076,
|
|
"grad_norm": 1.688795973706041,
|
|
"learning_rate": 1.4690945517808897e-06,
|
|
"loss": 0.4993,
|
|
"step": 1406
|
|
},
|
|
{
|
|
"epoch": 0.7574187470560527,
|
|
"grad_norm": 1.583736337415557,
|
|
"learning_rate": 1.4629246191752406e-06,
|
|
"loss": 0.4382,
|
|
"step": 1407
|
|
},
|
|
{
|
|
"epoch": 0.7579570688378978,
|
|
"grad_norm": 1.405921968173557,
|
|
"learning_rate": 1.4567654491528732e-06,
|
|
"loss": 0.3952,
|
|
"step": 1408
|
|
},
|
|
{
|
|
"epoch": 0.7584953906197429,
|
|
"grad_norm": 1.3449184128012615,
|
|
"learning_rate": 1.4506170604548575e-06,
|
|
"loss": 0.4443,
|
|
"step": 1409
|
|
},
|
|
{
|
|
"epoch": 0.759033712401588,
|
|
"grad_norm": 1.5849926738123288,
|
|
"learning_rate": 1.4444794717894596e-06,
|
|
"loss": 0.4131,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.7595720341834331,
|
|
"grad_norm": 1.6555281403636608,
|
|
"learning_rate": 1.4383527018320825e-06,
|
|
"loss": 0.4414,
|
|
"step": 1411
|
|
},
|
|
{
|
|
"epoch": 0.7601103559652782,
|
|
"grad_norm": 1.6263621942357136,
|
|
"learning_rate": 1.432236769225211e-06,
|
|
"loss": 0.4346,
|
|
"step": 1412
|
|
},
|
|
{
|
|
"epoch": 0.7606486777471233,
|
|
"grad_norm": 2.0460094225135044,
|
|
"learning_rate": 1.426131692578354e-06,
|
|
"loss": 0.4493,
|
|
"step": 1413
|
|
},
|
|
{
|
|
"epoch": 0.7611869995289684,
|
|
"grad_norm": 1.472378438798274,
|
|
"learning_rate": 1.4200374904679853e-06,
|
|
"loss": 0.4562,
|
|
"step": 1414
|
|
},
|
|
{
|
|
"epoch": 0.7617253213108135,
|
|
"grad_norm": 1.7242311556580157,
|
|
"learning_rate": 1.413954181437493e-06,
|
|
"loss": 0.4043,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.7622636430926586,
|
|
"grad_norm": 1.6120964716761355,
|
|
"learning_rate": 1.4078817839971193e-06,
|
|
"loss": 0.4815,
|
|
"step": 1416
|
|
},
|
|
{
|
|
"epoch": 0.7628019648745037,
|
|
"grad_norm": 2.00633033152504,
|
|
"learning_rate": 1.4018203166239032e-06,
|
|
"loss": 0.5084,
|
|
"step": 1417
|
|
},
|
|
{
|
|
"epoch": 0.7633402866563488,
|
|
"grad_norm": 1.593451139015103,
|
|
"learning_rate": 1.3957697977616275e-06,
|
|
"loss": 0.4089,
|
|
"step": 1418
|
|
},
|
|
{
|
|
"epoch": 0.7638786084381939,
|
|
"grad_norm": 1.520947317999593,
|
|
"learning_rate": 1.38973024582076e-06,
|
|
"loss": 0.4204,
|
|
"step": 1419
|
|
},
|
|
{
|
|
"epoch": 0.764416930220039,
|
|
"grad_norm": 1.5671907812915762,
|
|
"learning_rate": 1.3837016791784002e-06,
|
|
"loss": 0.4011,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.7649552520018841,
|
|
"grad_norm": 2.3136360187940435,
|
|
"learning_rate": 1.3776841161782174e-06,
|
|
"loss": 0.5217,
|
|
"step": 1421
|
|
},
|
|
{
|
|
"epoch": 0.7654935737837292,
|
|
"grad_norm": 1.6259616459954453,
|
|
"learning_rate": 1.3716775751304024e-06,
|
|
"loss": 0.4094,
|
|
"step": 1422
|
|
},
|
|
{
|
|
"epoch": 0.7660318955655743,
|
|
"grad_norm": 1.2851781752532265,
|
|
"learning_rate": 1.365682074311609e-06,
|
|
"loss": 0.4371,
|
|
"step": 1423
|
|
},
|
|
{
|
|
"epoch": 0.7665702173474194,
|
|
"grad_norm": 1.6356127807123704,
|
|
"learning_rate": 1.3596976319648957e-06,
|
|
"loss": 0.4305,
|
|
"step": 1424
|
|
},
|
|
{
|
|
"epoch": 0.7671085391292645,
|
|
"grad_norm": 1.7847217896835836,
|
|
"learning_rate": 1.3537242662996741e-06,
|
|
"loss": 0.4228,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.7676468609111096,
|
|
"grad_norm": 1.9347446509271482,
|
|
"learning_rate": 1.347761995491651e-06,
|
|
"loss": 0.3528,
|
|
"step": 1426
|
|
},
|
|
{
|
|
"epoch": 0.7681851826929547,
|
|
"grad_norm": 1.7975930657160712,
|
|
"learning_rate": 1.3418108376827738e-06,
|
|
"loss": 0.4782,
|
|
"step": 1427
|
|
},
|
|
{
|
|
"epoch": 0.7687235044747998,
|
|
"grad_norm": 1.4744627345322843,
|
|
"learning_rate": 1.3358708109811775e-06,
|
|
"loss": 0.3919,
|
|
"step": 1428
|
|
},
|
|
{
|
|
"epoch": 0.769261826256645,
|
|
"grad_norm": 2.7855979759464926,
|
|
"learning_rate": 1.3299419334611213e-06,
|
|
"loss": 0.4646,
|
|
"step": 1429
|
|
},
|
|
{
|
|
"epoch": 0.7698001480384901,
|
|
"grad_norm": 1.4805916259048137,
|
|
"learning_rate": 1.324024223162947e-06,
|
|
"loss": 0.3906,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.7703384698203352,
|
|
"grad_norm": 1.7443733531704324,
|
|
"learning_rate": 1.3181176980930133e-06,
|
|
"loss": 0.4046,
|
|
"step": 1431
|
|
},
|
|
{
|
|
"epoch": 0.7708767916021803,
|
|
"grad_norm": 1.3403811088010225,
|
|
"learning_rate": 1.3122223762236446e-06,
|
|
"loss": 0.4585,
|
|
"step": 1432
|
|
},
|
|
{
|
|
"epoch": 0.7714151133840254,
|
|
"grad_norm": 1.8083215069181602,
|
|
"learning_rate": 1.306338275493077e-06,
|
|
"loss": 0.4488,
|
|
"step": 1433
|
|
},
|
|
{
|
|
"epoch": 0.7719534351658704,
|
|
"grad_norm": 2.257570529751952,
|
|
"learning_rate": 1.3004654138054035e-06,
|
|
"loss": 0.4411,
|
|
"step": 1434
|
|
},
|
|
{
|
|
"epoch": 0.7724917569477155,
|
|
"grad_norm": 1.5282453915471157,
|
|
"learning_rate": 1.2946038090305186e-06,
|
|
"loss": 0.3982,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.7730300787295606,
|
|
"grad_norm": 1.3350543760395588,
|
|
"learning_rate": 1.2887534790040623e-06,
|
|
"loss": 0.3529,
|
|
"step": 1436
|
|
},
|
|
{
|
|
"epoch": 0.7735684005114057,
|
|
"grad_norm": 1.5872897107277366,
|
|
"learning_rate": 1.2829144415273703e-06,
|
|
"loss": 0.4175,
|
|
"step": 1437
|
|
},
|
|
{
|
|
"epoch": 0.7741067222932508,
|
|
"grad_norm": 1.461133941363055,
|
|
"learning_rate": 1.2770867143674176e-06,
|
|
"loss": 0.4225,
|
|
"step": 1438
|
|
},
|
|
{
|
|
"epoch": 0.7746450440750959,
|
|
"grad_norm": 1.977273812214763,
|
|
"learning_rate": 1.2712703152567634e-06,
|
|
"loss": 0.3955,
|
|
"step": 1439
|
|
},
|
|
{
|
|
"epoch": 0.775183365856941,
|
|
"grad_norm": 1.6743349069669249,
|
|
"learning_rate": 1.2654652618934977e-06,
|
|
"loss": 0.3861,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.775183365856941,
|
|
"eval_loss": 0.42436715960502625,
|
|
"eval_runtime": 1522.7354,
|
|
"eval_samples_per_second": 16.424,
|
|
"eval_steps_per_second": 0.514,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.7757216876387861,
|
|
"grad_norm": 1.499262565396223,
|
|
"learning_rate": 1.2596715719411877e-06,
|
|
"loss": 0.4024,
|
|
"step": 1441
|
|
},
|
|
{
|
|
"epoch": 0.7762600094206312,
|
|
"grad_norm": 1.6235233768215886,
|
|
"learning_rate": 1.253889263028827e-06,
|
|
"loss": 0.3789,
|
|
"step": 1442
|
|
},
|
|
{
|
|
"epoch": 0.7767983312024763,
|
|
"grad_norm": 1.4115144384917186,
|
|
"learning_rate": 1.2481183527507734e-06,
|
|
"loss": 0.4605,
|
|
"step": 1443
|
|
},
|
|
{
|
|
"epoch": 0.7773366529843214,
|
|
"grad_norm": 1.4061010836073027,
|
|
"learning_rate": 1.2423588586667058e-06,
|
|
"loss": 0.394,
|
|
"step": 1444
|
|
},
|
|
{
|
|
"epoch": 0.7778749747661665,
|
|
"grad_norm": 1.4756730352326592,
|
|
"learning_rate": 1.2366107983015636e-06,
|
|
"loss": 0.3997,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.7784132965480116,
|
|
"grad_norm": 1.7767670811956109,
|
|
"learning_rate": 1.2308741891454978e-06,
|
|
"loss": 0.4388,
|
|
"step": 1446
|
|
},
|
|
{
|
|
"epoch": 0.7789516183298567,
|
|
"grad_norm": 1.9567881229548667,
|
|
"learning_rate": 1.2251490486538143e-06,
|
|
"loss": 0.4457,
|
|
"step": 1447
|
|
},
|
|
{
|
|
"epoch": 0.7794899401117018,
|
|
"grad_norm": 1.7149877959759003,
|
|
"learning_rate": 1.2194353942469217e-06,
|
|
"loss": 0.4482,
|
|
"step": 1448
|
|
},
|
|
{
|
|
"epoch": 0.7800282618935469,
|
|
"grad_norm": 1.5521839437257912,
|
|
"learning_rate": 1.2137332433102806e-06,
|
|
"loss": 0.469,
|
|
"step": 1449
|
|
},
|
|
{
|
|
"epoch": 0.780566583675392,
|
|
"grad_norm": 2.688209146479993,
|
|
"learning_rate": 1.2080426131943496e-06,
|
|
"loss": 0.3849,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.7811049054572371,
|
|
"grad_norm": 1.4274278905750635,
|
|
"learning_rate": 1.2023635212145262e-06,
|
|
"loss": 0.3923,
|
|
"step": 1451
|
|
},
|
|
{
|
|
"epoch": 0.7816432272390822,
|
|
"grad_norm": 1.5796240111966617,
|
|
"learning_rate": 1.1966959846511068e-06,
|
|
"loss": 0.4567,
|
|
"step": 1452
|
|
},
|
|
{
|
|
"epoch": 0.7821815490209273,
|
|
"grad_norm": 2.368565849047706,
|
|
"learning_rate": 1.191040020749223e-06,
|
|
"loss": 0.3885,
|
|
"step": 1453
|
|
},
|
|
{
|
|
"epoch": 0.7827198708027724,
|
|
"grad_norm": 1.7831232578884653,
|
|
"learning_rate": 1.1853956467187943e-06,
|
|
"loss": 0.3873,
|
|
"step": 1454
|
|
},
|
|
{
|
|
"epoch": 0.7832581925846175,
|
|
"grad_norm": 2.2089394022551363,
|
|
"learning_rate": 1.1797628797344752e-06,
|
|
"loss": 0.4341,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.7837965143664626,
|
|
"grad_norm": 1.7921663918566133,
|
|
"learning_rate": 1.1741417369356011e-06,
|
|
"loss": 0.4138,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"epoch": 0.7843348361483077,
|
|
"grad_norm": 1.503278809860387,
|
|
"learning_rate": 1.1685322354261402e-06,
|
|
"loss": 0.4608,
|
|
"step": 1457
|
|
},
|
|
{
|
|
"epoch": 0.7848731579301528,
|
|
"grad_norm": 1.567305564830315,
|
|
"learning_rate": 1.1629343922746334e-06,
|
|
"loss": 0.4444,
|
|
"step": 1458
|
|
},
|
|
{
|
|
"epoch": 0.7854114797119979,
|
|
"grad_norm": 1.4431401966395603,
|
|
"learning_rate": 1.1573482245141525e-06,
|
|
"loss": 0.4353,
|
|
"step": 1459
|
|
},
|
|
{
|
|
"epoch": 0.785949801493843,
|
|
"grad_norm": 1.7031469874820835,
|
|
"learning_rate": 1.1517737491422415e-06,
|
|
"loss": 0.4433,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.7864881232756881,
|
|
"grad_norm": 1.9609977211459744,
|
|
"learning_rate": 1.1462109831208679e-06,
|
|
"loss": 0.4482,
|
|
"step": 1461
|
|
},
|
|
{
|
|
"epoch": 0.7870264450575332,
|
|
"grad_norm": 2.150596318263902,
|
|
"learning_rate": 1.1406599433763694e-06,
|
|
"loss": 0.4755,
|
|
"step": 1462
|
|
},
|
|
{
|
|
"epoch": 0.7875647668393783,
|
|
"grad_norm": 1.3265638431410287,
|
|
"learning_rate": 1.1351206467994018e-06,
|
|
"loss": 0.4102,
|
|
"step": 1463
|
|
},
|
|
{
|
|
"epoch": 0.7881030886212234,
|
|
"grad_norm": 4.188075621147485,
|
|
"learning_rate": 1.129593110244892e-06,
|
|
"loss": 0.3644,
|
|
"step": 1464
|
|
},
|
|
{
|
|
"epoch": 0.7886414104030685,
|
|
"grad_norm": 1.5439643283706193,
|
|
"learning_rate": 1.1240773505319824e-06,
|
|
"loss": 0.4707,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.7891797321849136,
|
|
"grad_norm": 1.695949064351043,
|
|
"learning_rate": 1.1185733844439778e-06,
|
|
"loss": 0.4506,
|
|
"step": 1466
|
|
},
|
|
{
|
|
"epoch": 0.7897180539667586,
|
|
"grad_norm": 1.4925323276596911,
|
|
"learning_rate": 1.113081228728301e-06,
|
|
"loss": 0.4062,
|
|
"step": 1467
|
|
},
|
|
{
|
|
"epoch": 0.7902563757486037,
|
|
"grad_norm": 1.810916777909123,
|
|
"learning_rate": 1.1076009000964384e-06,
|
|
"loss": 0.4617,
|
|
"step": 1468
|
|
},
|
|
{
|
|
"epoch": 0.7907946975304488,
|
|
"grad_norm": 1.5391006325796759,
|
|
"learning_rate": 1.102132415223886e-06,
|
|
"loss": 0.4341,
|
|
"step": 1469
|
|
},
|
|
{
|
|
"epoch": 0.7913330193122939,
|
|
"grad_norm": 1.3539603638585116,
|
|
"learning_rate": 1.0966757907501058e-06,
|
|
"loss": 0.4045,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.791871341094139,
|
|
"grad_norm": 1.585969494802185,
|
|
"learning_rate": 1.0912310432784673e-06,
|
|
"loss": 0.4889,
|
|
"step": 1471
|
|
},
|
|
{
|
|
"epoch": 0.7924096628759841,
|
|
"grad_norm": 1.3636312861290756,
|
|
"learning_rate": 1.0857981893762048e-06,
|
|
"loss": 0.4352,
|
|
"step": 1472
|
|
},
|
|
{
|
|
"epoch": 0.7929479846578292,
|
|
"grad_norm": 1.5823372906311277,
|
|
"learning_rate": 1.0803772455743572e-06,
|
|
"loss": 0.398,
|
|
"step": 1473
|
|
},
|
|
{
|
|
"epoch": 0.7934863064396743,
|
|
"grad_norm": 1.5278694836184388,
|
|
"learning_rate": 1.0749682283677288e-06,
|
|
"loss": 0.4228,
|
|
"step": 1474
|
|
},
|
|
{
|
|
"epoch": 0.7940246282215194,
|
|
"grad_norm": 1.1652690918407183,
|
|
"learning_rate": 1.0695711542148313e-06,
|
|
"loss": 0.3811,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.7945629500033645,
|
|
"grad_norm": 1.4886602129753284,
|
|
"learning_rate": 1.0641860395378367e-06,
|
|
"loss": 0.4037,
|
|
"step": 1476
|
|
},
|
|
{
|
|
"epoch": 0.7951012717852096,
|
|
"grad_norm": 1.5390850918633818,
|
|
"learning_rate": 1.0588129007225266e-06,
|
|
"loss": 0.3754,
|
|
"step": 1477
|
|
},
|
|
{
|
|
"epoch": 0.7956395935670547,
|
|
"grad_norm": 1.676720868561217,
|
|
"learning_rate": 1.0534517541182431e-06,
|
|
"loss": 0.4599,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"epoch": 0.7961779153488998,
|
|
"grad_norm": 1.676144009500296,
|
|
"learning_rate": 1.0481026160378394e-06,
|
|
"loss": 0.4203,
|
|
"step": 1479
|
|
},
|
|
{
|
|
"epoch": 0.7967162371307449,
|
|
"grad_norm": 1.3949722623692342,
|
|
"learning_rate": 1.042765502757625e-06,
|
|
"loss": 0.4149,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.79725455891259,
|
|
"grad_norm": 1.6398344004557446,
|
|
"learning_rate": 1.0374404305173247e-06,
|
|
"loss": 0.4215,
|
|
"step": 1481
|
|
},
|
|
{
|
|
"epoch": 0.7977928806944351,
|
|
"grad_norm": 1.6715940485370635,
|
|
"learning_rate": 1.0321274155200234e-06,
|
|
"loss": 0.4393,
|
|
"step": 1482
|
|
},
|
|
{
|
|
"epoch": 0.7983312024762802,
|
|
"grad_norm": 1.395308837290767,
|
|
"learning_rate": 1.0268264739321194e-06,
|
|
"loss": 0.4398,
|
|
"step": 1483
|
|
},
|
|
{
|
|
"epoch": 0.7988695242581253,
|
|
"grad_norm": 1.6597231226511682,
|
|
"learning_rate": 1.0215376218832723e-06,
|
|
"loss": 0.4185,
|
|
"step": 1484
|
|
},
|
|
{
|
|
"epoch": 0.7994078460399704,
|
|
"grad_norm": 1.5059702316944186,
|
|
"learning_rate": 1.0162608754663572e-06,
|
|
"loss": 0.4428,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.7999461678218155,
|
|
"grad_norm": 1.774717767949121,
|
|
"learning_rate": 1.0109962507374139e-06,
|
|
"loss": 0.456,
|
|
"step": 1486
|
|
},
|
|
{
|
|
"epoch": 0.8004844896036606,
|
|
"grad_norm": 1.5763966693479707,
|
|
"learning_rate": 1.0057437637155997e-06,
|
|
"loss": 0.4742,
|
|
"step": 1487
|
|
},
|
|
{
|
|
"epoch": 0.8010228113855057,
|
|
"grad_norm": 1.66961890257069,
|
|
"learning_rate": 1.0005034303831352e-06,
|
|
"loss": 0.4479,
|
|
"step": 1488
|
|
},
|
|
{
|
|
"epoch": 0.8015611331673508,
|
|
"grad_norm": 1.4312052717987154,
|
|
"learning_rate": 9.95275266685264e-07,
|
|
"loss": 0.3894,
|
|
"step": 1489
|
|
},
|
|
{
|
|
"epoch": 0.8020994549491959,
|
|
"grad_norm": 1.5395533368166758,
|
|
"learning_rate": 9.900592885301986e-07,
|
|
"loss": 0.433,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.802637776731041,
|
|
"grad_norm": 1.7267038818610854,
|
|
"learning_rate": 9.848555117890734e-07,
|
|
"loss": 0.4399,
|
|
"step": 1491
|
|
},
|
|
{
|
|
"epoch": 0.8031760985128861,
|
|
"grad_norm": 1.588155903799363,
|
|
"learning_rate": 9.796639522958972e-07,
|
|
"loss": 0.4662,
|
|
"step": 1492
|
|
},
|
|
{
|
|
"epoch": 0.8037144202947312,
|
|
"grad_norm": 1.278378381771794,
|
|
"learning_rate": 9.744846258475032e-07,
|
|
"loss": 0.4023,
|
|
"step": 1493
|
|
},
|
|
{
|
|
"epoch": 0.8042527420765763,
|
|
"grad_norm": 1.630276962177858,
|
|
"learning_rate": 9.693175482035038e-07,
|
|
"loss": 0.4352,
|
|
"step": 1494
|
|
},
|
|
{
|
|
"epoch": 0.8047910638584214,
|
|
"grad_norm": 1.7375887913272672,
|
|
"learning_rate": 9.641627350862371e-07,
|
|
"loss": 0.4451,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.8053293856402665,
|
|
"grad_norm": 1.5671830810820253,
|
|
"learning_rate": 9.590202021807266e-07,
|
|
"loss": 0.4944,
|
|
"step": 1496
|
|
},
|
|
{
|
|
"epoch": 0.8058677074221116,
|
|
"grad_norm": 1.5984498803682108,
|
|
"learning_rate": 9.538899651346278e-07,
|
|
"loss": 0.4171,
|
|
"step": 1497
|
|
},
|
|
{
|
|
"epoch": 0.8064060292039567,
|
|
"grad_norm": 1.4646889528560627,
|
|
"learning_rate": 9.487720395581829e-07,
|
|
"loss": 0.3802,
|
|
"step": 1498
|
|
},
|
|
{
|
|
"epoch": 0.8069443509858018,
|
|
"grad_norm": 1.3512741257951366,
|
|
"learning_rate": 9.436664410241736e-07,
|
|
"loss": 0.4309,
|
|
"step": 1499
|
|
},
|
|
{
|
|
"epoch": 0.8074826727676468,
|
|
"grad_norm": 1.5243040161927932,
|
|
"learning_rate": 9.385731850678714e-07,
|
|
"loss": 0.4321,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.8074826727676468,
|
|
"eval_loss": 0.42280885577201843,
|
|
"eval_runtime": 1525.8015,
|
|
"eval_samples_per_second": 16.391,
|
|
"eval_steps_per_second": 0.513,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.8080209945494919,
|
|
"grad_norm": 1.7335916518675676,
|
|
"learning_rate": 9.334922871869933e-07,
|
|
"loss": 0.4613,
|
|
"step": 1501
|
|
},
|
|
{
|
|
"epoch": 0.808559316331337,
|
|
"grad_norm": 1.4183990627505498,
|
|
"learning_rate": 9.284237628416537e-07,
|
|
"loss": 0.4245,
|
|
"step": 1502
|
|
},
|
|
{
|
|
"epoch": 0.8090976381131821,
|
|
"grad_norm": 1.6705452727321846,
|
|
"learning_rate": 9.233676274543141e-07,
|
|
"loss": 0.4186,
|
|
"step": 1503
|
|
},
|
|
{
|
|
"epoch": 0.8096359598950272,
|
|
"grad_norm": 1.6195072788491132,
|
|
"learning_rate": 9.183238964097408e-07,
|
|
"loss": 0.4606,
|
|
"step": 1504
|
|
},
|
|
{
|
|
"epoch": 0.8101742816768723,
|
|
"grad_norm": 1.5392537994753088,
|
|
"learning_rate": 9.132925850549573e-07,
|
|
"loss": 0.4261,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.8107126034587174,
|
|
"grad_norm": 1.5937406024477896,
|
|
"learning_rate": 9.082737086991955e-07,
|
|
"loss": 0.378,
|
|
"step": 1506
|
|
},
|
|
{
|
|
"epoch": 0.8112509252405625,
|
|
"grad_norm": 1.6757621701627432,
|
|
"learning_rate": 9.0326728261385e-07,
|
|
"loss": 0.4782,
|
|
"step": 1507
|
|
},
|
|
{
|
|
"epoch": 0.8117892470224076,
|
|
"grad_norm": 2.005066048659624,
|
|
"learning_rate": 8.982733220324319e-07,
|
|
"loss": 0.4419,
|
|
"step": 1508
|
|
},
|
|
{
|
|
"epoch": 0.8123275688042527,
|
|
"grad_norm": 1.5506134684388948,
|
|
"learning_rate": 8.932918421505244e-07,
|
|
"loss": 0.4669,
|
|
"step": 1509
|
|
},
|
|
{
|
|
"epoch": 0.8128658905860978,
|
|
"grad_norm": 1.8474324824508042,
|
|
"learning_rate": 8.883228581257297e-07,
|
|
"loss": 0.4416,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.8134042123679429,
|
|
"grad_norm": 1.5536434524734581,
|
|
"learning_rate": 8.83366385077632e-07,
|
|
"loss": 0.4377,
|
|
"step": 1511
|
|
},
|
|
{
|
|
"epoch": 0.813942534149788,
|
|
"grad_norm": 1.399796692285853,
|
|
"learning_rate": 8.784224380877454e-07,
|
|
"loss": 0.4392,
|
|
"step": 1512
|
|
},
|
|
{
|
|
"epoch": 0.8144808559316331,
|
|
"grad_norm": 1.5556950965685121,
|
|
"learning_rate": 8.734910321994717e-07,
|
|
"loss": 0.406,
|
|
"step": 1513
|
|
},
|
|
{
|
|
"epoch": 0.8150191777134782,
|
|
"grad_norm": 1.5480188724931883,
|
|
"learning_rate": 8.685721824180499e-07,
|
|
"loss": 0.4433,
|
|
"step": 1514
|
|
},
|
|
{
|
|
"epoch": 0.8155574994953233,
|
|
"grad_norm": 1.4971651714962706,
|
|
"learning_rate": 8.636659037105149e-07,
|
|
"loss": 0.3966,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.8160958212771684,
|
|
"grad_norm": 1.6155911416639859,
|
|
"learning_rate": 8.587722110056529e-07,
|
|
"loss": 0.4212,
|
|
"step": 1516
|
|
},
|
|
{
|
|
"epoch": 0.8166341430590135,
|
|
"grad_norm": 1.976217129048654,
|
|
"learning_rate": 8.538911191939475e-07,
|
|
"loss": 0.4107,
|
|
"step": 1517
|
|
},
|
|
{
|
|
"epoch": 0.8171724648408586,
|
|
"grad_norm": 1.9846803772964912,
|
|
"learning_rate": 8.490226431275456e-07,
|
|
"loss": 0.4094,
|
|
"step": 1518
|
|
},
|
|
{
|
|
"epoch": 0.8177107866227037,
|
|
"grad_norm": 3.0586074935315133,
|
|
"learning_rate": 8.441667976202045e-07,
|
|
"loss": 0.4492,
|
|
"step": 1519
|
|
},
|
|
{
|
|
"epoch": 0.8182491084045488,
|
|
"grad_norm": 1.6149445557914077,
|
|
"learning_rate": 8.393235974472497e-07,
|
|
"loss": 0.4361,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.8187874301863939,
|
|
"grad_norm": 1.4631036764406664,
|
|
"learning_rate": 8.344930573455323e-07,
|
|
"loss": 0.4343,
|
|
"step": 1521
|
|
},
|
|
{
|
|
"epoch": 0.819325751968239,
|
|
"grad_norm": 1.3342306529935604,
|
|
"learning_rate": 8.296751920133794e-07,
|
|
"loss": 0.3546,
|
|
"step": 1522
|
|
},
|
|
{
|
|
"epoch": 0.8198640737500841,
|
|
"grad_norm": 2.0226246030817356,
|
|
"learning_rate": 8.248700161105483e-07,
|
|
"loss": 0.4281,
|
|
"step": 1523
|
|
},
|
|
{
|
|
"epoch": 0.8204023955319292,
|
|
"grad_norm": 1.9696807317895189,
|
|
"learning_rate": 8.200775442581893e-07,
|
|
"loss": 0.4215,
|
|
"step": 1524
|
|
},
|
|
{
|
|
"epoch": 0.8209407173137743,
|
|
"grad_norm": 1.4820095683603027,
|
|
"learning_rate": 8.152977910387955e-07,
|
|
"loss": 0.4928,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.8214790390956194,
|
|
"grad_norm": 1.5809021302001485,
|
|
"learning_rate": 8.105307709961602e-07,
|
|
"loss": 0.442,
|
|
"step": 1526
|
|
},
|
|
{
|
|
"epoch": 0.8220173608774645,
|
|
"grad_norm": 1.3682019844229378,
|
|
"learning_rate": 8.057764986353317e-07,
|
|
"loss": 0.448,
|
|
"step": 1527
|
|
},
|
|
{
|
|
"epoch": 0.8225556826593096,
|
|
"grad_norm": 1.6136391165039332,
|
|
"learning_rate": 8.010349884225699e-07,
|
|
"loss": 0.4458,
|
|
"step": 1528
|
|
},
|
|
{
|
|
"epoch": 0.8230940044411547,
|
|
"grad_norm": 1.2595845723052967,
|
|
"learning_rate": 7.963062547853023e-07,
|
|
"loss": 0.4014,
|
|
"step": 1529
|
|
},
|
|
{
|
|
"epoch": 0.8236323262229998,
|
|
"grad_norm": 2.650357568288943,
|
|
"learning_rate": 7.915903121120816e-07,
|
|
"loss": 0.4475,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.8241706480048449,
|
|
"grad_norm": 1.5993270434912978,
|
|
"learning_rate": 7.868871747525353e-07,
|
|
"loss": 0.3952,
|
|
"step": 1531
|
|
},
|
|
{
|
|
"epoch": 0.82470896978669,
|
|
"grad_norm": 1.5445035783730348,
|
|
"learning_rate": 7.821968570173321e-07,
|
|
"loss": 0.4546,
|
|
"step": 1532
|
|
},
|
|
{
|
|
"epoch": 0.825247291568535,
|
|
"grad_norm": 1.7600163478435773,
|
|
"learning_rate": 7.775193731781316e-07,
|
|
"loss": 0.3925,
|
|
"step": 1533
|
|
},
|
|
{
|
|
"epoch": 0.8257856133503801,
|
|
"grad_norm": 1.9376227278838558,
|
|
"learning_rate": 7.728547374675421e-07,
|
|
"loss": 0.4142,
|
|
"step": 1534
|
|
},
|
|
{
|
|
"epoch": 0.8263239351322252,
|
|
"grad_norm": 1.5661272939035957,
|
|
"learning_rate": 7.682029640790783e-07,
|
|
"loss": 0.408,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.8268622569140703,
|
|
"grad_norm": 1.7751314318755442,
|
|
"learning_rate": 7.635640671671168e-07,
|
|
"loss": 0.4748,
|
|
"step": 1536
|
|
},
|
|
{
|
|
"epoch": 0.8274005786959154,
|
|
"grad_norm": 1.4328800747976576,
|
|
"learning_rate": 7.589380608468549e-07,
|
|
"loss": 0.445,
|
|
"step": 1537
|
|
},
|
|
{
|
|
"epoch": 0.8279389004777605,
|
|
"grad_norm": 1.770544068666416,
|
|
"learning_rate": 7.543249591942647e-07,
|
|
"loss": 0.3877,
|
|
"step": 1538
|
|
},
|
|
{
|
|
"epoch": 0.8284772222596056,
|
|
"grad_norm": 1.4644257793154838,
|
|
"learning_rate": 7.497247762460535e-07,
|
|
"loss": 0.4729,
|
|
"step": 1539
|
|
},
|
|
{
|
|
"epoch": 0.8290155440414507,
|
|
"grad_norm": 2.0251569316621354,
|
|
"learning_rate": 7.451375259996196e-07,
|
|
"loss": 0.3926,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.8295538658232958,
|
|
"grad_norm": 1.5659705563939743,
|
|
"learning_rate": 7.405632224130094e-07,
|
|
"loss": 0.3978,
|
|
"step": 1541
|
|
},
|
|
{
|
|
"epoch": 0.8300921876051409,
|
|
"grad_norm": 1.5791357169071338,
|
|
"learning_rate": 7.360018794048757e-07,
|
|
"loss": 0.4482,
|
|
"step": 1542
|
|
},
|
|
{
|
|
"epoch": 0.830630509386986,
|
|
"grad_norm": 1.5219436138787439,
|
|
"learning_rate": 7.314535108544346e-07,
|
|
"loss": 0.3993,
|
|
"step": 1543
|
|
},
|
|
{
|
|
"epoch": 0.8311688311688312,
|
|
"grad_norm": 1.5116221556805869,
|
|
"learning_rate": 7.26918130601425e-07,
|
|
"loss": 0.4431,
|
|
"step": 1544
|
|
},
|
|
{
|
|
"epoch": 0.8317071529506763,
|
|
"grad_norm": 1.5355423700033741,
|
|
"learning_rate": 7.223957524460612e-07,
|
|
"loss": 0.3847,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.8322454747325214,
|
|
"grad_norm": 1.6301347275924607,
|
|
"learning_rate": 7.17886390148999e-07,
|
|
"loss": 0.4149,
|
|
"step": 1546
|
|
},
|
|
{
|
|
"epoch": 0.8327837965143665,
|
|
"grad_norm": 1.39164969438826,
|
|
"learning_rate": 7.133900574312885e-07,
|
|
"loss": 0.444,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"epoch": 0.8333221182962116,
|
|
"grad_norm": 1.6360359120384138,
|
|
"learning_rate": 7.089067679743322e-07,
|
|
"loss": 0.4387,
|
|
"step": 1548
|
|
},
|
|
{
|
|
"epoch": 0.8338604400780567,
|
|
"grad_norm": 1.1463330927551836,
|
|
"learning_rate": 7.044365354198462e-07,
|
|
"loss": 0.367,
|
|
"step": 1549
|
|
},
|
|
{
|
|
"epoch": 0.8343987618599018,
|
|
"grad_norm": 1.3951952353250727,
|
|
"learning_rate": 6.999793733698168e-07,
|
|
"loss": 0.4537,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.8349370836417469,
|
|
"grad_norm": 1.444313279525601,
|
|
"learning_rate": 6.955352953864592e-07,
|
|
"loss": 0.4517,
|
|
"step": 1551
|
|
},
|
|
{
|
|
"epoch": 0.835475405423592,
|
|
"grad_norm": 1.4922885632634126,
|
|
"learning_rate": 6.91104314992177e-07,
|
|
"loss": 0.4182,
|
|
"step": 1552
|
|
},
|
|
{
|
|
"epoch": 0.8360137272054371,
|
|
"grad_norm": 1.361490120387784,
|
|
"learning_rate": 6.866864456695189e-07,
|
|
"loss": 0.3819,
|
|
"step": 1553
|
|
},
|
|
{
|
|
"epoch": 0.8365520489872822,
|
|
"grad_norm": 1.3785822196112183,
|
|
"learning_rate": 6.822817008611409e-07,
|
|
"loss": 0.4315,
|
|
"step": 1554
|
|
},
|
|
{
|
|
"epoch": 0.8370903707691273,
|
|
"grad_norm": 1.786812938484116,
|
|
"learning_rate": 6.778900939697642e-07,
|
|
"loss": 0.4352,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.8376286925509724,
|
|
"grad_norm": 1.51980814160385,
|
|
"learning_rate": 6.735116383581325e-07,
|
|
"loss": 0.4681,
|
|
"step": 1556
|
|
},
|
|
{
|
|
"epoch": 0.8381670143328175,
|
|
"grad_norm": 1.6909398106864937,
|
|
"learning_rate": 6.691463473489751e-07,
|
|
"loss": 0.3764,
|
|
"step": 1557
|
|
},
|
|
{
|
|
"epoch": 0.8387053361146626,
|
|
"grad_norm": 1.3032028525505768,
|
|
"learning_rate": 6.647942342249619e-07,
|
|
"loss": 0.4571,
|
|
"step": 1558
|
|
},
|
|
{
|
|
"epoch": 0.8392436578965077,
|
|
"grad_norm": 2.673478994173862,
|
|
"learning_rate": 6.604553122286672e-07,
|
|
"loss": 0.4424,
|
|
"step": 1559
|
|
},
|
|
{
|
|
"epoch": 0.8397819796783528,
|
|
"grad_norm": 1.8774151039134228,
|
|
"learning_rate": 6.561295945625246e-07,
|
|
"loss": 0.4289,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.8397819796783528,
|
|
"eval_loss": 0.42163270711898804,
|
|
"eval_runtime": 1532.1805,
|
|
"eval_samples_per_second": 16.323,
|
|
"eval_steps_per_second": 0.51,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.8403203014601979,
|
|
"grad_norm": 1.3658795551777532,
|
|
"learning_rate": 6.51817094388793e-07,
|
|
"loss": 0.4041,
|
|
"step": 1561
|
|
},
|
|
{
|
|
"epoch": 0.840858623242043,
|
|
"grad_norm": 2.0775682420189683,
|
|
"learning_rate": 6.475178248295111e-07,
|
|
"loss": 0.4626,
|
|
"step": 1562
|
|
},
|
|
{
|
|
"epoch": 0.8413969450238881,
|
|
"grad_norm": 2.0811838469436137,
|
|
"learning_rate": 6.432317989664599e-07,
|
|
"loss": 0.4316,
|
|
"step": 1563
|
|
},
|
|
{
|
|
"epoch": 0.8419352668057332,
|
|
"grad_norm": 1.6387122228577398,
|
|
"learning_rate": 6.389590298411236e-07,
|
|
"loss": 0.4198,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"epoch": 0.8424735885875783,
|
|
"grad_norm": 1.6679858558099225,
|
|
"learning_rate": 6.346995304546482e-07,
|
|
"loss": 0.3999,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.8430119103694234,
|
|
"grad_norm": 1.4149904617289844,
|
|
"learning_rate": 6.304533137678026e-07,
|
|
"loss": 0.418,
|
|
"step": 1566
|
|
},
|
|
{
|
|
"epoch": 0.8435502321512685,
|
|
"grad_norm": 1.58157239985269,
|
|
"learning_rate": 6.262203927009403e-07,
|
|
"loss": 0.4279,
|
|
"step": 1567
|
|
},
|
|
{
|
|
"epoch": 0.8440885539331136,
|
|
"grad_norm": 1.7638599414290634,
|
|
"learning_rate": 6.220007801339562e-07,
|
|
"loss": 0.4042,
|
|
"step": 1568
|
|
},
|
|
{
|
|
"epoch": 0.8446268757149586,
|
|
"grad_norm": 1.5007385916657803,
|
|
"learning_rate": 6.17794488906252e-07,
|
|
"loss": 0.4402,
|
|
"step": 1569
|
|
},
|
|
{
|
|
"epoch": 0.8451651974968037,
|
|
"grad_norm": 1.345366896432651,
|
|
"learning_rate": 6.136015318166966e-07,
|
|
"loss": 0.3642,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.8457035192786488,
|
|
"grad_norm": 1.5235663558748846,
|
|
"learning_rate": 6.094219216235841e-07,
|
|
"loss": 0.3964,
|
|
"step": 1571
|
|
},
|
|
{
|
|
"epoch": 0.8462418410604939,
|
|
"grad_norm": 1.3657476470037149,
|
|
"learning_rate": 6.052556710445972e-07,
|
|
"loss": 0.3748,
|
|
"step": 1572
|
|
},
|
|
{
|
|
"epoch": 0.846780162842339,
|
|
"grad_norm": 1.4394596688138968,
|
|
"learning_rate": 6.011027927567681e-07,
|
|
"loss": 0.441,
|
|
"step": 1573
|
|
},
|
|
{
|
|
"epoch": 0.8473184846241841,
|
|
"grad_norm": 1.5318361149430813,
|
|
"learning_rate": 5.969632993964414e-07,
|
|
"loss": 0.4621,
|
|
"step": 1574
|
|
},
|
|
{
|
|
"epoch": 0.8478568064060292,
|
|
"grad_norm": 1.6075753885114712,
|
|
"learning_rate": 5.928372035592306e-07,
|
|
"loss": 0.4645,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.8483951281878743,
|
|
"grad_norm": 1.5722006469692726,
|
|
"learning_rate": 5.887245177999867e-07,
|
|
"loss": 0.4446,
|
|
"step": 1576
|
|
},
|
|
{
|
|
"epoch": 0.8489334499697194,
|
|
"grad_norm": 1.4551383751314828,
|
|
"learning_rate": 5.846252546327547e-07,
|
|
"loss": 0.43,
|
|
"step": 1577
|
|
},
|
|
{
|
|
"epoch": 0.8494717717515645,
|
|
"grad_norm": 1.4487392657122655,
|
|
"learning_rate": 5.805394265307391e-07,
|
|
"loss": 0.4032,
|
|
"step": 1578
|
|
},
|
|
{
|
|
"epoch": 0.8500100935334096,
|
|
"grad_norm": 1.6691803468661808,
|
|
"learning_rate": 5.764670459262622e-07,
|
|
"loss": 0.4328,
|
|
"step": 1579
|
|
},
|
|
{
|
|
"epoch": 0.8505484153152547,
|
|
"grad_norm": 1.6197190610235175,
|
|
"learning_rate": 5.724081252107311e-07,
|
|
"loss": 0.4045,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.8510867370970998,
|
|
"grad_norm": 1.6633094952520224,
|
|
"learning_rate": 5.683626767345951e-07,
|
|
"loss": 0.4271,
|
|
"step": 1581
|
|
},
|
|
{
|
|
"epoch": 0.8516250588789449,
|
|
"grad_norm": 1.3383638616282105,
|
|
"learning_rate": 5.6433071280731e-07,
|
|
"loss": 0.3742,
|
|
"step": 1582
|
|
},
|
|
{
|
|
"epoch": 0.85216338066079,
|
|
"grad_norm": 1.3573201978569531,
|
|
"learning_rate": 5.60312245697302e-07,
|
|
"loss": 0.355,
|
|
"step": 1583
|
|
},
|
|
{
|
|
"epoch": 0.8527017024426351,
|
|
"grad_norm": 1.5087600985731158,
|
|
"learning_rate": 5.563072876319292e-07,
|
|
"loss": 0.4275,
|
|
"step": 1584
|
|
},
|
|
{
|
|
"epoch": 0.8532400242244802,
|
|
"grad_norm": 1.9174671861368988,
|
|
"learning_rate": 5.523158507974452e-07,
|
|
"loss": 0.4523,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.8537783460063253,
|
|
"grad_norm": 1.2701535232392451,
|
|
"learning_rate": 5.483379473389599e-07,
|
|
"loss": 0.4157,
|
|
"step": 1586
|
|
},
|
|
{
|
|
"epoch": 0.8543166677881704,
|
|
"grad_norm": 1.3648674048032239,
|
|
"learning_rate": 5.443735893604041e-07,
|
|
"loss": 0.443,
|
|
"step": 1587
|
|
},
|
|
{
|
|
"epoch": 0.8548549895700155,
|
|
"grad_norm": 1.7303772028968518,
|
|
"learning_rate": 5.404227889244939e-07,
|
|
"loss": 0.3945,
|
|
"step": 1588
|
|
},
|
|
{
|
|
"epoch": 0.8553933113518606,
|
|
"grad_norm": 1.4650825399074572,
|
|
"learning_rate": 5.364855580526923e-07,
|
|
"loss": 0.4183,
|
|
"step": 1589
|
|
},
|
|
{
|
|
"epoch": 0.8559316331337057,
|
|
"grad_norm": 1.7612420028556155,
|
|
"learning_rate": 5.325619087251704e-07,
|
|
"loss": 0.4472,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.8564699549155508,
|
|
"grad_norm": 1.6090688100302808,
|
|
"learning_rate": 5.28651852880776e-07,
|
|
"loss": 0.4348,
|
|
"step": 1591
|
|
},
|
|
{
|
|
"epoch": 0.8570082766973959,
|
|
"grad_norm": 1.59025634923398,
|
|
"learning_rate": 5.247554024169949e-07,
|
|
"loss": 0.4132,
|
|
"step": 1592
|
|
},
|
|
{
|
|
"epoch": 0.857546598479241,
|
|
"grad_norm": 1.8249117304980227,
|
|
"learning_rate": 5.20872569189913e-07,
|
|
"loss": 0.415,
|
|
"step": 1593
|
|
},
|
|
{
|
|
"epoch": 0.8580849202610861,
|
|
"grad_norm": 1.3724204134525155,
|
|
"learning_rate": 5.170033650141837e-07,
|
|
"loss": 0.4645,
|
|
"step": 1594
|
|
},
|
|
{
|
|
"epoch": 0.8586232420429312,
|
|
"grad_norm": 2.066798117946357,
|
|
"learning_rate": 5.131478016629888e-07,
|
|
"loss": 0.4225,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.8591615638247763,
|
|
"grad_norm": 2.780252323052268,
|
|
"learning_rate": 5.093058908680043e-07,
|
|
"loss": 0.4048,
|
|
"step": 1596
|
|
},
|
|
{
|
|
"epoch": 0.8596998856066214,
|
|
"grad_norm": 1.4726854180656292,
|
|
"learning_rate": 5.054776443193626e-07,
|
|
"loss": 0.4337,
|
|
"step": 1597
|
|
},
|
|
{
|
|
"epoch": 0.8602382073884665,
|
|
"grad_norm": 1.7991832445280496,
|
|
"learning_rate": 5.016630736656213e-07,
|
|
"loss": 0.3871,
|
|
"step": 1598
|
|
},
|
|
{
|
|
"epoch": 0.8607765291703116,
|
|
"grad_norm": 1.6803342666413155,
|
|
"learning_rate": 4.978621905137238e-07,
|
|
"loss": 0.4332,
|
|
"step": 1599
|
|
},
|
|
{
|
|
"epoch": 0.8613148509521567,
|
|
"grad_norm": 1.4355251448306459,
|
|
"learning_rate": 4.940750064289657e-07,
|
|
"loss": 0.3924,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.8618531727340017,
|
|
"grad_norm": 1.3604897046592517,
|
|
"learning_rate": 4.903015329349581e-07,
|
|
"loss": 0.4057,
|
|
"step": 1601
|
|
},
|
|
{
|
|
"epoch": 0.8623914945158468,
|
|
"grad_norm": 1.6598958205265515,
|
|
"learning_rate": 4.865417815135958e-07,
|
|
"loss": 0.3885,
|
|
"step": 1602
|
|
},
|
|
{
|
|
"epoch": 0.8629298162976919,
|
|
"grad_norm": 1.4613049538096838,
|
|
"learning_rate": 4.827957636050179e-07,
|
|
"loss": 0.3922,
|
|
"step": 1603
|
|
},
|
|
{
|
|
"epoch": 0.863468138079537,
|
|
"grad_norm": 1.5965664706849296,
|
|
"learning_rate": 4.790634906075775e-07,
|
|
"loss": 0.4828,
|
|
"step": 1604
|
|
},
|
|
{
|
|
"epoch": 0.8640064598613821,
|
|
"grad_norm": 1.8120189192545764,
|
|
"learning_rate": 4.753449738778021e-07,
|
|
"loss": 0.429,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.8645447816432272,
|
|
"grad_norm": 1.8371969884713577,
|
|
"learning_rate": 4.716402247303631e-07,
|
|
"loss": 0.4074,
|
|
"step": 1606
|
|
},
|
|
{
|
|
"epoch": 0.8650831034250723,
|
|
"grad_norm": 1.5256250240541858,
|
|
"learning_rate": 4.6794925443804097e-07,
|
|
"loss": 0.4015,
|
|
"step": 1607
|
|
},
|
|
{
|
|
"epoch": 0.8656214252069174,
|
|
"grad_norm": 1.6504131905617414,
|
|
"learning_rate": 4.642720742316886e-07,
|
|
"loss": 0.4619,
|
|
"step": 1608
|
|
},
|
|
{
|
|
"epoch": 0.8661597469887625,
|
|
"grad_norm": 1.7464812669613627,
|
|
"learning_rate": 4.6060869530019983e-07,
|
|
"loss": 0.4537,
|
|
"step": 1609
|
|
},
|
|
{
|
|
"epoch": 0.8666980687706076,
|
|
"grad_norm": 1.8767060082708276,
|
|
"learning_rate": 4.569591287904723e-07,
|
|
"loss": 0.4612,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.8672363905524527,
|
|
"grad_norm": 1.3070105173969313,
|
|
"learning_rate": 4.5332338580737824e-07,
|
|
"loss": 0.3629,
|
|
"step": 1611
|
|
},
|
|
{
|
|
"epoch": 0.8677747123342978,
|
|
"grad_norm": 4.572221630177869,
|
|
"learning_rate": 4.4970147741372315e-07,
|
|
"loss": 0.4587,
|
|
"step": 1612
|
|
},
|
|
{
|
|
"epoch": 0.8683130341161429,
|
|
"grad_norm": 1.4960042467223587,
|
|
"learning_rate": 4.460934146302215e-07,
|
|
"loss": 0.4734,
|
|
"step": 1613
|
|
},
|
|
{
|
|
"epoch": 0.868851355897988,
|
|
"grad_norm": 1.9121190508560355,
|
|
"learning_rate": 4.424992084354551e-07,
|
|
"loss": 0.4016,
|
|
"step": 1614
|
|
},
|
|
{
|
|
"epoch": 0.8693896776798331,
|
|
"grad_norm": 1.706342167134769,
|
|
"learning_rate": 4.389188697658453e-07,
|
|
"loss": 0.4207,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.8699279994616782,
|
|
"grad_norm": 1.5621521598790504,
|
|
"learning_rate": 4.3535240951561695e-07,
|
|
"loss": 0.4101,
|
|
"step": 1616
|
|
},
|
|
{
|
|
"epoch": 0.8704663212435233,
|
|
"grad_norm": 1.4806315484210542,
|
|
"learning_rate": 4.3179983853676386e-07,
|
|
"loss": 0.4608,
|
|
"step": 1617
|
|
},
|
|
{
|
|
"epoch": 0.8710046430253684,
|
|
"grad_norm": 1.526083402719131,
|
|
"learning_rate": 4.2826116763902135e-07,
|
|
"loss": 0.4183,
|
|
"step": 1618
|
|
},
|
|
{
|
|
"epoch": 0.8715429648072135,
|
|
"grad_norm": 1.6689772565592038,
|
|
"learning_rate": 4.247364075898258e-07,
|
|
"loss": 0.4288,
|
|
"step": 1619
|
|
},
|
|
{
|
|
"epoch": 0.8720812865890586,
|
|
"grad_norm": 1.3834588776364911,
|
|
"learning_rate": 4.2122556911428744e-07,
|
|
"loss": 0.4032,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.8720812865890586,
|
|
"eval_loss": 0.42079228162765503,
|
|
"eval_runtime": 1541.5294,
|
|
"eval_samples_per_second": 16.224,
|
|
"eval_steps_per_second": 0.507,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.8726196083709037,
|
|
"grad_norm": 1.5791149363732657,
|
|
"learning_rate": 4.177286628951566e-07,
|
|
"loss": 0.4388,
|
|
"step": 1621
|
|
},
|
|
{
|
|
"epoch": 0.8731579301527488,
|
|
"grad_norm": 1.7565308716827732,
|
|
"learning_rate": 4.142456995727906e-07,
|
|
"loss": 0.4403,
|
|
"step": 1622
|
|
},
|
|
{
|
|
"epoch": 0.8736962519345939,
|
|
"grad_norm": 1.8536625820585364,
|
|
"learning_rate": 4.107766897451204e-07,
|
|
"loss": 0.377,
|
|
"step": 1623
|
|
},
|
|
{
|
|
"epoch": 0.874234573716439,
|
|
"grad_norm": 1.557798623706775,
|
|
"learning_rate": 4.073216439676203e-07,
|
|
"loss": 0.4099,
|
|
"step": 1624
|
|
},
|
|
{
|
|
"epoch": 0.8747728954982841,
|
|
"grad_norm": 1.5848805929742247,
|
|
"learning_rate": 4.0388057275327466e-07,
|
|
"loss": 0.4127,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.8753112172801292,
|
|
"grad_norm": 1.4737469672067065,
|
|
"learning_rate": 4.004534865725462e-07,
|
|
"loss": 0.4125,
|
|
"step": 1626
|
|
},
|
|
{
|
|
"epoch": 0.8758495390619743,
|
|
"grad_norm": 1.4866822244945306,
|
|
"learning_rate": 3.970403958533436e-07,
|
|
"loss": 0.4081,
|
|
"step": 1627
|
|
},
|
|
{
|
|
"epoch": 0.8763878608438194,
|
|
"grad_norm": 1.6255821682103373,
|
|
"learning_rate": 3.936413109809906e-07,
|
|
"loss": 0.4465,
|
|
"step": 1628
|
|
},
|
|
{
|
|
"epoch": 0.8769261826256645,
|
|
"grad_norm": 1.4642881317646486,
|
|
"learning_rate": 3.902562422981937e-07,
|
|
"loss": 0.4286,
|
|
"step": 1629
|
|
},
|
|
{
|
|
"epoch": 0.8774645044075096,
|
|
"grad_norm": 1.580573409189922,
|
|
"learning_rate": 3.8688520010501276e-07,
|
|
"loss": 0.4527,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.8780028261893547,
|
|
"grad_norm": 2.0543315708956387,
|
|
"learning_rate": 3.835281946588254e-07,
|
|
"loss": 0.4377,
|
|
"step": 1631
|
|
},
|
|
{
|
|
"epoch": 0.8785411479711998,
|
|
"grad_norm": 1.5115782436115135,
|
|
"learning_rate": 3.801852361743008e-07,
|
|
"loss": 0.4525,
|
|
"step": 1632
|
|
},
|
|
{
|
|
"epoch": 0.8790794697530449,
|
|
"grad_norm": 1.8374746527735237,
|
|
"learning_rate": 3.7685633482336504e-07,
|
|
"loss": 0.4242,
|
|
"step": 1633
|
|
},
|
|
{
|
|
"epoch": 0.87961779153489,
|
|
"grad_norm": 1.5036770046647692,
|
|
"learning_rate": 3.7354150073516947e-07,
|
|
"loss": 0.4474,
|
|
"step": 1634
|
|
},
|
|
{
|
|
"epoch": 0.880156113316735,
|
|
"grad_norm": 1.658882270187231,
|
|
"learning_rate": 3.702407439960648e-07,
|
|
"loss": 0.4321,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.8806944350985801,
|
|
"grad_norm": 1.6020319338410256,
|
|
"learning_rate": 3.669540746495653e-07,
|
|
"loss": 0.4212,
|
|
"step": 1636
|
|
},
|
|
{
|
|
"epoch": 0.8812327568804252,
|
|
"grad_norm": 1.7415071086793177,
|
|
"learning_rate": 3.636815026963214e-07,
|
|
"loss": 0.4229,
|
|
"step": 1637
|
|
},
|
|
{
|
|
"epoch": 0.8817710786622703,
|
|
"grad_norm": 1.328144623680027,
|
|
"learning_rate": 3.604230380940871e-07,
|
|
"loss": 0.4135,
|
|
"step": 1638
|
|
},
|
|
{
|
|
"epoch": 0.8823094004441154,
|
|
"grad_norm": 1.8361744282067538,
|
|
"learning_rate": 3.5717869075769187e-07,
|
|
"loss": 0.4448,
|
|
"step": 1639
|
|
},
|
|
{
|
|
"epoch": 0.8828477222259605,
|
|
"grad_norm": 1.4454157174291669,
|
|
"learning_rate": 3.5394847055900794e-07,
|
|
"loss": 0.4339,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.8833860440078056,
|
|
"grad_norm": 1.6322475345286311,
|
|
"learning_rate": 3.5073238732692305e-07,
|
|
"loss": 0.4176,
|
|
"step": 1641
|
|
},
|
|
{
|
|
"epoch": 0.8839243657896507,
|
|
"grad_norm": 1.445292085363601,
|
|
"learning_rate": 3.475304508473071e-07,
|
|
"loss": 0.4554,
|
|
"step": 1642
|
|
},
|
|
{
|
|
"epoch": 0.8844626875714958,
|
|
"grad_norm": 1.4938616353672438,
|
|
"learning_rate": 3.44342670862986e-07,
|
|
"loss": 0.4088,
|
|
"step": 1643
|
|
},
|
|
{
|
|
"epoch": 0.8850010093533409,
|
|
"grad_norm": 1.47760594711673,
|
|
"learning_rate": 3.411690570737097e-07,
|
|
"loss": 0.3793,
|
|
"step": 1644
|
|
},
|
|
{
|
|
"epoch": 0.885539331135186,
|
|
"grad_norm": 1.6041036008050786,
|
|
"learning_rate": 3.3800961913612427e-07,
|
|
"loss": 0.4648,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.8860776529170311,
|
|
"grad_norm": 1.6055085861001368,
|
|
"learning_rate": 3.3486436666374024e-07,
|
|
"loss": 0.3958,
|
|
"step": 1646
|
|
},
|
|
{
|
|
"epoch": 0.8866159746988762,
|
|
"grad_norm": 1.592597656491022,
|
|
"learning_rate": 3.3173330922690594e-07,
|
|
"loss": 0.4534,
|
|
"step": 1647
|
|
},
|
|
{
|
|
"epoch": 0.8871542964807213,
|
|
"grad_norm": 1.3972942678399092,
|
|
"learning_rate": 3.2861645635277715e-07,
|
|
"loss": 0.4075,
|
|
"step": 1648
|
|
},
|
|
{
|
|
"epoch": 0.8876926182625664,
|
|
"grad_norm": 1.299571800868061,
|
|
"learning_rate": 3.255138175252859e-07,
|
|
"loss": 0.4322,
|
|
"step": 1649
|
|
},
|
|
{
|
|
"epoch": 0.8882309400444115,
|
|
"grad_norm": 1.6074089216828915,
|
|
"learning_rate": 3.22425402185117e-07,
|
|
"loss": 0.4442,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.8887692618262566,
|
|
"grad_norm": 1.6515277192815747,
|
|
"learning_rate": 3.1935121972967387e-07,
|
|
"loss": 0.3974,
|
|
"step": 1651
|
|
},
|
|
{
|
|
"epoch": 0.8893075836081017,
|
|
"grad_norm": 1.9560867162587892,
|
|
"learning_rate": 3.1629127951305407e-07,
|
|
"loss": 0.4419,
|
|
"step": 1652
|
|
},
|
|
{
|
|
"epoch": 0.8898459053899468,
|
|
"grad_norm": 1.4109620050170866,
|
|
"learning_rate": 3.132455908460175e-07,
|
|
"loss": 0.4006,
|
|
"step": 1653
|
|
},
|
|
{
|
|
"epoch": 0.8903842271717919,
|
|
"grad_norm": 1.3778369174445322,
|
|
"learning_rate": 3.1021416299595985e-07,
|
|
"loss": 0.3917,
|
|
"step": 1654
|
|
},
|
|
{
|
|
"epoch": 0.890922548953637,
|
|
"grad_norm": 1.7547858079840999,
|
|
"learning_rate": 3.0719700518688447e-07,
|
|
"loss": 0.4698,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.8914608707354821,
|
|
"grad_norm": 1.5659476763978994,
|
|
"learning_rate": 3.0419412659937477e-07,
|
|
"loss": 0.4172,
|
|
"step": 1656
|
|
},
|
|
{
|
|
"epoch": 0.8919991925173272,
|
|
"grad_norm": 3.093400384631848,
|
|
"learning_rate": 3.0120553637056293e-07,
|
|
"loss": 0.3883,
|
|
"step": 1657
|
|
},
|
|
{
|
|
"epoch": 0.8925375142991724,
|
|
"grad_norm": 1.4466790084982413,
|
|
"learning_rate": 2.9823124359410706e-07,
|
|
"loss": 0.391,
|
|
"step": 1658
|
|
},
|
|
{
|
|
"epoch": 0.8930758360810175,
|
|
"grad_norm": 1.2602029099448362,
|
|
"learning_rate": 2.9527125732015995e-07,
|
|
"loss": 0.41,
|
|
"step": 1659
|
|
},
|
|
{
|
|
"epoch": 0.8936141578628626,
|
|
"grad_norm": 1.5682198116188635,
|
|
"learning_rate": 2.923255865553432e-07,
|
|
"loss": 0.4361,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.8941524796447077,
|
|
"grad_norm": 1.7284038118874672,
|
|
"learning_rate": 2.8939424026271923e-07,
|
|
"loss": 0.4248,
|
|
"step": 1661
|
|
},
|
|
{
|
|
"epoch": 0.8946908014265528,
|
|
"grad_norm": 1.4256983828332148,
|
|
"learning_rate": 2.8647722736176333e-07,
|
|
"loss": 0.4291,
|
|
"step": 1662
|
|
},
|
|
{
|
|
"epoch": 0.8952291232083979,
|
|
"grad_norm": 1.4976102627551229,
|
|
"learning_rate": 2.8357455672833933e-07,
|
|
"loss": 0.3813,
|
|
"step": 1663
|
|
},
|
|
{
|
|
"epoch": 0.895767444990243,
|
|
"grad_norm": 1.8854495681463317,
|
|
"learning_rate": 2.8068623719466725e-07,
|
|
"loss": 0.4516,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"epoch": 0.8963057667720881,
|
|
"grad_norm": 1.5693149002013742,
|
|
"learning_rate": 2.7781227754930253e-07,
|
|
"loss": 0.4585,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.8968440885539332,
|
|
"grad_norm": 1.573734503341506,
|
|
"learning_rate": 2.7495268653710493e-07,
|
|
"loss": 0.4483,
|
|
"step": 1666
|
|
},
|
|
{
|
|
"epoch": 0.8973824103357783,
|
|
"grad_norm": 1.5481263062327042,
|
|
"learning_rate": 2.7210747285921435e-07,
|
|
"loss": 0.4468,
|
|
"step": 1667
|
|
},
|
|
{
|
|
"epoch": 0.8979207321176234,
|
|
"grad_norm": 1.7822442462595496,
|
|
"learning_rate": 2.692766451730233e-07,
|
|
"loss": 0.4234,
|
|
"step": 1668
|
|
},
|
|
{
|
|
"epoch": 0.8984590538994685,
|
|
"grad_norm": 1.8797060608535148,
|
|
"learning_rate": 2.6646021209215003e-07,
|
|
"loss": 0.4063,
|
|
"step": 1669
|
|
},
|
|
{
|
|
"epoch": 0.8989973756813135,
|
|
"grad_norm": 1.4047802142985153,
|
|
"learning_rate": 2.636581821864148e-07,
|
|
"loss": 0.3933,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.8995356974631586,
|
|
"grad_norm": 1.9919594742667397,
|
|
"learning_rate": 2.6087056398180823e-07,
|
|
"loss": 0.4259,
|
|
"step": 1671
|
|
},
|
|
{
|
|
"epoch": 0.9000740192450037,
|
|
"grad_norm": 1.439697905572551,
|
|
"learning_rate": 2.580973659604735e-07,
|
|
"loss": 0.4234,
|
|
"step": 1672
|
|
},
|
|
{
|
|
"epoch": 0.9006123410268488,
|
|
"grad_norm": 1.4340034850095604,
|
|
"learning_rate": 2.553385965606736e-07,
|
|
"loss": 0.4011,
|
|
"step": 1673
|
|
},
|
|
{
|
|
"epoch": 0.9011506628086939,
|
|
"grad_norm": 1.6008407880111504,
|
|
"learning_rate": 2.525942641767687e-07,
|
|
"loss": 0.4064,
|
|
"step": 1674
|
|
},
|
|
{
|
|
"epoch": 0.901688984590539,
|
|
"grad_norm": 1.393769083088064,
|
|
"learning_rate": 2.498643771591908e-07,
|
|
"loss": 0.3878,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.9022273063723841,
|
|
"grad_norm": 1.5473000323872435,
|
|
"learning_rate": 2.47148943814417e-07,
|
|
"loss": 0.4125,
|
|
"step": 1676
|
|
},
|
|
{
|
|
"epoch": 0.9027656281542292,
|
|
"grad_norm": 1.504947787937997,
|
|
"learning_rate": 2.4444797240494533e-07,
|
|
"loss": 0.4328,
|
|
"step": 1677
|
|
},
|
|
{
|
|
"epoch": 0.9033039499360743,
|
|
"grad_norm": 1.8071042005817233,
|
|
"learning_rate": 2.4176147114927e-07,
|
|
"loss": 0.4429,
|
|
"step": 1678
|
|
},
|
|
{
|
|
"epoch": 0.9038422717179194,
|
|
"grad_norm": 1.5975781936612632,
|
|
"learning_rate": 2.3908944822185144e-07,
|
|
"loss": 0.4279,
|
|
"step": 1679
|
|
},
|
|
{
|
|
"epoch": 0.9043805934997645,
|
|
"grad_norm": 1.4408734852067904,
|
|
"learning_rate": 2.364319117531011e-07,
|
|
"loss": 0.404,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.9043805934997645,
|
|
"eval_loss": 0.42025431990623474,
|
|
"eval_runtime": 1550.3923,
|
|
"eval_samples_per_second": 16.131,
|
|
"eval_steps_per_second": 0.504,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.9049189152816096,
|
|
"grad_norm": 1.6629310324181896,
|
|
"learning_rate": 2.3378886982934778e-07,
|
|
"loss": 0.4876,
|
|
"step": 1681
|
|
},
|
|
{
|
|
"epoch": 0.9054572370634547,
|
|
"grad_norm": 1.5275509334845596,
|
|
"learning_rate": 2.311603304928173e-07,
|
|
"loss": 0.4428,
|
|
"step": 1682
|
|
},
|
|
{
|
|
"epoch": 0.9059955588452998,
|
|
"grad_norm": 1.6372832685609333,
|
|
"learning_rate": 2.285463017416073e-07,
|
|
"loss": 0.4815,
|
|
"step": 1683
|
|
},
|
|
{
|
|
"epoch": 0.9065338806271449,
|
|
"grad_norm": 1.846596894090347,
|
|
"learning_rate": 2.2594679152966258e-07,
|
|
"loss": 0.4724,
|
|
"step": 1684
|
|
},
|
|
{
|
|
"epoch": 0.90707220240899,
|
|
"grad_norm": 1.7091710123282846,
|
|
"learning_rate": 2.2336180776675154e-07,
|
|
"loss": 0.4447,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.9076105241908351,
|
|
"grad_norm": 1.4759554995733482,
|
|
"learning_rate": 2.2079135831843956e-07,
|
|
"loss": 0.4421,
|
|
"step": 1686
|
|
},
|
|
{
|
|
"epoch": 0.9081488459726802,
|
|
"grad_norm": 1.4044547819882969,
|
|
"learning_rate": 2.1823545100606914e-07,
|
|
"loss": 0.4438,
|
|
"step": 1687
|
|
},
|
|
{
|
|
"epoch": 0.9086871677545253,
|
|
"grad_norm": 1.6839786445608516,
|
|
"learning_rate": 2.1569409360673422e-07,
|
|
"loss": 0.4295,
|
|
"step": 1688
|
|
},
|
|
{
|
|
"epoch": 0.9092254895363704,
|
|
"grad_norm": 1.695687328944884,
|
|
"learning_rate": 2.131672938532553e-07,
|
|
"loss": 0.4001,
|
|
"step": 1689
|
|
},
|
|
{
|
|
"epoch": 0.9097638113182155,
|
|
"grad_norm": 1.6064285368620497,
|
|
"learning_rate": 2.1065505943415775e-07,
|
|
"loss": 0.426,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.9103021331000606,
|
|
"grad_norm": 1.805677873651136,
|
|
"learning_rate": 2.0815739799364743e-07,
|
|
"loss": 0.4109,
|
|
"step": 1691
|
|
},
|
|
{
|
|
"epoch": 0.9108404548819057,
|
|
"grad_norm": 1.6393066274059234,
|
|
"learning_rate": 2.0567431713158726e-07,
|
|
"loss": 0.4377,
|
|
"step": 1692
|
|
},
|
|
{
|
|
"epoch": 0.9113787766637508,
|
|
"grad_norm": 1.6183131956225818,
|
|
"learning_rate": 2.032058244034757e-07,
|
|
"loss": 0.4412,
|
|
"step": 1693
|
|
},
|
|
{
|
|
"epoch": 0.9119170984455959,
|
|
"grad_norm": 1.5002695967364554,
|
|
"learning_rate": 2.007519273204206e-07,
|
|
"loss": 0.4437,
|
|
"step": 1694
|
|
},
|
|
{
|
|
"epoch": 0.912455420227441,
|
|
"grad_norm": 1.647362717510626,
|
|
"learning_rate": 1.9831263334911977e-07,
|
|
"loss": 0.4808,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.9129937420092861,
|
|
"grad_norm": 1.5964438963275278,
|
|
"learning_rate": 1.95887949911836e-07,
|
|
"loss": 0.4393,
|
|
"step": 1696
|
|
},
|
|
{
|
|
"epoch": 0.9135320637911312,
|
|
"grad_norm": 1.8713869106599383,
|
|
"learning_rate": 1.934778843863766e-07,
|
|
"loss": 0.434,
|
|
"step": 1697
|
|
},
|
|
{
|
|
"epoch": 0.9140703855729763,
|
|
"grad_norm": 1.9039547376831083,
|
|
"learning_rate": 1.9108244410606823e-07,
|
|
"loss": 0.4364,
|
|
"step": 1698
|
|
},
|
|
{
|
|
"epoch": 0.9146087073548214,
|
|
"grad_norm": 1.5450254177283191,
|
|
"learning_rate": 1.887016363597366e-07,
|
|
"loss": 0.4589,
|
|
"step": 1699
|
|
},
|
|
{
|
|
"epoch": 0.9151470291366665,
|
|
"grad_norm": 1.543879530191546,
|
|
"learning_rate": 1.8633546839168403e-07,
|
|
"loss": 0.4064,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.9156853509185116,
|
|
"grad_norm": 1.5304353330893454,
|
|
"learning_rate": 1.839839474016658e-07,
|
|
"loss": 0.442,
|
|
"step": 1701
|
|
},
|
|
{
|
|
"epoch": 0.9162236727003567,
|
|
"grad_norm": 2.3452574340826233,
|
|
"learning_rate": 1.8164708054487002e-07,
|
|
"loss": 0.422,
|
|
"step": 1702
|
|
},
|
|
{
|
|
"epoch": 0.9167619944822017,
|
|
"grad_norm": 1.9150867244566236,
|
|
"learning_rate": 1.7932487493189598e-07,
|
|
"loss": 0.4294,
|
|
"step": 1703
|
|
},
|
|
{
|
|
"epoch": 0.9173003162640468,
|
|
"grad_norm": 1.6124806051656038,
|
|
"learning_rate": 1.7701733762873152e-07,
|
|
"loss": 0.428,
|
|
"step": 1704
|
|
},
|
|
{
|
|
"epoch": 0.9178386380458919,
|
|
"grad_norm": 1.4187608860726189,
|
|
"learning_rate": 1.7472447565673177e-07,
|
|
"loss": 0.4038,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.918376959827737,
|
|
"grad_norm": 1.4661931221135862,
|
|
"learning_rate": 1.7244629599259767e-07,
|
|
"loss": 0.3848,
|
|
"step": 1706
|
|
},
|
|
{
|
|
"epoch": 0.9189152816095821,
|
|
"grad_norm": 1.6206434175751971,
|
|
"learning_rate": 1.7018280556835632e-07,
|
|
"loss": 0.3851,
|
|
"step": 1707
|
|
},
|
|
{
|
|
"epoch": 0.9194536033914272,
|
|
"grad_norm": 1.8423442465927384,
|
|
"learning_rate": 1.6793401127133513e-07,
|
|
"loss": 0.4079,
|
|
"step": 1708
|
|
},
|
|
{
|
|
"epoch": 0.9199919251732723,
|
|
"grad_norm": 1.3950233471823357,
|
|
"learning_rate": 1.6569991994414835e-07,
|
|
"loss": 0.3994,
|
|
"step": 1709
|
|
},
|
|
{
|
|
"epoch": 0.9205302469551174,
|
|
"grad_norm": 1.5142214065755961,
|
|
"learning_rate": 1.6348053838466937e-07,
|
|
"loss": 0.4189,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.9210685687369625,
|
|
"grad_norm": 1.5917351975615364,
|
|
"learning_rate": 1.6127587334601458e-07,
|
|
"loss": 0.4314,
|
|
"step": 1711
|
|
},
|
|
{
|
|
"epoch": 0.9216068905188076,
|
|
"grad_norm": 1.605064219083874,
|
|
"learning_rate": 1.5908593153651952e-07,
|
|
"loss": 0.4237,
|
|
"step": 1712
|
|
},
|
|
{
|
|
"epoch": 0.9221452123006527,
|
|
"grad_norm": 1.7341654884483175,
|
|
"learning_rate": 1.5691071961972116e-07,
|
|
"loss": 0.4131,
|
|
"step": 1713
|
|
},
|
|
{
|
|
"epoch": 0.9226835340824978,
|
|
"grad_norm": 1.6343186301580133,
|
|
"learning_rate": 1.547502442143356e-07,
|
|
"loss": 0.4233,
|
|
"step": 1714
|
|
},
|
|
{
|
|
"epoch": 0.9232218558643429,
|
|
"grad_norm": 1.5099995374537671,
|
|
"learning_rate": 1.526045118942404e-07,
|
|
"loss": 0.3982,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.923760177646188,
|
|
"grad_norm": 1.7958348974891065,
|
|
"learning_rate": 1.504735291884507e-07,
|
|
"loss": 0.4331,
|
|
"step": 1716
|
|
},
|
|
{
|
|
"epoch": 0.9242984994280331,
|
|
"grad_norm": 1.7356588334735397,
|
|
"learning_rate": 1.4835730258110303e-07,
|
|
"loss": 0.4357,
|
|
"step": 1717
|
|
},
|
|
{
|
|
"epoch": 0.9248368212098782,
|
|
"grad_norm": 2.500196744283525,
|
|
"learning_rate": 1.4625583851143432e-07,
|
|
"loss": 0.3799,
|
|
"step": 1718
|
|
},
|
|
{
|
|
"epoch": 0.9253751429917233,
|
|
"grad_norm": 1.3646453068750661,
|
|
"learning_rate": 1.4416914337376132e-07,
|
|
"loss": 0.4128,
|
|
"step": 1719
|
|
},
|
|
{
|
|
"epoch": 0.9259134647735684,
|
|
"grad_norm": 1.642640642870041,
|
|
"learning_rate": 1.420972235174628e-07,
|
|
"loss": 0.4506,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.9264517865554135,
|
|
"grad_norm": 1.592814733182936,
|
|
"learning_rate": 1.4004008524695912e-07,
|
|
"loss": 0.4296,
|
|
"step": 1721
|
|
},
|
|
{
|
|
"epoch": 0.9269901083372586,
|
|
"grad_norm": 1.4652552983592342,
|
|
"learning_rate": 1.3799773482169378e-07,
|
|
"loss": 0.4233,
|
|
"step": 1722
|
|
},
|
|
{
|
|
"epoch": 0.9275284301191037,
|
|
"grad_norm": 1.7410090898687602,
|
|
"learning_rate": 1.3597017845611181e-07,
|
|
"loss": 0.4594,
|
|
"step": 1723
|
|
},
|
|
{
|
|
"epoch": 0.9280667519009488,
|
|
"grad_norm": 1.559448064084867,
|
|
"learning_rate": 1.3395742231964658e-07,
|
|
"loss": 0.4336,
|
|
"step": 1724
|
|
},
|
|
{
|
|
"epoch": 0.9286050736827939,
|
|
"grad_norm": 1.9623398348887997,
|
|
"learning_rate": 1.3195947253669518e-07,
|
|
"loss": 0.4724,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.929143395464639,
|
|
"grad_norm": 1.4765323135961603,
|
|
"learning_rate": 1.2997633518660125e-07,
|
|
"loss": 0.4122,
|
|
"step": 1726
|
|
},
|
|
{
|
|
"epoch": 0.9296817172464841,
|
|
"grad_norm": 1.9030353185015407,
|
|
"learning_rate": 1.2800801630364013e-07,
|
|
"loss": 0.4414,
|
|
"step": 1727
|
|
},
|
|
{
|
|
"epoch": 0.9302200390283292,
|
|
"grad_norm": 1.3486307498615422,
|
|
"learning_rate": 1.2605452187699484e-07,
|
|
"loss": 0.4799,
|
|
"step": 1728
|
|
},
|
|
{
|
|
"epoch": 0.9307583608101743,
|
|
"grad_norm": 1.4474994381201687,
|
|
"learning_rate": 1.2411585785074232e-07,
|
|
"loss": 0.4353,
|
|
"step": 1729
|
|
},
|
|
{
|
|
"epoch": 0.9312966825920194,
|
|
"grad_norm": 1.460955137197927,
|
|
"learning_rate": 1.221920301238333e-07,
|
|
"loss": 0.4248,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.9318350043738645,
|
|
"grad_norm": 1.8140612572363009,
|
|
"learning_rate": 1.2028304455007412e-07,
|
|
"loss": 0.3888,
|
|
"step": 1731
|
|
},
|
|
{
|
|
"epoch": 0.9323733261557096,
|
|
"grad_norm": 1.4724419135884532,
|
|
"learning_rate": 1.1838890693811055e-07,
|
|
"loss": 0.3868,
|
|
"step": 1732
|
|
},
|
|
{
|
|
"epoch": 0.9329116479375547,
|
|
"grad_norm": 1.4562877473919869,
|
|
"learning_rate": 1.1650962305140845e-07,
|
|
"loss": 0.4305,
|
|
"step": 1733
|
|
},
|
|
{
|
|
"epoch": 0.9334499697193998,
|
|
"grad_norm": 2.0045234339432763,
|
|
"learning_rate": 1.1464519860823698e-07,
|
|
"loss": 0.5062,
|
|
"step": 1734
|
|
},
|
|
{
|
|
"epoch": 0.9339882915012448,
|
|
"grad_norm": 1.8962618785171959,
|
|
"learning_rate": 1.1279563928165094e-07,
|
|
"loss": 0.4049,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.93452661328309,
|
|
"grad_norm": 1.580337734175196,
|
|
"learning_rate": 1.1096095069947466e-07,
|
|
"loss": 0.4465,
|
|
"step": 1736
|
|
},
|
|
{
|
|
"epoch": 0.935064935064935,
|
|
"grad_norm": 1.6703156179249958,
|
|
"learning_rate": 1.091411384442831e-07,
|
|
"loss": 0.4174,
|
|
"step": 1737
|
|
},
|
|
{
|
|
"epoch": 0.9356032568467801,
|
|
"grad_norm": 1.4707795804039079,
|
|
"learning_rate": 1.0733620805338462e-07,
|
|
"loss": 0.3582,
|
|
"step": 1738
|
|
},
|
|
{
|
|
"epoch": 0.9361415786286252,
|
|
"grad_norm": 1.5443607495595517,
|
|
"learning_rate": 1.0554616501880722e-07,
|
|
"loss": 0.4322,
|
|
"step": 1739
|
|
},
|
|
{
|
|
"epoch": 0.9366799004104703,
|
|
"grad_norm": 1.647874029047969,
|
|
"learning_rate": 1.0377101478727835e-07,
|
|
"loss": 0.4465,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.9366799004104703,
|
|
"eval_loss": 0.41988879442214966,
|
|
"eval_runtime": 1559.0337,
|
|
"eval_samples_per_second": 16.042,
|
|
"eval_steps_per_second": 0.502,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.9372182221923154,
|
|
"grad_norm": 1.6210033117188805,
|
|
"learning_rate": 1.0201076276021072e-07,
|
|
"loss": 0.4432,
|
|
"step": 1741
|
|
},
|
|
{
|
|
"epoch": 0.9377565439741605,
|
|
"grad_norm": 1.9123170938822815,
|
|
"learning_rate": 1.0026541429368431e-07,
|
|
"loss": 0.4024,
|
|
"step": 1742
|
|
},
|
|
{
|
|
"epoch": 0.9382948657560056,
|
|
"grad_norm": 2.5680416907462864,
|
|
"learning_rate": 9.853497469843043e-08,
|
|
"loss": 0.3973,
|
|
"step": 1743
|
|
},
|
|
{
|
|
"epoch": 0.9388331875378507,
|
|
"grad_norm": 1.462242975230514,
|
|
"learning_rate": 9.681944923981724e-08,
|
|
"loss": 0.455,
|
|
"step": 1744
|
|
},
|
|
{
|
|
"epoch": 0.9393715093196958,
|
|
"grad_norm": 1.4330622858448745,
|
|
"learning_rate": 9.511884313782915e-08,
|
|
"loss": 0.409,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.9399098311015409,
|
|
"grad_norm": 1.5924131568344673,
|
|
"learning_rate": 9.343316156705751e-08,
|
|
"loss": 0.4709,
|
|
"step": 1746
|
|
},
|
|
{
|
|
"epoch": 0.940448152883386,
|
|
"grad_norm": 2.1748083360521,
|
|
"learning_rate": 9.176240965668049e-08,
|
|
"loss": 0.4975,
|
|
"step": 1747
|
|
},
|
|
{
|
|
"epoch": 0.9409864746652311,
|
|
"grad_norm": 2.240808535802813,
|
|
"learning_rate": 9.01065924904465e-08,
|
|
"loss": 0.4817,
|
|
"step": 1748
|
|
},
|
|
{
|
|
"epoch": 0.9415247964470762,
|
|
"grad_norm": 1.7231015704313604,
|
|
"learning_rate": 8.846571510666369e-08,
|
|
"loss": 0.4094,
|
|
"step": 1749
|
|
},
|
|
{
|
|
"epoch": 0.9420631182289213,
|
|
"grad_norm": 1.4693480082476622,
|
|
"learning_rate": 8.683978249817981e-08,
|
|
"loss": 0.4453,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.9426014400107664,
|
|
"grad_norm": 1.6509935540008158,
|
|
"learning_rate": 8.52287996123674e-08,
|
|
"loss": 0.4065,
|
|
"step": 1751
|
|
},
|
|
{
|
|
"epoch": 0.9431397617926115,
|
|
"grad_norm": 1.6701873629796138,
|
|
"learning_rate": 8.363277135111314e-08,
|
|
"loss": 0.3761,
|
|
"step": 1752
|
|
},
|
|
{
|
|
"epoch": 0.9436780835744566,
|
|
"grad_norm": 1.2809352240300242,
|
|
"learning_rate": 8.205170257079786e-08,
|
|
"loss": 0.4159,
|
|
"step": 1753
|
|
},
|
|
{
|
|
"epoch": 0.9442164053563017,
|
|
"grad_norm": 1.62872520153001,
|
|
"learning_rate": 8.048559808228496e-08,
|
|
"loss": 0.3973,
|
|
"step": 1754
|
|
},
|
|
{
|
|
"epoch": 0.9447547271381468,
|
|
"grad_norm": 1.6888413344801536,
|
|
"learning_rate": 7.89344626509031e-08,
|
|
"loss": 0.4219,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.9452930489199919,
|
|
"grad_norm": 1.6223202323912347,
|
|
"learning_rate": 7.739830099643464e-08,
|
|
"loss": 0.4303,
|
|
"step": 1756
|
|
},
|
|
{
|
|
"epoch": 0.945831370701837,
|
|
"grad_norm": 1.2810729846885742,
|
|
"learning_rate": 7.587711779309947e-08,
|
|
"loss": 0.3868,
|
|
"step": 1757
|
|
},
|
|
{
|
|
"epoch": 0.9463696924836821,
|
|
"grad_norm": 1.6840497326805903,
|
|
"learning_rate": 7.437091766954119e-08,
|
|
"loss": 0.434,
|
|
"step": 1758
|
|
},
|
|
{
|
|
"epoch": 0.9469080142655272,
|
|
"grad_norm": 1.765752446431431,
|
|
"learning_rate": 7.287970520881205e-08,
|
|
"loss": 0.4461,
|
|
"step": 1759
|
|
},
|
|
{
|
|
"epoch": 0.9474463360473723,
|
|
"grad_norm": 1.4694297184744327,
|
|
"learning_rate": 7.140348494836191e-08,
|
|
"loss": 0.4374,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.9479846578292174,
|
|
"grad_norm": 1.456090878683348,
|
|
"learning_rate": 6.994226138002047e-08,
|
|
"loss": 0.4204,
|
|
"step": 1761
|
|
},
|
|
{
|
|
"epoch": 0.9485229796110625,
|
|
"grad_norm": 1.5114503786906142,
|
|
"learning_rate": 6.849603894998725e-08,
|
|
"loss": 0.4431,
|
|
"step": 1762
|
|
},
|
|
{
|
|
"epoch": 0.9490613013929076,
|
|
"grad_norm": 1.9303693867033398,
|
|
"learning_rate": 6.706482205881548e-08,
|
|
"loss": 0.4292,
|
|
"step": 1763
|
|
},
|
|
{
|
|
"epoch": 0.9495996231747527,
|
|
"grad_norm": 1.3436489528854563,
|
|
"learning_rate": 6.564861506139996e-08,
|
|
"loss": 0.3854,
|
|
"step": 1764
|
|
},
|
|
{
|
|
"epoch": 0.9501379449565978,
|
|
"grad_norm": 1.3843500014884988,
|
|
"learning_rate": 6.424742226696312e-08,
|
|
"loss": 0.3969,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.9506762667384429,
|
|
"grad_norm": 1.3401735876692071,
|
|
"learning_rate": 6.286124793904336e-08,
|
|
"loss": 0.4183,
|
|
"step": 1766
|
|
},
|
|
{
|
|
"epoch": 0.951214588520288,
|
|
"grad_norm": 1.685672633138118,
|
|
"learning_rate": 6.149009629547897e-08,
|
|
"loss": 0.4468,
|
|
"step": 1767
|
|
},
|
|
{
|
|
"epoch": 0.951752910302133,
|
|
"grad_norm": 1.8943339017606036,
|
|
"learning_rate": 6.013397150839983e-08,
|
|
"loss": 0.4361,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 0.9522912320839781,
|
|
"grad_norm": 1.7967244404705551,
|
|
"learning_rate": 5.8792877704211274e-08,
|
|
"loss": 0.4491,
|
|
"step": 1769
|
|
},
|
|
{
|
|
"epoch": 0.9528295538658232,
|
|
"grad_norm": 1.4606147240071112,
|
|
"learning_rate": 5.746681896358131e-08,
|
|
"loss": 0.4019,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.9533678756476683,
|
|
"grad_norm": 1.455938194249448,
|
|
"learning_rate": 5.615579932143067e-08,
|
|
"loss": 0.3948,
|
|
"step": 1771
|
|
},
|
|
{
|
|
"epoch": 0.9539061974295135,
|
|
"grad_norm": 1.2759206549407909,
|
|
"learning_rate": 5.485982276691892e-08,
|
|
"loss": 0.3949,
|
|
"step": 1772
|
|
},
|
|
{
|
|
"epoch": 0.9544445192113586,
|
|
"grad_norm": 1.5731889340664074,
|
|
"learning_rate": 5.35788932434328e-08,
|
|
"loss": 0.4422,
|
|
"step": 1773
|
|
},
|
|
{
|
|
"epoch": 0.9549828409932037,
|
|
"grad_norm": 1.4900834870938766,
|
|
"learning_rate": 5.2313014648573966e-08,
|
|
"loss": 0.3651,
|
|
"step": 1774
|
|
},
|
|
{
|
|
"epoch": 0.9555211627750488,
|
|
"grad_norm": 1.3653648358156305,
|
|
"learning_rate": 5.1062190834146875e-08,
|
|
"loss": 0.403,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.9560594845568939,
|
|
"grad_norm": 1.5012692588758656,
|
|
"learning_rate": 4.9826425606148145e-08,
|
|
"loss": 0.4056,
|
|
"step": 1776
|
|
},
|
|
{
|
|
"epoch": 0.956597806338739,
|
|
"grad_norm": 1.7114437223613954,
|
|
"learning_rate": 4.860572272475384e-08,
|
|
"loss": 0.4219,
|
|
"step": 1777
|
|
},
|
|
{
|
|
"epoch": 0.9571361281205841,
|
|
"grad_norm": 1.5710449681536929,
|
|
"learning_rate": 4.740008590430778e-08,
|
|
"loss": 0.4504,
|
|
"step": 1778
|
|
},
|
|
{
|
|
"epoch": 0.9576744499024292,
|
|
"grad_norm": 1.5334464777855485,
|
|
"learning_rate": 4.620951881331215e-08,
|
|
"loss": 0.4078,
|
|
"step": 1779
|
|
},
|
|
{
|
|
"epoch": 0.9582127716842743,
|
|
"grad_norm": 1.665311340751073,
|
|
"learning_rate": 4.5034025074414124e-08,
|
|
"loss": 0.388,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.9587510934661194,
|
|
"grad_norm": 1.6819133415223784,
|
|
"learning_rate": 4.3873608264397014e-08,
|
|
"loss": 0.4318,
|
|
"step": 1781
|
|
},
|
|
{
|
|
"epoch": 0.9592894152479645,
|
|
"grad_norm": 2.1910803064926947,
|
|
"learning_rate": 4.272827191416584e-08,
|
|
"loss": 0.3862,
|
|
"step": 1782
|
|
},
|
|
{
|
|
"epoch": 0.9598277370298096,
|
|
"grad_norm": 1.3743310605178427,
|
|
"learning_rate": 4.159801950874176e-08,
|
|
"loss": 0.382,
|
|
"step": 1783
|
|
},
|
|
{
|
|
"epoch": 0.9603660588116547,
|
|
"grad_norm": 1.753291691489888,
|
|
"learning_rate": 4.048285448724709e-08,
|
|
"loss": 0.4677,
|
|
"step": 1784
|
|
},
|
|
{
|
|
"epoch": 0.9609043805934998,
|
|
"grad_norm": 1.4424214242693971,
|
|
"learning_rate": 3.938278024289644e-08,
|
|
"loss": 0.4012,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.9614427023753449,
|
|
"grad_norm": 1.4573151134275804,
|
|
"learning_rate": 3.829780012298612e-08,
|
|
"loss": 0.4058,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"epoch": 0.96198102415719,
|
|
"grad_norm": 1.4245212432098524,
|
|
"learning_rate": 3.722791742888476e-08,
|
|
"loss": 0.3958,
|
|
"step": 1787
|
|
},
|
|
{
|
|
"epoch": 0.9625193459390351,
|
|
"grad_norm": 1.533496999870574,
|
|
"learning_rate": 3.617313541602274e-08,
|
|
"loss": 0.4195,
|
|
"step": 1788
|
|
},
|
|
{
|
|
"epoch": 0.9630576677208802,
|
|
"grad_norm": 1.854726516234056,
|
|
"learning_rate": 3.5133457293881626e-08,
|
|
"loss": 0.4376,
|
|
"step": 1789
|
|
},
|
|
{
|
|
"epoch": 0.9635959895027253,
|
|
"grad_norm": 1.9373159151394588,
|
|
"learning_rate": 3.410888622598585e-08,
|
|
"loss": 0.4312,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.9641343112845704,
|
|
"grad_norm": 2.153201724460075,
|
|
"learning_rate": 3.3099425329890525e-08,
|
|
"loss": 0.4494,
|
|
"step": 1791
|
|
},
|
|
{
|
|
"epoch": 0.9646726330664155,
|
|
"grad_norm": 1.4498518000265068,
|
|
"learning_rate": 3.210507767717586e-08,
|
|
"loss": 0.4199,
|
|
"step": 1792
|
|
},
|
|
{
|
|
"epoch": 0.9652109548482606,
|
|
"grad_norm": 1.6032986767797375,
|
|
"learning_rate": 3.1125846293433846e-08,
|
|
"loss": 0.3771,
|
|
"step": 1793
|
|
},
|
|
{
|
|
"epoch": 0.9657492766301057,
|
|
"grad_norm": 2.1622319654687057,
|
|
"learning_rate": 3.0161734158261625e-08,
|
|
"loss": 0.4214,
|
|
"step": 1794
|
|
},
|
|
{
|
|
"epoch": 0.9662875984119508,
|
|
"grad_norm": 1.4345400536711836,
|
|
"learning_rate": 2.9212744205252553e-08,
|
|
"loss": 0.3797,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.9668259201937959,
|
|
"grad_norm": 1.6565073229021858,
|
|
"learning_rate": 2.8278879321983477e-08,
|
|
"loss": 0.3874,
|
|
"step": 1796
|
|
},
|
|
{
|
|
"epoch": 0.967364241975641,
|
|
"grad_norm": 2.0557097314570196,
|
|
"learning_rate": 2.736014235001194e-08,
|
|
"loss": 0.4341,
|
|
"step": 1797
|
|
},
|
|
{
|
|
"epoch": 0.9679025637574861,
|
|
"grad_norm": 1.64490095462292,
|
|
"learning_rate": 2.6456536084862872e-08,
|
|
"loss": 0.3979,
|
|
"step": 1798
|
|
},
|
|
{
|
|
"epoch": 0.9684408855393312,
|
|
"grad_norm": 1.6729564375619899,
|
|
"learning_rate": 2.5568063276021347e-08,
|
|
"loss": 0.397,
|
|
"step": 1799
|
|
},
|
|
{
|
|
"epoch": 0.9689792073211763,
|
|
"grad_norm": 1.5597222162662605,
|
|
"learning_rate": 2.4694726626925403e-08,
|
|
"loss": 0.432,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.9689792073211763,
|
|
"eval_loss": 0.4197918474674225,
|
|
"eval_runtime": 1571.0705,
|
|
"eval_samples_per_second": 15.919,
|
|
"eval_steps_per_second": 0.498,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.9695175291030214,
|
|
"grad_norm": 1.4076281710448164,
|
|
"learning_rate": 2.383652879495657e-08,
|
|
"loss": 0.3963,
|
|
"step": 1801
|
|
},
|
|
{
|
|
"epoch": 0.9700558508848665,
|
|
"grad_norm": 1.645367632025504,
|
|
"learning_rate": 2.299347239143157e-08,
|
|
"loss": 0.4272,
|
|
"step": 1802
|
|
},
|
|
{
|
|
"epoch": 0.9705941726667116,
|
|
"grad_norm": 1.3956889574044051,
|
|
"learning_rate": 2.2165559981595642e-08,
|
|
"loss": 0.429,
|
|
"step": 1803
|
|
},
|
|
{
|
|
"epoch": 0.9711324944485566,
|
|
"grad_norm": 1.4793349281728767,
|
|
"learning_rate": 2.1352794084613658e-08,
|
|
"loss": 0.4479,
|
|
"step": 1804
|
|
},
|
|
{
|
|
"epoch": 0.9716708162304017,
|
|
"grad_norm": 1.580535608856093,
|
|
"learning_rate": 2.0555177173562925e-08,
|
|
"loss": 0.431,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.9722091380122468,
|
|
"grad_norm": 1.7015563233283766,
|
|
"learning_rate": 1.9772711675425937e-08,
|
|
"loss": 0.3984,
|
|
"step": 1806
|
|
},
|
|
{
|
|
"epoch": 0.9727474597940919,
|
|
"grad_norm": 1.5158636017258738,
|
|
"learning_rate": 1.9005399971080974e-08,
|
|
"loss": 0.4166,
|
|
"step": 1807
|
|
},
|
|
{
|
|
"epoch": 0.973285781575937,
|
|
"grad_norm": 1.4220838677616172,
|
|
"learning_rate": 1.8253244395298186e-08,
|
|
"loss": 0.3988,
|
|
"step": 1808
|
|
},
|
|
{
|
|
"epoch": 0.9738241033577821,
|
|
"grad_norm": 1.3963959999222404,
|
|
"learning_rate": 1.7516247236731288e-08,
|
|
"loss": 0.4224,
|
|
"step": 1809
|
|
},
|
|
{
|
|
"epoch": 0.9743624251396272,
|
|
"grad_norm": 1.7337278360138024,
|
|
"learning_rate": 1.679441073790755e-08,
|
|
"loss": 0.4738,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.9749007469214723,
|
|
"grad_norm": 1.4861221398216466,
|
|
"learning_rate": 1.6087737095225598e-08,
|
|
"loss": 0.4449,
|
|
"step": 1811
|
|
},
|
|
{
|
|
"epoch": 0.9754390687033174,
|
|
"grad_norm": 1.3145810749185178,
|
|
"learning_rate": 1.539622845894595e-08,
|
|
"loss": 0.3885,
|
|
"step": 1812
|
|
},
|
|
{
|
|
"epoch": 0.9759773904851625,
|
|
"grad_norm": 1.3176971825763986,
|
|
"learning_rate": 1.471988693318549e-08,
|
|
"loss": 0.4232,
|
|
"step": 1813
|
|
},
|
|
{
|
|
"epoch": 0.9765157122670076,
|
|
"grad_norm": 1.442309770679218,
|
|
"learning_rate": 1.4058714575910238e-08,
|
|
"loss": 0.4328,
|
|
"step": 1814
|
|
},
|
|
{
|
|
"epoch": 0.9770540340488527,
|
|
"grad_norm": 1.5157478456952573,
|
|
"learning_rate": 1.3412713398930355e-08,
|
|
"loss": 0.3911,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.9775923558306978,
|
|
"grad_norm": 1.779840899462066,
|
|
"learning_rate": 1.2781885367892377e-08,
|
|
"loss": 0.4179,
|
|
"step": 1816
|
|
},
|
|
{
|
|
"epoch": 0.9781306776125429,
|
|
"grad_norm": 1.6067561255260123,
|
|
"learning_rate": 1.2166232402275325e-08,
|
|
"loss": 0.3987,
|
|
"step": 1817
|
|
},
|
|
{
|
|
"epoch": 0.978668999394388,
|
|
"grad_norm": 1.4429159861518235,
|
|
"learning_rate": 1.156575637538182e-08,
|
|
"loss": 0.3752,
|
|
"step": 1818
|
|
},
|
|
{
|
|
"epoch": 0.9792073211762331,
|
|
"grad_norm": 1.6134101059886168,
|
|
"learning_rate": 1.0980459114335318e-08,
|
|
"loss": 0.4491,
|
|
"step": 1819
|
|
},
|
|
{
|
|
"epoch": 0.9797456429580782,
|
|
"grad_norm": 1.3430032688894593,
|
|
"learning_rate": 1.0410342400073992e-08,
|
|
"loss": 0.4446,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.9802839647399233,
|
|
"grad_norm": 1.5854543749606242,
|
|
"learning_rate": 9.855407967344078e-09,
|
|
"loss": 0.4022,
|
|
"step": 1821
|
|
},
|
|
{
|
|
"epoch": 0.9808222865217684,
|
|
"grad_norm": 1.3429626400579588,
|
|
"learning_rate": 9.31565750469543e-09,
|
|
"loss": 0.4173,
|
|
"step": 1822
|
|
},
|
|
{
|
|
"epoch": 0.9813606083036135,
|
|
"grad_norm": 1.8181594324695687,
|
|
"learning_rate": 8.791092654476529e-09,
|
|
"loss": 0.4699,
|
|
"step": 1823
|
|
},
|
|
{
|
|
"epoch": 0.9818989300854586,
|
|
"grad_norm": 1.3189784151442827,
|
|
"learning_rate": 8.281715012827817e-09,
|
|
"loss": 0.3847,
|
|
"step": 1824
|
|
},
|
|
{
|
|
"epoch": 0.9824372518673037,
|
|
"grad_norm": 1.29942395236663,
|
|
"learning_rate": 7.78752612968059e-09,
|
|
"loss": 0.3989,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.9829755736491488,
|
|
"grad_norm": 1.6481398184837366,
|
|
"learning_rate": 7.3085275087475535e-09,
|
|
"loss": 0.385,
|
|
"step": 1826
|
|
},
|
|
{
|
|
"epoch": 0.9835138954309939,
|
|
"grad_norm": 1.2097016930732503,
|
|
"learning_rate": 6.844720607522282e-09,
|
|
"loss": 0.4635,
|
|
"step": 1827
|
|
},
|
|
{
|
|
"epoch": 0.984052217212839,
|
|
"grad_norm": 1.3353672523995217,
|
|
"learning_rate": 6.3961068372725425e-09,
|
|
"loss": 0.4659,
|
|
"step": 1828
|
|
},
|
|
{
|
|
"epoch": 0.9845905389946841,
|
|
"grad_norm": 1.6604758834668205,
|
|
"learning_rate": 5.962687563036418e-09,
|
|
"loss": 0.4182,
|
|
"step": 1829
|
|
},
|
|
{
|
|
"epoch": 0.9851288607765292,
|
|
"grad_norm": 1.365766973195823,
|
|
"learning_rate": 5.544464103618419e-09,
|
|
"loss": 0.4496,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.9856671825583743,
|
|
"grad_norm": 1.7311791534397065,
|
|
"learning_rate": 5.1414377315855965e-09,
|
|
"loss": 0.4091,
|
|
"step": 1831
|
|
},
|
|
{
|
|
"epoch": 0.9862055043402194,
|
|
"grad_norm": 1.6223056568910816,
|
|
"learning_rate": 4.753609673263104e-09,
|
|
"loss": 0.435,
|
|
"step": 1832
|
|
},
|
|
{
|
|
"epoch": 0.9867438261220645,
|
|
"grad_norm": 1.4811187708876057,
|
|
"learning_rate": 4.380981108730309e-09,
|
|
"loss": 0.4229,
|
|
"step": 1833
|
|
},
|
|
{
|
|
"epoch": 0.9872821479039096,
|
|
"grad_norm": 1.5639619332709622,
|
|
"learning_rate": 4.023553171819128e-09,
|
|
"loss": 0.4434,
|
|
"step": 1834
|
|
},
|
|
{
|
|
"epoch": 0.9878204696857547,
|
|
"grad_norm": 1.4607336838401341,
|
|
"learning_rate": 3.681326950107922e-09,
|
|
"loss": 0.3892,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.9883587914675998,
|
|
"grad_norm": 1.4459818740856154,
|
|
"learning_rate": 3.3543034849192746e-09,
|
|
"loss": 0.4613,
|
|
"step": 1836
|
|
},
|
|
{
|
|
"epoch": 0.9888971132494448,
|
|
"grad_norm": 1.727956071768554,
|
|
"learning_rate": 3.0424837713188825e-09,
|
|
"loss": 0.4321,
|
|
"step": 1837
|
|
},
|
|
{
|
|
"epoch": 0.98943543503129,
|
|
"grad_norm": 1.4250494159267046,
|
|
"learning_rate": 2.7458687581072284e-09,
|
|
"loss": 0.4361,
|
|
"step": 1838
|
|
},
|
|
{
|
|
"epoch": 0.989973756813135,
|
|
"grad_norm": 1.6825614414547043,
|
|
"learning_rate": 2.4644593478240218e-09,
|
|
"loss": 0.4247,
|
|
"step": 1839
|
|
},
|
|
{
|
|
"epoch": 0.9905120785949801,
|
|
"grad_norm": 1.3394226647545722,
|
|
"learning_rate": 2.1982563967376525e-09,
|
|
"loss": 0.4224,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.9910504003768252,
|
|
"grad_norm": 1.3878090062249357,
|
|
"learning_rate": 1.9472607148490752e-09,
|
|
"loss": 0.4671,
|
|
"step": 1841
|
|
},
|
|
{
|
|
"epoch": 0.9915887221586703,
|
|
"grad_norm": 1.8045067084462034,
|
|
"learning_rate": 1.71147306588626e-09,
|
|
"loss": 0.4093,
|
|
"step": 1842
|
|
},
|
|
{
|
|
"epoch": 0.9921270439405154,
|
|
"grad_norm": 1.6487465697670387,
|
|
"learning_rate": 1.4908941673008604e-09,
|
|
"loss": 0.4768,
|
|
"step": 1843
|
|
},
|
|
{
|
|
"epoch": 0.9926653657223605,
|
|
"grad_norm": 1.3894142004683563,
|
|
"learning_rate": 1.2855246902693241e-09,
|
|
"loss": 0.4126,
|
|
"step": 1844
|
|
},
|
|
{
|
|
"epoch": 0.9932036875042056,
|
|
"grad_norm": 1.5382669595746958,
|
|
"learning_rate": 1.0953652596878972e-09,
|
|
"loss": 0.4662,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.9937420092860507,
|
|
"grad_norm": 1.5055759777025033,
|
|
"learning_rate": 9.204164541720683e-10,
|
|
"loss": 0.3911,
|
|
"step": 1846
|
|
},
|
|
{
|
|
"epoch": 0.9942803310678958,
|
|
"grad_norm": 1.4883627722190473,
|
|
"learning_rate": 7.606788060543491e-10,
|
|
"loss": 0.4005,
|
|
"step": 1847
|
|
},
|
|
{
|
|
"epoch": 0.9948186528497409,
|
|
"grad_norm": 1.7929841052447726,
|
|
"learning_rate": 6.16152801383163e-10,
|
|
"loss": 0.4239,
|
|
"step": 1848
|
|
},
|
|
{
|
|
"epoch": 0.995356974631586,
|
|
"grad_norm": 1.3514634100350202,
|
|
"learning_rate": 4.86838879921736e-10,
|
|
"loss": 0.4122,
|
|
"step": 1849
|
|
},
|
|
{
|
|
"epoch": 0.9958952964134311,
|
|
"grad_norm": 1.5688583282415778,
|
|
"learning_rate": 3.7273743514476544e-10,
|
|
"loss": 0.3613,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.9964336181952762,
|
|
"grad_norm": 1.3790895255701852,
|
|
"learning_rate": 2.73848814238975e-10,
|
|
"loss": 0.3974,
|
|
"step": 1851
|
|
},
|
|
{
|
|
"epoch": 0.9969719399771213,
|
|
"grad_norm": 1.4609310145673613,
|
|
"learning_rate": 1.9017331810256002e-10,
|
|
"loss": 0.4287,
|
|
"step": 1852
|
|
},
|
|
{
|
|
"epoch": 0.9975102617589664,
|
|
"grad_norm": 1.6915446904327818,
|
|
"learning_rate": 1.2171120134185643e-10,
|
|
"loss": 0.4238,
|
|
"step": 1853
|
|
},
|
|
{
|
|
"epoch": 0.9980485835408115,
|
|
"grad_norm": 1.636253995850887,
|
|
"learning_rate": 6.846267227356152e-11,
|
|
"loss": 0.4105,
|
|
"step": 1854
|
|
},
|
|
{
|
|
"epoch": 0.9985869053226566,
|
|
"grad_norm": 1.3210272324277625,
|
|
"learning_rate": 3.042789292140302e-11,
|
|
"loss": 0.3978,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.9991252271045017,
|
|
"grad_norm": 1.7798971238230394,
|
|
"learning_rate": 7.606979016694383e-12,
|
|
"loss": 0.4537,
|
|
"step": 1856
|
|
},
|
|
{
|
|
"epoch": 0.9996635488863468,
|
|
"grad_norm": 1.6132079869080023,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.4395,
|
|
"step": 1857
|
|
},
|
|
{
|
|
"epoch": 0.9996635488863468,
|
|
"step": 1857,
|
|
"total_flos": 1.243798906601472e+16,
|
|
"train_loss": 0.0,
|
|
"train_runtime": 0.4818,
|
|
"train_samples_per_second": 987062.335,
|
|
"train_steps_per_second": 3854.561
|
|
}
|
|
],
|
|
"logging_steps": 1.0,
|
|
"max_steps": 1857,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 60,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.243798906601472e+16,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|