Files
qwen_openthoughts_science_c…/trainer_state.json
ModelHub XC 4366ff2ebd 初始化项目,由ModelHub XC社区提供模型
Model: mlfoundations-dev/qwen_openthoughts_science_claude
Source: Original Platform
2026-04-11 18:45:57 +08:00

1954 lines
47 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.987419245154709,
"eval_steps": 500,
"global_step": 273,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01088065283917035,
"grad_norm": 5.769476412265319,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.8514,
"step": 1
},
{
"epoch": 0.0217613056783407,
"grad_norm": 5.824165855722914,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.8564,
"step": 2
},
{
"epoch": 0.032641958517511054,
"grad_norm": 5.385924326553742,
"learning_rate": 8.571428571428571e-06,
"loss": 0.8354,
"step": 3
},
{
"epoch": 0.0435226113566814,
"grad_norm": 2.307280123324516,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.771,
"step": 4
},
{
"epoch": 0.05440326419585175,
"grad_norm": 3.8643476531037586,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.7531,
"step": 5
},
{
"epoch": 0.06528391703502211,
"grad_norm": 4.185929449072588,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.7605,
"step": 6
},
{
"epoch": 0.07616456987419246,
"grad_norm": 4.238592120499111,
"learning_rate": 2e-05,
"loss": 0.7288,
"step": 7
},
{
"epoch": 0.0870452227133628,
"grad_norm": 2.702549334633677,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.7069,
"step": 8
},
{
"epoch": 0.09792587555253315,
"grad_norm": 2.8265702023660415,
"learning_rate": 2.5714285714285718e-05,
"loss": 0.6744,
"step": 9
},
{
"epoch": 0.1088065283917035,
"grad_norm": 2.1843367934470335,
"learning_rate": 2.8571428571428574e-05,
"loss": 0.6654,
"step": 10
},
{
"epoch": 0.11968718123087385,
"grad_norm": 1.5063626985021334,
"learning_rate": 3.142857142857143e-05,
"loss": 0.6374,
"step": 11
},
{
"epoch": 0.13056783407004421,
"grad_norm": 1.3685321573654194,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.6326,
"step": 12
},
{
"epoch": 0.14144848690921455,
"grad_norm": 1.4488139563790012,
"learning_rate": 3.714285714285715e-05,
"loss": 0.6164,
"step": 13
},
{
"epoch": 0.1523291397483849,
"grad_norm": 1.1107459336385455,
"learning_rate": 4e-05,
"loss": 0.6096,
"step": 14
},
{
"epoch": 0.16320979258755525,
"grad_norm": 1.4034801732959803,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5988,
"step": 15
},
{
"epoch": 0.1740904454267256,
"grad_norm": 1.1173039797504758,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.5979,
"step": 16
},
{
"epoch": 0.18497109826589594,
"grad_norm": 1.3758839510979688,
"learning_rate": 4.857142857142857e-05,
"loss": 0.5824,
"step": 17
},
{
"epoch": 0.1958517511050663,
"grad_norm": 0.9901489591062091,
"learning_rate": 5.1428571428571436e-05,
"loss": 0.5851,
"step": 18
},
{
"epoch": 0.20673240394423664,
"grad_norm": 1.3298335358386597,
"learning_rate": 5.4285714285714295e-05,
"loss": 0.5798,
"step": 19
},
{
"epoch": 0.217613056783407,
"grad_norm": 1.1614826605670152,
"learning_rate": 5.714285714285715e-05,
"loss": 0.5623,
"step": 20
},
{
"epoch": 0.22849370962257737,
"grad_norm": 1.3114977506132772,
"learning_rate": 6.000000000000001e-05,
"loss": 0.5617,
"step": 21
},
{
"epoch": 0.2393743624617477,
"grad_norm": 0.9156072730087671,
"learning_rate": 6.285714285714286e-05,
"loss": 0.554,
"step": 22
},
{
"epoch": 0.25025501530091804,
"grad_norm": 1.5985352795964587,
"learning_rate": 6.571428571428571e-05,
"loss": 0.5728,
"step": 23
},
{
"epoch": 0.26113566814008843,
"grad_norm": 1.347383875892142,
"learning_rate": 6.857142857142857e-05,
"loss": 0.5663,
"step": 24
},
{
"epoch": 0.27201632097925876,
"grad_norm": 0.8364475716969787,
"learning_rate": 7.142857142857143e-05,
"loss": 0.5589,
"step": 25
},
{
"epoch": 0.2828969738184291,
"grad_norm": 1.6877024422160516,
"learning_rate": 7.42857142857143e-05,
"loss": 0.5561,
"step": 26
},
{
"epoch": 0.29377762665759943,
"grad_norm": 0.8544602518620728,
"learning_rate": 7.714285714285715e-05,
"loss": 0.5371,
"step": 27
},
{
"epoch": 0.3046582794967698,
"grad_norm": 1.10746206935203,
"learning_rate": 8e-05,
"loss": 0.5482,
"step": 28
},
{
"epoch": 0.31553893233594016,
"grad_norm": 0.8901965542271334,
"learning_rate": 7.999671154713278e-05,
"loss": 0.5468,
"step": 29
},
{
"epoch": 0.3264195851751105,
"grad_norm": 1.3171275211566689,
"learning_rate": 7.99868467292272e-05,
"loss": 0.5484,
"step": 30
},
{
"epoch": 0.3373002380142809,
"grad_norm": 1.119133823137938,
"learning_rate": 7.997040716828271e-05,
"loss": 0.5391,
"step": 31
},
{
"epoch": 0.3481808908534512,
"grad_norm": 766.2890164269288,
"learning_rate": 7.994739556733538e-05,
"loss": 7.0977,
"step": 32
},
{
"epoch": 0.35906154369262155,
"grad_norm": 360.8841002867855,
"learning_rate": 7.991781571001347e-05,
"loss": 8.9951,
"step": 33
},
{
"epoch": 0.3699421965317919,
"grad_norm": 378.0503338631233,
"learning_rate": 7.988167245991528e-05,
"loss": 6.6897,
"step": 34
},
{
"epoch": 0.3808228493709623,
"grad_norm": 7.672002821902226,
"learning_rate": 7.983897175980957e-05,
"loss": 0.7981,
"step": 35
},
{
"epoch": 0.3917035022101326,
"grad_norm": 4.42825926220236,
"learning_rate": 7.97897206306583e-05,
"loss": 0.7036,
"step": 36
},
{
"epoch": 0.40258415504930295,
"grad_norm": 13.3824847890278,
"learning_rate": 7.973392717046233e-05,
"loss": 0.6454,
"step": 37
},
{
"epoch": 0.4134648078884733,
"grad_norm": 4.184331419144958,
"learning_rate": 7.967160055292984e-05,
"loss": 0.6386,
"step": 38
},
{
"epoch": 0.4243454607276437,
"grad_norm": 2.5999303493722135,
"learning_rate": 7.960275102596809e-05,
"loss": 0.6128,
"step": 39
},
{
"epoch": 0.435226113566814,
"grad_norm": 5.547292352391667,
"learning_rate": 7.952738990999824e-05,
"loss": 0.5773,
"step": 40
},
{
"epoch": 0.44610676640598435,
"grad_norm": 2.1944412304611167,
"learning_rate": 7.94455295960942e-05,
"loss": 0.569,
"step": 41
},
{
"epoch": 0.45698741924515474,
"grad_norm": 1.3921897468618165,
"learning_rate": 7.93571835439452e-05,
"loss": 0.5501,
"step": 42
},
{
"epoch": 0.46786807208432507,
"grad_norm": 1.3666229580107998,
"learning_rate": 7.926236627964262e-05,
"loss": 0.5486,
"step": 43
},
{
"epoch": 0.4787487249234954,
"grad_norm": 1.2707740638081064,
"learning_rate": 7.916109339329173e-05,
"loss": 0.5452,
"step": 44
},
{
"epoch": 0.48962937776266574,
"grad_norm": 0.7728181787873867,
"learning_rate": 7.905338153644818e-05,
"loss": 0.5349,
"step": 45
},
{
"epoch": 0.5005100306018361,
"grad_norm": 1.081424627354813,
"learning_rate": 7.89392484193802e-05,
"loss": 0.5359,
"step": 46
},
{
"epoch": 0.5113906834410065,
"grad_norm": 0.8451219565877732,
"learning_rate": 7.881871280815659e-05,
"loss": 0.5192,
"step": 47
},
{
"epoch": 0.5222713362801769,
"grad_norm": 0.9838361376918019,
"learning_rate": 7.869179452156118e-05,
"loss": 0.523,
"step": 48
},
{
"epoch": 0.5331519891193471,
"grad_norm": 0.7178144668508193,
"learning_rate": 7.855851442783414e-05,
"loss": 0.5277,
"step": 49
},
{
"epoch": 0.5440326419585175,
"grad_norm": 0.8653884854595517,
"learning_rate": 7.841889444124078e-05,
"loss": 0.5171,
"step": 50
},
{
"epoch": 0.5549132947976878,
"grad_norm": 0.9615871234590915,
"learning_rate": 7.827295751846836e-05,
"loss": 0.5228,
"step": 51
},
{
"epoch": 0.5657939476368582,
"grad_norm": 0.6109242817024244,
"learning_rate": 7.81207276548515e-05,
"loss": 0.5176,
"step": 52
},
{
"epoch": 0.5766746004760286,
"grad_norm": 0.7262379927240948,
"learning_rate": 7.796222988042676e-05,
"loss": 0.5173,
"step": 53
},
{
"epoch": 0.5875552533151989,
"grad_norm": 0.8000849152573422,
"learning_rate": 7.779749025581717e-05,
"loss": 0.5112,
"step": 54
},
{
"epoch": 0.5984359061543693,
"grad_norm": 0.6837286870058386,
"learning_rate": 7.762653586794731e-05,
"loss": 0.5207,
"step": 55
},
{
"epoch": 0.6093165589935396,
"grad_norm": 0.502300421602433,
"learning_rate": 7.74493948255895e-05,
"loss": 0.5023,
"step": 56
},
{
"epoch": 0.6201972118327099,
"grad_norm": 0.5742765191011999,
"learning_rate": 7.726609625474218e-05,
"loss": 0.493,
"step": 57
},
{
"epoch": 0.6310778646718803,
"grad_norm": 0.7752392361844105,
"learning_rate": 7.707667029384088e-05,
"loss": 0.5002,
"step": 58
},
{
"epoch": 0.6419585175110507,
"grad_norm": 0.8003464724621063,
"learning_rate": 7.688114808880283e-05,
"loss": 0.5014,
"step": 59
},
{
"epoch": 0.652839170350221,
"grad_norm": 0.4467202145391146,
"learning_rate": 7.667956178790582e-05,
"loss": 0.4932,
"step": 60
},
{
"epoch": 0.6637198231893914,
"grad_norm": 0.47406542774154264,
"learning_rate": 7.647194453650228e-05,
"loss": 0.5052,
"step": 61
},
{
"epoch": 0.6746004760285618,
"grad_norm": 0.61628828690436,
"learning_rate": 7.625833047156953e-05,
"loss": 0.48,
"step": 62
},
{
"epoch": 0.685481128867732,
"grad_norm": 0.6613121807693437,
"learning_rate": 7.603875471609677e-05,
"loss": 0.496,
"step": 63
},
{
"epoch": 0.6963617817069024,
"grad_norm": 0.5547782762077185,
"learning_rate": 7.581325337331013e-05,
"loss": 0.4961,
"step": 64
},
{
"epoch": 0.7072424345460727,
"grad_norm": 0.4876382130153157,
"learning_rate": 7.558186352073648e-05,
"loss": 0.4835,
"step": 65
},
{
"epoch": 0.7181230873852431,
"grad_norm": 0.5304127406212,
"learning_rate": 7.534462320410702e-05,
"loss": 0.4935,
"step": 66
},
{
"epoch": 0.7290037402244135,
"grad_norm": 0.41710180085319953,
"learning_rate": 7.510157143110172e-05,
"loss": 0.4906,
"step": 67
},
{
"epoch": 0.7398843930635838,
"grad_norm": 0.48357676956883017,
"learning_rate": 7.485274816493558e-05,
"loss": 0.4924,
"step": 68
},
{
"epoch": 0.7507650459027542,
"grad_norm": 0.5917098078277447,
"learning_rate": 7.459819431778775e-05,
"loss": 0.4947,
"step": 69
},
{
"epoch": 0.7616456987419246,
"grad_norm": 0.7595830676091241,
"learning_rate": 7.433795174407465e-05,
"loss": 0.4817,
"step": 70
},
{
"epoch": 0.7725263515810948,
"grad_norm": 0.7936800209738104,
"learning_rate": 7.407206323356818e-05,
"loss": 0.4918,
"step": 71
},
{
"epoch": 0.7834070044202652,
"grad_norm": 0.7864537130215999,
"learning_rate": 7.380057250436006e-05,
"loss": 0.4887,
"step": 72
},
{
"epoch": 0.7942876572594356,
"grad_norm": 0.49916613247579816,
"learning_rate": 7.352352419567362e-05,
"loss": 0.4816,
"step": 73
},
{
"epoch": 0.8051683100986059,
"grad_norm": 0.41101864085328815,
"learning_rate": 7.324096386052416e-05,
"loss": 0.485,
"step": 74
},
{
"epoch": 0.8160489629377763,
"grad_norm": 0.656161004995155,
"learning_rate": 7.295293795822887e-05,
"loss": 0.4744,
"step": 75
},
{
"epoch": 0.8269296157769466,
"grad_norm": 0.5394326858320477,
"learning_rate": 7.265949384676795e-05,
"loss": 0.4722,
"step": 76
},
{
"epoch": 0.837810268616117,
"grad_norm": 0.2642185151465239,
"learning_rate": 7.236067977499791e-05,
"loss": 0.4807,
"step": 77
},
{
"epoch": 0.8486909214552874,
"grad_norm": 0.5422041951823006,
"learning_rate": 7.205654487471826e-05,
"loss": 0.4797,
"step": 78
},
{
"epoch": 0.8595715742944576,
"grad_norm": 0.5501247208421062,
"learning_rate": 7.174713915259331e-05,
"loss": 0.475,
"step": 79
},
{
"epoch": 0.870452227133628,
"grad_norm": 0.4027113374681404,
"learning_rate": 7.143251348192971e-05,
"loss": 0.4677,
"step": 80
},
{
"epoch": 0.8813328799727984,
"grad_norm": 0.46847225987486674,
"learning_rate": 7.111271959431189e-05,
"loss": 0.4793,
"step": 81
},
{
"epoch": 0.8922135328119687,
"grad_norm": 0.37608598585235453,
"learning_rate": 7.078781007109625e-05,
"loss": 0.4695,
"step": 82
},
{
"epoch": 0.9030941856511391,
"grad_norm": 0.3314250240053766,
"learning_rate": 7.045783833476538e-05,
"loss": 0.4794,
"step": 83
},
{
"epoch": 0.9139748384903095,
"grad_norm": 0.35538493046088,
"learning_rate": 7.012285864014445e-05,
"loss": 0.4758,
"step": 84
},
{
"epoch": 0.9248554913294798,
"grad_norm": 0.40899275410289987,
"learning_rate": 6.978292606548029e-05,
"loss": 0.4716,
"step": 85
},
{
"epoch": 0.9357361441686501,
"grad_norm": 0.39602646601922714,
"learning_rate": 6.943809650338541e-05,
"loss": 0.4703,
"step": 86
},
{
"epoch": 0.9466167970078204,
"grad_norm": 0.32981646879813115,
"learning_rate": 6.908842665164789e-05,
"loss": 0.4665,
"step": 87
},
{
"epoch": 0.9574974498469908,
"grad_norm": 0.45700638514003344,
"learning_rate": 6.873397400390911e-05,
"loss": 0.4718,
"step": 88
},
{
"epoch": 0.9683781026861612,
"grad_norm": 0.4952520819569817,
"learning_rate": 6.837479684021032e-05,
"loss": 0.4726,
"step": 89
},
{
"epoch": 0.9792587555253315,
"grad_norm": 0.480434270208596,
"learning_rate": 6.80109542174102e-05,
"loss": 0.4699,
"step": 90
},
{
"epoch": 0.9901394083645019,
"grad_norm": 0.593562930442879,
"learning_rate": 6.76425059594746e-05,
"loss": 0.4788,
"step": 91
},
{
"epoch": 1.009520571234274,
"grad_norm": 0.6664418776633385,
"learning_rate": 6.726951264763998e-05,
"loss": 0.456,
"step": 92
},
{
"epoch": 1.0204012240734444,
"grad_norm": 0.6935514470470976,
"learning_rate": 6.689203561045268e-05,
"loss": 0.4545,
"step": 93
},
{
"epoch": 1.0312818769126149,
"grad_norm": 0.8262173326620568,
"learning_rate": 6.651013691368492e-05,
"loss": 0.4589,
"step": 94
},
{
"epoch": 1.0421625297517851,
"grad_norm": 0.9951587995620871,
"learning_rate": 6.612387935012995e-05,
"loss": 0.4594,
"step": 95
},
{
"epoch": 1.0530431825909554,
"grad_norm": 0.9844429089044019,
"learning_rate": 6.573332642927737e-05,
"loss": 0.4514,
"step": 96
},
{
"epoch": 1.063923835430126,
"grad_norm": 0.6991926155223892,
"learning_rate": 6.53385423668708e-05,
"loss": 0.4416,
"step": 97
},
{
"epoch": 1.0748044882692962,
"grad_norm": 0.585309533517111,
"learning_rate": 6.493959207434934e-05,
"loss": 0.4497,
"step": 98
},
{
"epoch": 1.0856851411084665,
"grad_norm": 0.7423320704031524,
"learning_rate": 6.453654114817467e-05,
"loss": 0.457,
"step": 99
},
{
"epoch": 1.0965657939476368,
"grad_norm": 0.6185657582865267,
"learning_rate": 6.412945585904545e-05,
"loss": 0.4481,
"step": 100
},
{
"epoch": 1.1074464467868073,
"grad_norm": 0.45496645094782234,
"learning_rate": 6.371840314100104e-05,
"loss": 0.4514,
"step": 101
},
{
"epoch": 1.1183270996259775,
"grad_norm": 0.6039336404495378,
"learning_rate": 6.330345058041585e-05,
"loss": 0.4583,
"step": 102
},
{
"epoch": 1.1292077524651478,
"grad_norm": 0.4358480651096581,
"learning_rate": 6.288466640488679e-05,
"loss": 0.4374,
"step": 103
},
{
"epoch": 1.1400884053043183,
"grad_norm": 0.43724639626696055,
"learning_rate": 6.2462119472015e-05,
"loss": 0.4428,
"step": 104
},
{
"epoch": 1.1509690581434886,
"grad_norm": 0.5729837222594828,
"learning_rate": 6.20358792580841e-05,
"loss": 0.4471,
"step": 105
},
{
"epoch": 1.1618497109826589,
"grad_norm": 0.39024335737726285,
"learning_rate": 6.160601584663681e-05,
"loss": 0.4453,
"step": 106
},
{
"epoch": 1.1727303638218294,
"grad_norm": 0.3646061376356188,
"learning_rate": 6.11725999169515e-05,
"loss": 0.4491,
"step": 107
},
{
"epoch": 1.1836110166609997,
"grad_norm": 0.46982607381941577,
"learning_rate": 6.0735702732421015e-05,
"loss": 0.4443,
"step": 108
},
{
"epoch": 1.19449166950017,
"grad_norm": 0.29189646752552,
"learning_rate": 6.029539612883529e-05,
"loss": 0.4402,
"step": 109
},
{
"epoch": 1.2053723223393404,
"grad_norm": 0.3520773279039917,
"learning_rate": 5.9851752502570015e-05,
"loss": 0.435,
"step": 110
},
{
"epoch": 1.2162529751785107,
"grad_norm": 0.32075025375118393,
"learning_rate": 5.940484479868288e-05,
"loss": 0.4462,
"step": 111
},
{
"epoch": 1.227133628017681,
"grad_norm": 0.2882478429202779,
"learning_rate": 5.895474649891995e-05,
"loss": 0.4421,
"step": 112
},
{
"epoch": 1.2380142808568515,
"grad_norm": 0.31925565019590013,
"learning_rate": 5.8501531609633424e-05,
"loss": 0.446,
"step": 113
},
{
"epoch": 1.2488949336960218,
"grad_norm": 0.21223179457423266,
"learning_rate": 5.8045274649613386e-05,
"loss": 0.4448,
"step": 114
},
{
"epoch": 1.259775586535192,
"grad_norm": 0.26953627133631786,
"learning_rate": 5.7586050637835295e-05,
"loss": 0.4374,
"step": 115
},
{
"epoch": 1.2706562393743623,
"grad_norm": 0.21178369672133335,
"learning_rate": 5.7123935081125034e-05,
"loss": 0.4458,
"step": 116
},
{
"epoch": 1.2815368922135328,
"grad_norm": 0.21155767077821688,
"learning_rate": 5.6659003961743965e-05,
"loss": 0.4376,
"step": 117
},
{
"epoch": 1.2924175450527031,
"grad_norm": 0.21347423268768667,
"learning_rate": 5.619133372489575e-05,
"loss": 0.4515,
"step": 118
},
{
"epoch": 1.3032981978918734,
"grad_norm": 0.21576745513388285,
"learning_rate": 5.572100126615695e-05,
"loss": 0.4443,
"step": 119
},
{
"epoch": 1.314178850731044,
"grad_norm": 0.24546247378355868,
"learning_rate": 5.524808391883367e-05,
"loss": 0.441,
"step": 120
},
{
"epoch": 1.3250595035702142,
"grad_norm": 0.2207361069513953,
"learning_rate": 5.477265944124626e-05,
"loss": 0.4354,
"step": 121
},
{
"epoch": 1.3359401564093845,
"grad_norm": 0.2444333045464523,
"learning_rate": 5.429480600394405e-05,
"loss": 0.4407,
"step": 122
},
{
"epoch": 1.346820809248555,
"grad_norm": 0.24749025049340784,
"learning_rate": 5.381460217685231e-05,
"loss": 0.4359,
"step": 123
},
{
"epoch": 1.3577014620877252,
"grad_norm": 0.22381053800753611,
"learning_rate": 5.333212691635368e-05,
"loss": 0.4347,
"step": 124
},
{
"epoch": 1.3685821149268955,
"grad_norm": 0.24301738733290523,
"learning_rate": 5.2847459552305834e-05,
"loss": 0.4337,
"step": 125
},
{
"epoch": 1.379462767766066,
"grad_norm": 0.24671775018284153,
"learning_rate": 5.23606797749979e-05,
"loss": 0.4384,
"step": 126
},
{
"epoch": 1.3903434206052363,
"grad_norm": 0.23843656791767695,
"learning_rate": 5.1871867622047624e-05,
"loss": 0.4444,
"step": 127
},
{
"epoch": 1.4012240734444066,
"grad_norm": 0.18481585189252123,
"learning_rate": 5.13811034652413e-05,
"loss": 0.4371,
"step": 128
},
{
"epoch": 1.412104726283577,
"grad_norm": 0.18518123274811235,
"learning_rate": 5.088846799731885e-05,
"loss": 0.4342,
"step": 129
},
{
"epoch": 1.4229853791227474,
"grad_norm": 0.2010634600287658,
"learning_rate": 5.039404221870612e-05,
"loss": 0.4296,
"step": 130
},
{
"epoch": 1.4338660319619176,
"grad_norm": 0.23061272903823807,
"learning_rate": 4.989790742419658e-05,
"loss": 0.4415,
"step": 131
},
{
"epoch": 1.4447466848010881,
"grad_norm": 0.2507778238204534,
"learning_rate": 4.940014518958461e-05,
"loss": 0.4338,
"step": 132
},
{
"epoch": 1.4556273376402584,
"grad_norm": 0.27522736481625426,
"learning_rate": 4.890083735825258e-05,
"loss": 0.4397,
"step": 133
},
{
"epoch": 1.4665079904794287,
"grad_norm": 0.2145456230157738,
"learning_rate": 4.8400066027713974e-05,
"loss": 0.4271,
"step": 134
},
{
"epoch": 1.4773886433185992,
"grad_norm": 0.20238631194211945,
"learning_rate": 4.789791353611469e-05,
"loss": 0.4229,
"step": 135
},
{
"epoch": 1.4882692961577695,
"grad_norm": 0.17925376141197102,
"learning_rate": 4.7394462448694756e-05,
"loss": 0.4383,
"step": 136
},
{
"epoch": 1.4991499489969398,
"grad_norm": 0.16639345435419553,
"learning_rate": 4.688979554421276e-05,
"loss": 0.4307,
"step": 137
},
{
"epoch": 1.5100306018361103,
"grad_norm": 0.16546944794615542,
"learning_rate": 4.6383995801335176e-05,
"loss": 0.4413,
"step": 138
},
{
"epoch": 1.5209112546752805,
"grad_norm": 0.22823688961718305,
"learning_rate": 4.5877146384992725e-05,
"loss": 0.4343,
"step": 139
},
{
"epoch": 1.5317919075144508,
"grad_norm": 0.2077914352935504,
"learning_rate": 4.5369330632706223e-05,
"loss": 0.4304,
"step": 140
},
{
"epoch": 1.5426725603536213,
"grad_norm": 0.178090044195881,
"learning_rate": 4.486063204088402e-05,
"loss": 0.433,
"step": 141
},
{
"epoch": 1.5535532131927916,
"grad_norm": 0.17467896850970988,
"learning_rate": 4.435113425109324e-05,
"loss": 0.4298,
"step": 142
},
{
"epoch": 1.5644338660319619,
"grad_norm": 0.17004530557039635,
"learning_rate": 4.3840921036307274e-05,
"loss": 0.4348,
"step": 143
},
{
"epoch": 1.5753145188711324,
"grad_norm": 0.1956158514979538,
"learning_rate": 4.333007628713158e-05,
"loss": 0.4384,
"step": 144
},
{
"epoch": 1.5861951717103027,
"grad_norm": 0.16196619070156484,
"learning_rate": 4.281868399801016e-05,
"loss": 0.4362,
"step": 145
},
{
"epoch": 1.597075824549473,
"grad_norm": 0.15774918769209736,
"learning_rate": 4.230682825341498e-05,
"loss": 0.4321,
"step": 146
},
{
"epoch": 1.6079564773886434,
"grad_norm": 0.14677757963300414,
"learning_rate": 4.17945932140206e-05,
"loss": 0.4343,
"step": 147
},
{
"epoch": 1.6188371302278135,
"grad_norm": 0.1586788180866913,
"learning_rate": 4.128206310286622e-05,
"loss": 0.4319,
"step": 148
},
{
"epoch": 1.629717783066984,
"grad_norm": 0.15657539290671857,
"learning_rate": 4.0769322191507485e-05,
"loss": 0.4349,
"step": 149
},
{
"epoch": 1.6405984359061545,
"grad_norm": 0.13771725448321517,
"learning_rate": 4.025645478616045e-05,
"loss": 0.4286,
"step": 150
},
{
"epoch": 1.6514790887453246,
"grad_norm": 0.1792608152635085,
"learning_rate": 3.974354521383956e-05,
"loss": 0.4326,
"step": 151
},
{
"epoch": 1.662359741584495,
"grad_norm": 0.19697920039522113,
"learning_rate": 3.923067780849252e-05,
"loss": 0.4325,
"step": 152
},
{
"epoch": 1.6732403944236656,
"grad_norm": 0.15631987727039992,
"learning_rate": 3.87179368971338e-05,
"loss": 0.4269,
"step": 153
},
{
"epoch": 1.6841210472628356,
"grad_norm": 0.1789741559362441,
"learning_rate": 3.820540678597942e-05,
"loss": 0.4352,
"step": 154
},
{
"epoch": 1.6950017001020061,
"grad_norm": 0.14589986696711105,
"learning_rate": 3.769317174658503e-05,
"loss": 0.4331,
"step": 155
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.16739347857930229,
"learning_rate": 3.718131600198984e-05,
"loss": 0.4385,
"step": 156
},
{
"epoch": 1.7167630057803467,
"grad_norm": 0.15981306017402697,
"learning_rate": 3.666992371286843e-05,
"loss": 0.4313,
"step": 157
},
{
"epoch": 1.7276436586195172,
"grad_norm": 0.15235627870495205,
"learning_rate": 3.615907896369273e-05,
"loss": 0.442,
"step": 158
},
{
"epoch": 1.7385243114586875,
"grad_norm": 0.15094599321638194,
"learning_rate": 3.564886574890677e-05,
"loss": 0.4376,
"step": 159
},
{
"epoch": 1.7494049642978577,
"grad_norm": 0.15338762976356135,
"learning_rate": 3.5139367959115986e-05,
"loss": 0.438,
"step": 160
},
{
"epoch": 1.7602856171370282,
"grad_norm": 0.13609508940355913,
"learning_rate": 3.4630669367293797e-05,
"loss": 0.4301,
"step": 161
},
{
"epoch": 1.7711662699761985,
"grad_norm": 0.14408065680405582,
"learning_rate": 3.412285361500729e-05,
"loss": 0.4365,
"step": 162
},
{
"epoch": 1.7820469228153688,
"grad_norm": 0.1523591488642166,
"learning_rate": 3.3616004198664845e-05,
"loss": 0.4261,
"step": 163
},
{
"epoch": 1.7929275756545393,
"grad_norm": 0.16103662054938223,
"learning_rate": 3.311020445578725e-05,
"loss": 0.4358,
"step": 164
},
{
"epoch": 1.8038082284937096,
"grad_norm": 0.1713601588521581,
"learning_rate": 3.260553755130525e-05,
"loss": 0.4317,
"step": 165
},
{
"epoch": 1.8146888813328799,
"grad_norm": 0.15595643791635083,
"learning_rate": 3.210208646388532e-05,
"loss": 0.4201,
"step": 166
},
{
"epoch": 1.8255695341720504,
"grad_norm": 0.14708760810268764,
"learning_rate": 3.1599933972286026e-05,
"loss": 0.4296,
"step": 167
},
{
"epoch": 1.8364501870112206,
"grad_norm": 0.13811567230657157,
"learning_rate": 3.109916264174743e-05,
"loss": 0.4252,
"step": 168
},
{
"epoch": 1.847330839850391,
"grad_norm": 0.14763414231896016,
"learning_rate": 3.0599854810415393e-05,
"loss": 0.4201,
"step": 169
},
{
"epoch": 1.8582114926895614,
"grad_norm": 0.129668633060907,
"learning_rate": 3.0102092575803435e-05,
"loss": 0.4343,
"step": 170
},
{
"epoch": 1.8690921455287317,
"grad_norm": 0.13079136624725912,
"learning_rate": 2.9605957781293893e-05,
"loss": 0.4388,
"step": 171
},
{
"epoch": 1.879972798367902,
"grad_norm": 0.14118617253700078,
"learning_rate": 2.911153200268116e-05,
"loss": 0.4361,
"step": 172
},
{
"epoch": 1.8908534512070725,
"grad_norm": 0.13385452899697334,
"learning_rate": 2.8618896534758707e-05,
"loss": 0.4303,
"step": 173
},
{
"epoch": 1.9017341040462428,
"grad_norm": 0.14708401774021865,
"learning_rate": 2.8128132377952376e-05,
"loss": 0.4332,
"step": 174
},
{
"epoch": 1.912614756885413,
"grad_norm": 0.12272534113340453,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.4228,
"step": 175
},
{
"epoch": 1.9234954097245835,
"grad_norm": 0.1138558266069995,
"learning_rate": 2.715254044769418e-05,
"loss": 0.4282,
"step": 176
},
{
"epoch": 1.9343760625637538,
"grad_norm": 0.12343156100233148,
"learning_rate": 2.666787308364634e-05,
"loss": 0.4284,
"step": 177
},
{
"epoch": 1.945256715402924,
"grad_norm": 0.11224315249119837,
"learning_rate": 2.6185397823147703e-05,
"loss": 0.4265,
"step": 178
},
{
"epoch": 1.9561373682420946,
"grad_norm": 0.10896139436338177,
"learning_rate": 2.5705193996055977e-05,
"loss": 0.4255,
"step": 179
},
{
"epoch": 1.967018021081265,
"grad_norm": 0.12908408500196814,
"learning_rate": 2.5227340558753755e-05,
"loss": 0.4267,
"step": 180
},
{
"epoch": 1.9778986739204352,
"grad_norm": 0.11019554007619542,
"learning_rate": 2.4751916081166336e-05,
"loss": 0.4322,
"step": 181
},
{
"epoch": 1.9887793267596057,
"grad_norm": 0.1233996372590572,
"learning_rate": 2.427899873384306e-05,
"loss": 0.4237,
"step": 182
},
{
"epoch": 2.0081604896293777,
"grad_norm": 0.15359648389426395,
"learning_rate": 2.3808666275104248e-05,
"loss": 0.4132,
"step": 183
},
{
"epoch": 2.019041142468548,
"grad_norm": 0.13345360808541243,
"learning_rate": 2.334099603825605e-05,
"loss": 0.4088,
"step": 184
},
{
"epoch": 2.0299217953077187,
"grad_norm": 0.17411332047982292,
"learning_rate": 2.2876064918874993e-05,
"loss": 0.4049,
"step": 185
},
{
"epoch": 2.0408024481468887,
"grad_norm": 0.1422404399378093,
"learning_rate": 2.241394936216472e-05,
"loss": 0.3949,
"step": 186
},
{
"epoch": 2.051683100986059,
"grad_norm": 0.14694793749020102,
"learning_rate": 2.1954725350386614e-05,
"loss": 0.4004,
"step": 187
},
{
"epoch": 2.0625637538252297,
"grad_norm": 0.1408354112834221,
"learning_rate": 2.14984683903666e-05,
"loss": 0.4047,
"step": 188
},
{
"epoch": 2.0734444066643998,
"grad_norm": 0.13765680224710142,
"learning_rate": 2.1045253501080058e-05,
"loss": 0.4029,
"step": 189
},
{
"epoch": 2.0843250595035703,
"grad_norm": 0.13251592143019264,
"learning_rate": 2.0595155201317115e-05,
"loss": 0.4033,
"step": 190
},
{
"epoch": 2.095205712342741,
"grad_norm": 0.13758159475310247,
"learning_rate": 2.0148247497430012e-05,
"loss": 0.4035,
"step": 191
},
{
"epoch": 2.106086365181911,
"grad_norm": 0.12620065204508432,
"learning_rate": 1.970460387116472e-05,
"loss": 0.4003,
"step": 192
},
{
"epoch": 2.1169670180210813,
"grad_norm": 0.13100037610614787,
"learning_rate": 1.9264297267579e-05,
"loss": 0.4,
"step": 193
},
{
"epoch": 2.127847670860252,
"grad_norm": 0.1342586056051866,
"learning_rate": 1.8827400083048503e-05,
"loss": 0.4032,
"step": 194
},
{
"epoch": 2.138728323699422,
"grad_norm": 0.1314403186569449,
"learning_rate": 1.8393984153363203e-05,
"loss": 0.4084,
"step": 195
},
{
"epoch": 2.1496089765385924,
"grad_norm": 0.1275916516562326,
"learning_rate": 1.7964120741915905e-05,
"loss": 0.4021,
"step": 196
},
{
"epoch": 2.1604896293777625,
"grad_norm": 0.12979778271394132,
"learning_rate": 1.753788052798501e-05,
"loss": 0.4063,
"step": 197
},
{
"epoch": 2.171370282216933,
"grad_norm": 0.13548411500192925,
"learning_rate": 1.7115333595113225e-05,
"loss": 0.4093,
"step": 198
},
{
"epoch": 2.1822509350561035,
"grad_norm": 0.11424114716502747,
"learning_rate": 1.669654941958416e-05,
"loss": 0.3998,
"step": 199
},
{
"epoch": 2.1931315878952735,
"grad_norm": 0.14575557443845474,
"learning_rate": 1.628159685899897e-05,
"loss": 0.404,
"step": 200
},
{
"epoch": 2.204012240734444,
"grad_norm": 0.12565905473323868,
"learning_rate": 1.5870544140954543e-05,
"loss": 0.4029,
"step": 201
},
{
"epoch": 2.2148928935736145,
"grad_norm": 0.1322140532060304,
"learning_rate": 1.5463458851825345e-05,
"loss": 0.4034,
"step": 202
},
{
"epoch": 2.2257735464127846,
"grad_norm": 0.11539342585698967,
"learning_rate": 1.5060407925650662e-05,
"loss": 0.4047,
"step": 203
},
{
"epoch": 2.236654199251955,
"grad_norm": 0.110043261615137,
"learning_rate": 1.466145763312922e-05,
"loss": 0.4028,
"step": 204
},
{
"epoch": 2.2475348520911256,
"grad_norm": 0.12462346384772446,
"learning_rate": 1.426667357072265e-05,
"loss": 0.4057,
"step": 205
},
{
"epoch": 2.2584155049302956,
"grad_norm": 0.11246035528357057,
"learning_rate": 1.3876120649870051e-05,
"loss": 0.401,
"step": 206
},
{
"epoch": 2.269296157769466,
"grad_norm": 0.11025464415618055,
"learning_rate": 1.3489863086315085e-05,
"loss": 0.406,
"step": 207
},
{
"epoch": 2.2801768106086366,
"grad_norm": 0.12202834262583902,
"learning_rate": 1.3107964389547326e-05,
"loss": 0.4105,
"step": 208
},
{
"epoch": 2.2910574634478067,
"grad_norm": 0.10891021403569234,
"learning_rate": 1.2730487352360026e-05,
"loss": 0.3986,
"step": 209
},
{
"epoch": 2.301938116286977,
"grad_norm": 0.11067574316750495,
"learning_rate": 1.2357494040525416e-05,
"loss": 0.4026,
"step": 210
},
{
"epoch": 2.3128187691261477,
"grad_norm": 0.12013626538786122,
"learning_rate": 1.1989045782589815e-05,
"loss": 0.4019,
"step": 211
},
{
"epoch": 2.3236994219653178,
"grad_norm": 0.11734274809139549,
"learning_rate": 1.1625203159789686e-05,
"loss": 0.404,
"step": 212
},
{
"epoch": 2.3345800748044883,
"grad_norm": 0.13868399399365283,
"learning_rate": 1.1266025996090902e-05,
"loss": 0.3968,
"step": 213
},
{
"epoch": 2.3454607276436588,
"grad_norm": 0.1021758536629292,
"learning_rate": 1.0911573348352107e-05,
"loss": 0.3997,
"step": 214
},
{
"epoch": 2.356341380482829,
"grad_norm": 0.11161156337434158,
"learning_rate": 1.0561903496614603e-05,
"loss": 0.4019,
"step": 215
},
{
"epoch": 2.3672220333219993,
"grad_norm": 0.11628871391170659,
"learning_rate": 1.0217073934519726e-05,
"loss": 0.3961,
"step": 216
},
{
"epoch": 2.37810268616117,
"grad_norm": 0.10053214623418112,
"learning_rate": 9.877141359855567e-06,
"loss": 0.4107,
"step": 217
},
{
"epoch": 2.38898333900034,
"grad_norm": 0.10333880551211917,
"learning_rate": 9.542161665234623e-06,
"loss": 0.4041,
"step": 218
},
{
"epoch": 2.3998639918395104,
"grad_norm": 0.10378698888338332,
"learning_rate": 9.212189928903758e-06,
"loss": 0.4057,
"step": 219
},
{
"epoch": 2.410744644678681,
"grad_norm": 0.10095387706934292,
"learning_rate": 8.887280405688106e-06,
"loss": 0.4035,
"step": 220
},
{
"epoch": 2.421625297517851,
"grad_norm": 0.09702668485060174,
"learning_rate": 8.567486518070306e-06,
"loss": 0.3979,
"step": 221
},
{
"epoch": 2.4325059503570214,
"grad_norm": 0.0968990526407044,
"learning_rate": 8.252860847406712e-06,
"loss": 0.4007,
"step": 222
},
{
"epoch": 2.443386603196192,
"grad_norm": 0.09758367833082218,
"learning_rate": 7.943455125281741e-06,
"loss": 0.4038,
"step": 223
},
{
"epoch": 2.454267256035362,
"grad_norm": 0.09335426131514203,
"learning_rate": 7.639320225002106e-06,
"loss": 0.3987,
"step": 224
},
{
"epoch": 2.4651479088745325,
"grad_norm": 0.09620155060950364,
"learning_rate": 7.340506153232052e-06,
"loss": 0.4051,
"step": 225
},
{
"epoch": 2.476028561713703,
"grad_norm": 0.09238550728932193,
"learning_rate": 7.047062041771133e-06,
"loss": 0.4038,
"step": 226
},
{
"epoch": 2.486909214552873,
"grad_norm": 0.08653478796623289,
"learning_rate": 6.759036139475843e-06,
"loss": 0.4035,
"step": 227
},
{
"epoch": 2.4977898673920436,
"grad_norm": 0.09462132238499406,
"learning_rate": 6.476475804326377e-06,
"loss": 0.3945,
"step": 228
},
{
"epoch": 2.508670520231214,
"grad_norm": 0.09364175398439001,
"learning_rate": 6.199427495639963e-06,
"loss": 0.3953,
"step": 229
},
{
"epoch": 2.519551173070384,
"grad_norm": 0.09533512853144702,
"learning_rate": 5.927936766431836e-06,
"loss": 0.3951,
"step": 230
},
{
"epoch": 2.5304318259095546,
"grad_norm": 0.08988259771164411,
"learning_rate": 5.662048255925357e-06,
"loss": 0.4007,
"step": 231
},
{
"epoch": 2.5413124787487247,
"grad_norm": 0.08544449128239766,
"learning_rate": 5.40180568221226e-06,
"loss": 0.3956,
"step": 232
},
{
"epoch": 2.552193131587895,
"grad_norm": 0.08214234004836413,
"learning_rate": 5.147251835064424e-06,
"loss": 0.401,
"step": 233
},
{
"epoch": 2.5630737844270657,
"grad_norm": 0.08978715763135,
"learning_rate": 4.898428568898288e-06,
"loss": 0.4018,
"step": 234
},
{
"epoch": 2.573954437266236,
"grad_norm": 0.09111850134473688,
"learning_rate": 4.65537679589299e-06,
"loss": 0.4033,
"step": 235
},
{
"epoch": 2.5848350901054062,
"grad_norm": 0.08281874038672186,
"learning_rate": 4.418136479263533e-06,
"loss": 0.3957,
"step": 236
},
{
"epoch": 2.5957157429445767,
"grad_norm": 0.0875486805676823,
"learning_rate": 4.186746626689879e-06,
"loss": 0.3978,
"step": 237
},
{
"epoch": 2.606596395783747,
"grad_norm": 0.08327809325190605,
"learning_rate": 3.961245283903239e-06,
"loss": 0.4004,
"step": 238
},
{
"epoch": 2.6174770486229173,
"grad_norm": 0.08331308067123125,
"learning_rate": 3.7416695284304737e-06,
"loss": 0.4062,
"step": 239
},
{
"epoch": 2.628357701462088,
"grad_norm": 0.08388783642844128,
"learning_rate": 3.5280554634977217e-06,
"loss": 0.3974,
"step": 240
},
{
"epoch": 2.6392383543012583,
"grad_norm": 0.08349690695940903,
"learning_rate": 3.320438212094197e-06,
"loss": 0.4021,
"step": 241
},
{
"epoch": 2.6501190071404284,
"grad_norm": 0.08518098898214005,
"learning_rate": 3.1188519111971804e-06,
"loss": 0.4046,
"step": 242
},
{
"epoch": 2.660999659979599,
"grad_norm": 0.08404039117187238,
"learning_rate": 2.9233297061591346e-06,
"loss": 0.3986,
"step": 243
},
{
"epoch": 2.671880312818769,
"grad_norm": 0.08468392011098298,
"learning_rate": 2.733903745257838e-06,
"loss": 0.4143,
"step": 244
},
{
"epoch": 2.6827609656579394,
"grad_norm": 0.08035934775357019,
"learning_rate": 2.550605174410512e-06,
"loss": 0.3982,
"step": 245
},
{
"epoch": 2.69364161849711,
"grad_norm": 0.08100249829782087,
"learning_rate": 2.373464132052701e-06,
"loss": 0.3975,
"step": 246
},
{
"epoch": 2.7045222713362804,
"grad_norm": 0.08405527434090163,
"learning_rate": 2.202509744182835e-06,
"loss": 0.3958,
"step": 247
},
{
"epoch": 2.7154029241754505,
"grad_norm": 0.08458090883428918,
"learning_rate": 2.0377701195732545e-06,
"loss": 0.4094,
"step": 248
},
{
"epoch": 2.726283577014621,
"grad_norm": 0.08120194628003821,
"learning_rate": 1.879272345148513e-06,
"loss": 0.4071,
"step": 249
},
{
"epoch": 2.737164229853791,
"grad_norm": 0.08381423080148898,
"learning_rate": 1.727042481531651e-06,
"loss": 0.3997,
"step": 250
},
{
"epoch": 2.7480448826929615,
"grad_norm": 0.08003721871020845,
"learning_rate": 1.5811055587592283e-06,
"loss": 0.4032,
"step": 251
},
{
"epoch": 2.758925535532132,
"grad_norm": 0.07906608009605874,
"learning_rate": 1.4414855721658705e-06,
"loss": 0.4011,
"step": 252
},
{
"epoch": 2.7698061883713025,
"grad_norm": 0.07989568893361568,
"learning_rate": 1.3082054784388221e-06,
"loss": 0.3938,
"step": 253
},
{
"epoch": 2.7806868412104726,
"grad_norm": 0.07531713942027396,
"learning_rate": 1.1812871918434143e-06,
"loss": 0.4036,
"step": 254
},
{
"epoch": 2.791567494049643,
"grad_norm": 0.07349157785840128,
"learning_rate": 1.0607515806198142e-06,
"loss": 0.3975,
"step": 255
},
{
"epoch": 2.802448146888813,
"grad_norm": 0.07248419100011405,
"learning_rate": 9.466184635518361e-07,
"loss": 0.397,
"step": 256
},
{
"epoch": 2.8133287997279837,
"grad_norm": 0.07667024498059592,
"learning_rate": 8.389066067082852e-07,
"loss": 0.4011,
"step": 257
},
{
"epoch": 2.824209452567154,
"grad_norm": 0.07642598936379766,
"learning_rate": 7.376337203573824e-07,
"loss": 0.4,
"step": 258
},
{
"epoch": 2.8350901054063242,
"grad_norm": 0.07461923023970361,
"learning_rate": 6.428164560548134e-07,
"loss": 0.4021,
"step": 259
},
{
"epoch": 2.8459707582454947,
"grad_norm": 0.08011069712526726,
"learning_rate": 5.544704039058025e-07,
"loss": 0.4001,
"step": 260
},
{
"epoch": 2.8568514110846652,
"grad_norm": 0.0778371155244566,
"learning_rate": 4.7261009000177274e-07,
"loss": 0.4025,
"step": 261
},
{
"epoch": 2.8677320639238353,
"grad_norm": 0.0756170688560216,
"learning_rate": 3.972489740319274e-07,
"loss": 0.4091,
"step": 262
},
{
"epoch": 2.878612716763006,
"grad_norm": 0.0747265642036147,
"learning_rate": 3.283994470701579e-07,
"loss": 0.4055,
"step": 263
},
{
"epoch": 2.8894933696021763,
"grad_norm": 0.07119073646902688,
"learning_rate": 2.66072829537678e-07,
"loss": 0.408,
"step": 264
},
{
"epoch": 2.9003740224413463,
"grad_norm": 0.07649012974948233,
"learning_rate": 2.102793693417038e-07,
"loss": 0.402,
"step": 265
},
{
"epoch": 2.911254675280517,
"grad_norm": 0.07588643659629785,
"learning_rate": 1.6102824019043728e-07,
"loss": 0.3985,
"step": 266
},
{
"epoch": 2.9221353281196873,
"grad_norm": 0.07431991189730484,
"learning_rate": 1.1832754008472614e-07,
"loss": 0.4025,
"step": 267
},
{
"epoch": 2.9330159809588574,
"grad_norm": 0.0777086423840054,
"learning_rate": 8.21842899865466e-08,
"loss": 0.3962,
"step": 268
},
{
"epoch": 2.943896633798028,
"grad_norm": 0.07387694283814854,
"learning_rate": 5.260443266462467e-08,
"loss": 0.3982,
"step": 269
},
{
"epoch": 2.9547772866371984,
"grad_norm": 0.07271598906861958,
"learning_rate": 2.9592831717293326e-08,
"loss": 0.3988,
"step": 270
},
{
"epoch": 2.9656579394763685,
"grad_norm": 0.07496499490626318,
"learning_rate": 1.3153270772807702e-08,
"loss": 0.4009,
"step": 271
},
{
"epoch": 2.976538592315539,
"grad_norm": 0.07330139638176505,
"learning_rate": 3.2884528672294523e-09,
"loss": 0.4001,
"step": 272
},
{
"epoch": 2.987419245154709,
"grad_norm": 0.07348531822682038,
"learning_rate": 0.0,
"loss": 0.4048,
"step": 273
},
{
"epoch": 2.987419245154709,
"step": 273,
"total_flos": 7.259120111838036e+18,
"train_loss": 0.5428441533675561,
"train_runtime": 82839.4057,
"train_samples_per_second": 1.704,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 273,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.259120111838036e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}