3952 lines
96 KiB
JSON
3952 lines
96 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.9993174061433447,
|
||
|
|
"eval_steps": 183,
|
||
|
|
"global_step": 549,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0018202502844141069,
|
||
|
|
"grad_norm": 7.828993836368125,
|
||
|
|
"learning_rate": 4.705882352941176e-07,
|
||
|
|
"loss": 0.7927,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0036405005688282138,
|
||
|
|
"grad_norm": 6.714284407371177,
|
||
|
|
"learning_rate": 9.411764705882352e-07,
|
||
|
|
"loss": 0.7939,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005460750853242321,
|
||
|
|
"grad_norm": 6.364056661574126,
|
||
|
|
"learning_rate": 1.411764705882353e-06,
|
||
|
|
"loss": 0.8186,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0072810011376564275,
|
||
|
|
"grad_norm": 6.001686432641966,
|
||
|
|
"learning_rate": 1.8823529411764705e-06,
|
||
|
|
"loss": 0.7232,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009101251422070534,
|
||
|
|
"grad_norm": 5.534730246504558,
|
||
|
|
"learning_rate": 2.352941176470588e-06,
|
||
|
|
"loss": 0.7891,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010921501706484642,
|
||
|
|
"grad_norm": 3.4121020699424713,
|
||
|
|
"learning_rate": 2.823529411764706e-06,
|
||
|
|
"loss": 0.7351,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01274175199089875,
|
||
|
|
"grad_norm": 3.9324785806724476,
|
||
|
|
"learning_rate": 3.294117647058823e-06,
|
||
|
|
"loss": 0.6899,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014562002275312855,
|
||
|
|
"grad_norm": 2.5947504275059496,
|
||
|
|
"learning_rate": 3.764705882352941e-06,
|
||
|
|
"loss": 0.6103,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016382252559726963,
|
||
|
|
"grad_norm": 2.5403086872594955,
|
||
|
|
"learning_rate": 4.235294117647058e-06,
|
||
|
|
"loss": 0.6377,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01820250284414107,
|
||
|
|
"grad_norm": 2.339440216808695,
|
||
|
|
"learning_rate": 4.705882352941176e-06,
|
||
|
|
"loss": 0.6723,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.020022753128555178,
|
||
|
|
"grad_norm": 2.1983237574211922,
|
||
|
|
"learning_rate": 5.176470588235294e-06,
|
||
|
|
"loss": 0.684,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.021843003412969283,
|
||
|
|
"grad_norm": 1.6424427461297102,
|
||
|
|
"learning_rate": 5.647058823529412e-06,
|
||
|
|
"loss": 0.6027,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02366325369738339,
|
||
|
|
"grad_norm": 1.6005922788724476,
|
||
|
|
"learning_rate": 6.1176470588235285e-06,
|
||
|
|
"loss": 0.7195,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0254835039817975,
|
||
|
|
"grad_norm": 1.8438931230594375,
|
||
|
|
"learning_rate": 6.588235294117646e-06,
|
||
|
|
"loss": 0.6329,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.027303754266211604,
|
||
|
|
"grad_norm": 1.8086010126416687,
|
||
|
|
"learning_rate": 7.058823529411764e-06,
|
||
|
|
"loss": 0.6812,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02912400455062571,
|
||
|
|
"grad_norm": 1.6390713129791323,
|
||
|
|
"learning_rate": 7.529411764705882e-06,
|
||
|
|
"loss": 0.6848,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03094425483503982,
|
||
|
|
"grad_norm": 1.5725949727834614,
|
||
|
|
"learning_rate": 8e-06,
|
||
|
|
"loss": 0.6415,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.032764505119453925,
|
||
|
|
"grad_norm": 1.5078400807519021,
|
||
|
|
"learning_rate": 7.999930256262932e-06,
|
||
|
|
"loss": 0.6284,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03458475540386803,
|
||
|
|
"grad_norm": 1.5292651701417765,
|
||
|
|
"learning_rate": 7.999721027483818e-06,
|
||
|
|
"loss": 0.6503,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03640500568828214,
|
||
|
|
"grad_norm": 1.4983054037314452,
|
||
|
|
"learning_rate": 7.999372320958861e-06,
|
||
|
|
"loss": 0.6167,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03822525597269624,
|
||
|
|
"grad_norm": 1.4412298959675836,
|
||
|
|
"learning_rate": 7.998884148848109e-06,
|
||
|
|
"loss": 0.6245,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.040045506257110355,
|
||
|
|
"grad_norm": 1.2365167423542114,
|
||
|
|
"learning_rate": 7.998256528175033e-06,
|
||
|
|
"loss": 0.5953,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04186575654152446,
|
||
|
|
"grad_norm": 1.5357829692377438,
|
||
|
|
"learning_rate": 7.997489480825941e-06,
|
||
|
|
"loss": 0.6367,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04368600682593857,
|
||
|
|
"grad_norm": 1.3171822289121358,
|
||
|
|
"learning_rate": 7.996583033549204e-06,
|
||
|
|
"loss": 0.5577,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04550625711035267,
|
||
|
|
"grad_norm": 1.3245922461587984,
|
||
|
|
"learning_rate": 7.995537217954335e-06,
|
||
|
|
"loss": 0.5706,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04732650739476678,
|
||
|
|
"grad_norm": 1.4166210756319584,
|
||
|
|
"learning_rate": 7.994352070510876e-06,
|
||
|
|
"loss": 0.6612,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049146757679180884,
|
||
|
|
"grad_norm": 1.4367785667776511,
|
||
|
|
"learning_rate": 7.993027632547137e-06,
|
||
|
|
"loss": 0.5766,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.050967007963595,
|
||
|
|
"grad_norm": 1.2803978303957413,
|
||
|
|
"learning_rate": 7.991563950248739e-06,
|
||
|
|
"loss": 0.6023,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0527872582480091,
|
||
|
|
"grad_norm": 1.3201964359281728,
|
||
|
|
"learning_rate": 7.989961074657023e-06,
|
||
|
|
"loss": 0.6026,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05460750853242321,
|
||
|
|
"grad_norm": 1.5466402476684098,
|
||
|
|
"learning_rate": 7.988219061667252e-06,
|
||
|
|
"loss": 0.5979,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056427758816837315,
|
||
|
|
"grad_norm": 1.372807482234666,
|
||
|
|
"learning_rate": 7.986337972026678e-06,
|
||
|
|
"loss": 0.5928,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05824800910125142,
|
||
|
|
"grad_norm": 1.2476758007512014,
|
||
|
|
"learning_rate": 7.98431787133241e-06,
|
||
|
|
"loss": 0.5506,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.060068259385665526,
|
||
|
|
"grad_norm": 1.3332407362456573,
|
||
|
|
"learning_rate": 7.982158830029133e-06,
|
||
|
|
"loss": 0.5252,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06188850967007964,
|
||
|
|
"grad_norm": 1.2956829613552345,
|
||
|
|
"learning_rate": 7.979860923406654e-06,
|
||
|
|
"loss": 0.6162,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06370875995449374,
|
||
|
|
"grad_norm": 1.3358171325973744,
|
||
|
|
"learning_rate": 7.977424231597266e-06,
|
||
|
|
"loss": 0.6323,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06552901023890785,
|
||
|
|
"grad_norm": 1.2668022187917536,
|
||
|
|
"learning_rate": 7.97484883957297e-06,
|
||
|
|
"loss": 0.5481,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06734926052332196,
|
||
|
|
"grad_norm": 1.3032578120954865,
|
||
|
|
"learning_rate": 7.972134837142497e-06,
|
||
|
|
"loss": 0.6982,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06916951080773606,
|
||
|
|
"grad_norm": 1.438077454657845,
|
||
|
|
"learning_rate": 7.969282318948179e-06,
|
||
|
|
"loss": 0.6386,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07098976109215017,
|
||
|
|
"grad_norm": 1.2348261155030926,
|
||
|
|
"learning_rate": 7.966291384462662e-06,
|
||
|
|
"loss": 0.5691,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07281001137656427,
|
||
|
|
"grad_norm": 1.7781144758356542,
|
||
|
|
"learning_rate": 7.963162137985416e-06,
|
||
|
|
"loss": 0.6133,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07463026166097839,
|
||
|
|
"grad_norm": 1.3915614778891998,
|
||
|
|
"learning_rate": 7.959894688639114e-06,
|
||
|
|
"loss": 0.6097,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07645051194539249,
|
||
|
|
"grad_norm": 1.3714026109891253,
|
||
|
|
"learning_rate": 7.956489150365818e-06,
|
||
|
|
"loss": 0.7127,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0782707622298066,
|
||
|
|
"grad_norm": 1.2639378240340353,
|
||
|
|
"learning_rate": 7.952945641923014e-06,
|
||
|
|
"loss": 0.5649,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08009101251422071,
|
||
|
|
"grad_norm": 1.3658987432003644,
|
||
|
|
"learning_rate": 7.949264286879461e-06,
|
||
|
|
"loss": 0.5975,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08191126279863481,
|
||
|
|
"grad_norm": 1.3869528512684368,
|
||
|
|
"learning_rate": 7.94544521361089e-06,
|
||
|
|
"loss": 0.5851,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08373151308304892,
|
||
|
|
"grad_norm": 1.3457362580768018,
|
||
|
|
"learning_rate": 7.941488555295519e-06,
|
||
|
|
"loss": 0.6241,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08555176336746302,
|
||
|
|
"grad_norm": 1.3016443215121218,
|
||
|
|
"learning_rate": 7.937394449909417e-06,
|
||
|
|
"loss": 0.5603,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08737201365187713,
|
||
|
|
"grad_norm": 1.4891468132527028,
|
||
|
|
"learning_rate": 7.933163040221691e-06,
|
||
|
|
"loss": 0.6103,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08919226393629125,
|
||
|
|
"grad_norm": 1.3296810848522813,
|
||
|
|
"learning_rate": 7.928794473789502e-06,
|
||
|
|
"loss": 0.5823,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09101251422070535,
|
||
|
|
"grad_norm": 1.5647412617424314,
|
||
|
|
"learning_rate": 7.924288902952924e-06,
|
||
|
|
"loss": 0.6222,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09283276450511946,
|
||
|
|
"grad_norm": 1.224983623587384,
|
||
|
|
"learning_rate": 7.91964648482963e-06,
|
||
|
|
"loss": 0.5779,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09465301478953356,
|
||
|
|
"grad_norm": 1.3384527596977494,
|
||
|
|
"learning_rate": 7.914867381309417e-06,
|
||
|
|
"loss": 0.5721,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09647326507394767,
|
||
|
|
"grad_norm": 1.3725597146198896,
|
||
|
|
"learning_rate": 7.909951759048553e-06,
|
||
|
|
"loss": 0.6531,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09829351535836177,
|
||
|
|
"grad_norm": 1.4825184286499173,
|
||
|
|
"learning_rate": 7.904899789463974e-06,
|
||
|
|
"loss": 0.5767,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10011376564277588,
|
||
|
|
"grad_norm": 1.2710681313032695,
|
||
|
|
"learning_rate": 7.899711648727295e-06,
|
||
|
|
"loss": 0.5447,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10193401592719,
|
||
|
|
"grad_norm": 1.468311538305991,
|
||
|
|
"learning_rate": 7.894387517758679e-06,
|
||
|
|
"loss": 0.6303,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1037542662116041,
|
||
|
|
"grad_norm": 1.1661278644804365,
|
||
|
|
"learning_rate": 7.888927582220521e-06,
|
||
|
|
"loss": 0.606,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1055745164960182,
|
||
|
|
"grad_norm": 1.3217868795788283,
|
||
|
|
"learning_rate": 7.883332032510978e-06,
|
||
|
|
"loss": 0.5329,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1073947667804323,
|
||
|
|
"grad_norm": 1.33515774996865,
|
||
|
|
"learning_rate": 7.877601063757322e-06,
|
||
|
|
"loss": 0.5335,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10921501706484642,
|
||
|
|
"grad_norm": 1.4916296734659389,
|
||
|
|
"learning_rate": 7.871734875809141e-06,
|
||
|
|
"loss": 0.5705,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11103526734926053,
|
||
|
|
"grad_norm": 1.3630503455015932,
|
||
|
|
"learning_rate": 7.86573367323137e-06,
|
||
|
|
"loss": 0.6279,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11285551763367463,
|
||
|
|
"grad_norm": 1.2409472798363426,
|
||
|
|
"learning_rate": 7.859597665297158e-06,
|
||
|
|
"loss": 0.5096,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11467576791808874,
|
||
|
|
"grad_norm": 1.241131166674189,
|
||
|
|
"learning_rate": 7.853327065980567e-06,
|
||
|
|
"loss": 0.5792,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11649601820250284,
|
||
|
|
"grad_norm": 1.2488483644853932,
|
||
|
|
"learning_rate": 7.84692209394911e-06,
|
||
|
|
"loss": 0.5191,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11831626848691695,
|
||
|
|
"grad_norm": 1.4362376863954367,
|
||
|
|
"learning_rate": 7.84038297255613e-06,
|
||
|
|
"loss": 0.5749,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12013651877133105,
|
||
|
|
"grad_norm": 1.301660597552013,
|
||
|
|
"learning_rate": 7.83370992983301e-06,
|
||
|
|
"loss": 0.5598,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12195676905574517,
|
||
|
|
"grad_norm": 1.2871188302606258,
|
||
|
|
"learning_rate": 7.826903198481218e-06,
|
||
|
|
"loss": 0.6357,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12377701934015928,
|
||
|
|
"grad_norm": 1.2412889792540172,
|
||
|
|
"learning_rate": 7.819963015864195e-06,
|
||
|
|
"loss": 0.6025,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12559726962457338,
|
||
|
|
"grad_norm": 1.2417123308251397,
|
||
|
|
"learning_rate": 7.812889623999077e-06,
|
||
|
|
"loss": 0.5973,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12741751990898748,
|
||
|
|
"grad_norm": 1.334500041209377,
|
||
|
|
"learning_rate": 7.805683269548253e-06,
|
||
|
|
"loss": 0.5339,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1292377701934016,
|
||
|
|
"grad_norm": 1.2800707133087328,
|
||
|
|
"learning_rate": 7.798344203810772e-06,
|
||
|
|
"loss": 0.5506,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1310580204778157,
|
||
|
|
"grad_norm": 1.2634004680746123,
|
||
|
|
"learning_rate": 7.790872682713567e-06,
|
||
|
|
"loss": 0.554,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1328782707622298,
|
||
|
|
"grad_norm": 1.3715962042311087,
|
||
|
|
"learning_rate": 7.783268966802538e-06,
|
||
|
|
"loss": 0.5949,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13469852104664393,
|
||
|
|
"grad_norm": 1.4002060242886838,
|
||
|
|
"learning_rate": 7.77553332123347e-06,
|
||
|
|
"loss": 0.6422,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13651877133105803,
|
||
|
|
"grad_norm": 1.2759192431077615,
|
||
|
|
"learning_rate": 7.767666015762775e-06,
|
||
|
|
"loss": 0.607,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13833902161547212,
|
||
|
|
"grad_norm": 1.6921865669723448,
|
||
|
|
"learning_rate": 7.7596673247381e-06,
|
||
|
|
"loss": 0.6002,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14015927189988622,
|
||
|
|
"grad_norm": 1.3571851968954738,
|
||
|
|
"learning_rate": 7.751537527088742e-06,
|
||
|
|
"loss": 0.5215,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14197952218430035,
|
||
|
|
"grad_norm": 1.4785928356534102,
|
||
|
|
"learning_rate": 7.743276906315936e-06,
|
||
|
|
"loss": 0.6101,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14379977246871445,
|
||
|
|
"grad_norm": 1.465222696303414,
|
||
|
|
"learning_rate": 7.734885750482967e-06,
|
||
|
|
"loss": 0.6187,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14562002275312855,
|
||
|
|
"grad_norm": 1.264573511241066,
|
||
|
|
"learning_rate": 7.726364352205116e-06,
|
||
|
|
"loss": 0.5673,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14744027303754267,
|
||
|
|
"grad_norm": 1.238555624330946,
|
||
|
|
"learning_rate": 7.717713008639463e-06,
|
||
|
|
"loss": 0.6066,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14926052332195677,
|
||
|
|
"grad_norm": 1.20038139697854,
|
||
|
|
"learning_rate": 7.708932021474524e-06,
|
||
|
|
"loss": 0.5678,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15108077360637087,
|
||
|
|
"grad_norm": 1.3190323958334018,
|
||
|
|
"learning_rate": 7.70002169691973e-06,
|
||
|
|
"loss": 0.5544,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15290102389078497,
|
||
|
|
"grad_norm": 1.3333392166861238,
|
||
|
|
"learning_rate": 7.690982345694746e-06,
|
||
|
|
"loss": 0.5212,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1547212741751991,
|
||
|
|
"grad_norm": 1.5189079377057624,
|
||
|
|
"learning_rate": 7.68181428301864e-06,
|
||
|
|
"loss": 0.5411,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1565415244596132,
|
||
|
|
"grad_norm": 1.2341512401643826,
|
||
|
|
"learning_rate": 7.67251782859889e-06,
|
||
|
|
"loss": 0.5,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1583617747440273,
|
||
|
|
"grad_norm": 1.3710782832898465,
|
||
|
|
"learning_rate": 7.663093306620228e-06,
|
||
|
|
"loss": 0.567,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16018202502844142,
|
||
|
|
"grad_norm": 1.3224943188093254,
|
||
|
|
"learning_rate": 7.653541045733351e-06,
|
||
|
|
"loss": 0.6514,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16200227531285552,
|
||
|
|
"grad_norm": 1.3825136099871158,
|
||
|
|
"learning_rate": 7.643861379043442e-06,
|
||
|
|
"loss": 0.49,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16382252559726962,
|
||
|
|
"grad_norm": 1.6656667334345212,
|
||
|
|
"learning_rate": 7.634054644098566e-06,
|
||
|
|
"loss": 0.649,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16564277588168372,
|
||
|
|
"grad_norm": 1.3683202937271444,
|
||
|
|
"learning_rate": 7.624121182877892e-06,
|
||
|
|
"loss": 0.497,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16746302616609784,
|
||
|
|
"grad_norm": 1.5512864267072353,
|
||
|
|
"learning_rate": 7.614061341779777e-06,
|
||
|
|
"loss": 0.6176,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16928327645051194,
|
||
|
|
"grad_norm": 1.5790193819370095,
|
||
|
|
"learning_rate": 7.6038754716096755e-06,
|
||
|
|
"loss": 0.5634,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17110352673492604,
|
||
|
|
"grad_norm": 1.4344008312589909,
|
||
|
|
"learning_rate": 7.593563927567915e-06,
|
||
|
|
"loss": 0.5932,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17292377701934017,
|
||
|
|
"grad_norm": 1.2510278162330568,
|
||
|
|
"learning_rate": 7.583127069237302e-06,
|
||
|
|
"loss": 0.5604,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17474402730375427,
|
||
|
|
"grad_norm": 1.1926891094591303,
|
||
|
|
"learning_rate": 7.5725652605705876e-06,
|
||
|
|
"loss": 0.5746,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17656427758816837,
|
||
|
|
"grad_norm": 1.3458065119541616,
|
||
|
|
"learning_rate": 7.561878869877778e-06,
|
||
|
|
"loss": 0.5,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1783845278725825,
|
||
|
|
"grad_norm": 1.311426359460282,
|
||
|
|
"learning_rate": 7.551068269813282e-06,
|
||
|
|
"loss": 0.503,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1802047781569966,
|
||
|
|
"grad_norm": 1.2792152183438508,
|
||
|
|
"learning_rate": 7.540133837362924e-06,
|
||
|
|
"loss": 0.5279,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1820250284414107,
|
||
|
|
"grad_norm": 1.2349765362905594,
|
||
|
|
"learning_rate": 7.5290759538307944e-06,
|
||
|
|
"loss": 0.5159,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1838452787258248,
|
||
|
|
"grad_norm": 1.3587018583909733,
|
||
|
|
"learning_rate": 7.517895004825955e-06,
|
||
|
|
"loss": 0.573,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18566552901023892,
|
||
|
|
"grad_norm": 1.3554993825796526,
|
||
|
|
"learning_rate": 7.506591380248991e-06,
|
||
|
|
"loss": 0.5801,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18748577929465302,
|
||
|
|
"grad_norm": 1.2364653944665345,
|
||
|
|
"learning_rate": 7.495165474278411e-06,
|
||
|
|
"loss": 0.5618,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18930602957906711,
|
||
|
|
"grad_norm": 1.226975873209754,
|
||
|
|
"learning_rate": 7.483617685356906e-06,
|
||
|
|
"loss": 0.6663,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19112627986348124,
|
||
|
|
"grad_norm": 1.4312274290636884,
|
||
|
|
"learning_rate": 7.471948416177452e-06,
|
||
|
|
"loss": 0.5473,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19294653014789534,
|
||
|
|
"grad_norm": 1.467104665014613,
|
||
|
|
"learning_rate": 7.460158073669271e-06,
|
||
|
|
"loss": 0.5418,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19476678043230944,
|
||
|
|
"grad_norm": 1.1804815586636788,
|
||
|
|
"learning_rate": 7.448247068983638e-06,
|
||
|
|
"loss": 0.5378,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19658703071672354,
|
||
|
|
"grad_norm": 1.3602412614708939,
|
||
|
|
"learning_rate": 7.43621581747954e-06,
|
||
|
|
"loss": 0.5026,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19840728100113766,
|
||
|
|
"grad_norm": 1.2691524680339796,
|
||
|
|
"learning_rate": 7.4240647387092e-06,
|
||
|
|
"loss": 0.591,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20022753128555176,
|
||
|
|
"grad_norm": 1.2783869708675566,
|
||
|
|
"learning_rate": 7.411794256403439e-06,
|
||
|
|
"loss": 0.5085,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20204778156996586,
|
||
|
|
"grad_norm": 1.2804361153159327,
|
||
|
|
"learning_rate": 7.399404798456901e-06,
|
||
|
|
"loss": 0.6244,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20386803185438,
|
||
|
|
"grad_norm": 1.2444898246213776,
|
||
|
|
"learning_rate": 7.3868967969131364e-06,
|
||
|
|
"loss": 0.5313,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2056882821387941,
|
||
|
|
"grad_norm": 1.3015010555018793,
|
||
|
|
"learning_rate": 7.374270687949531e-06,
|
||
|
|
"loss": 0.5512,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2075085324232082,
|
||
|
|
"grad_norm": 1.2989535634921763,
|
||
|
|
"learning_rate": 7.3615269118620945e-06,
|
||
|
|
"loss": 0.5612,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20932878270762229,
|
||
|
|
"grad_norm": 1.2893299441070913,
|
||
|
|
"learning_rate": 7.348665913050114e-06,
|
||
|
|
"loss": 0.4779,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2111490329920364,
|
||
|
|
"grad_norm": 1.4374586177315487,
|
||
|
|
"learning_rate": 7.3356881400006485e-06,
|
||
|
|
"loss": 0.6057,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2129692832764505,
|
||
|
|
"grad_norm": 1.2587242021503462,
|
||
|
|
"learning_rate": 7.3225940452728915e-06,
|
||
|
|
"loss": 0.5679,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2147895335608646,
|
||
|
|
"grad_norm": 1.4250046519243573,
|
||
|
|
"learning_rate": 7.309384085482396e-06,
|
||
|
|
"loss": 0.5,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21660978384527874,
|
||
|
|
"grad_norm": 1.222873795275555,
|
||
|
|
"learning_rate": 7.29605872128514e-06,
|
||
|
|
"loss": 0.6714,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21843003412969283,
|
||
|
|
"grad_norm": 1.4243471522403268,
|
||
|
|
"learning_rate": 7.282618417361476e-06,
|
||
|
|
"loss": 0.6238,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22025028441410693,
|
||
|
|
"grad_norm": 1.3065783409043068,
|
||
|
|
"learning_rate": 7.269063642399912e-06,
|
||
|
|
"loss": 0.5464,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22207053469852106,
|
||
|
|
"grad_norm": 1.4517652323683794,
|
||
|
|
"learning_rate": 7.25539486908078e-06,
|
||
|
|
"loss": 0.4985,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22389078498293516,
|
||
|
|
"grad_norm": 1.1483510986901082,
|
||
|
|
"learning_rate": 7.241612574059745e-06,
|
||
|
|
"loss": 0.4978,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22571103526734926,
|
||
|
|
"grad_norm": 1.4389217565677268,
|
||
|
|
"learning_rate": 7.227717237951189e-06,
|
||
|
|
"loss": 0.6112,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22753128555176336,
|
||
|
|
"grad_norm": 1.3389550682905482,
|
||
|
|
"learning_rate": 7.213709345311444e-06,
|
||
|
|
"loss": 0.6476,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22935153583617748,
|
||
|
|
"grad_norm": 1.4169245520418259,
|
||
|
|
"learning_rate": 7.1995893846219035e-06,
|
||
|
|
"loss": 0.5354,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23117178612059158,
|
||
|
|
"grad_norm": 1.2613504469980097,
|
||
|
|
"learning_rate": 7.185357848271977e-06,
|
||
|
|
"loss": 0.5467,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23299203640500568,
|
||
|
|
"grad_norm": 1.1666125829814091,
|
||
|
|
"learning_rate": 7.17101523254193e-06,
|
||
|
|
"loss": 0.4698,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2348122866894198,
|
||
|
|
"grad_norm": 1.352110005355786,
|
||
|
|
"learning_rate": 7.156562037585575e-06,
|
||
|
|
"loss": 0.6109,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2366325369738339,
|
||
|
|
"grad_norm": 1.2180780408157383,
|
||
|
|
"learning_rate": 7.1419987674128225e-06,
|
||
|
|
"loss": 0.5332,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.238452787258248,
|
||
|
|
"grad_norm": 1.3933377570677665,
|
||
|
|
"learning_rate": 7.127325929872119e-06,
|
||
|
|
"loss": 0.6671,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2402730375426621,
|
||
|
|
"grad_norm": 1.5246008222442193,
|
||
|
|
"learning_rate": 7.1125440366327245e-06,
|
||
|
|
"loss": 0.6212,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24209328782707623,
|
||
|
|
"grad_norm": 1.2745899656103845,
|
||
|
|
"learning_rate": 7.0976536031668775e-06,
|
||
|
|
"loss": 0.6395,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24391353811149033,
|
||
|
|
"grad_norm": 1.226190343569202,
|
||
|
|
"learning_rate": 7.082655148731815e-06,
|
||
|
|
"loss": 0.5761,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24573378839590443,
|
||
|
|
"grad_norm": 1.1905155038972701,
|
||
|
|
"learning_rate": 7.067549196351669e-06,
|
||
|
|
"loss": 0.5418,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24755403868031856,
|
||
|
|
"grad_norm": 1.2116012422540454,
|
||
|
|
"learning_rate": 7.052336272799226e-06,
|
||
|
|
"loss": 0.5273,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24937428896473265,
|
||
|
|
"grad_norm": 1.282002116739219,
|
||
|
|
"learning_rate": 7.037016908577555e-06,
|
||
|
|
"loss": 0.4506,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25119453924914675,
|
||
|
|
"grad_norm": 1.2499521919340497,
|
||
|
|
"learning_rate": 7.02159163790151e-06,
|
||
|
|
"loss": 0.5606,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2530147895335609,
|
||
|
|
"grad_norm": 1.7373606199315674,
|
||
|
|
"learning_rate": 7.006060998679105e-06,
|
||
|
|
"loss": 0.559,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25483503981797495,
|
||
|
|
"grad_norm": 1.4650449441633262,
|
||
|
|
"learning_rate": 6.990425532492747e-06,
|
||
|
|
"loss": 0.5135,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2566552901023891,
|
||
|
|
"grad_norm": 1.154885986165648,
|
||
|
|
"learning_rate": 6.974685784580359e-06,
|
||
|
|
"loss": 0.5039,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2584755403868032,
|
||
|
|
"grad_norm": 1.2096650723076037,
|
||
|
|
"learning_rate": 6.958842303816359e-06,
|
||
|
|
"loss": 0.5079,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2602957906712173,
|
||
|
|
"grad_norm": 1.1620810226211598,
|
||
|
|
"learning_rate": 6.942895642692527e-06,
|
||
|
|
"loss": 0.5245,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2621160409556314,
|
||
|
|
"grad_norm": 1.3200231462656904,
|
||
|
|
"learning_rate": 6.926846357298732e-06,
|
||
|
|
"loss": 0.5935,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26393629124004553,
|
||
|
|
"grad_norm": 1.2355460824632627,
|
||
|
|
"learning_rate": 6.910695007303544e-06,
|
||
|
|
"loss": 0.5543,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2657565415244596,
|
||
|
|
"grad_norm": 1.2033075815432748,
|
||
|
|
"learning_rate": 6.894442155934719e-06,
|
||
|
|
"loss": 0.4831,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2675767918088737,
|
||
|
|
"grad_norm": 1.1907917135137838,
|
||
|
|
"learning_rate": 6.878088369959553e-06,
|
||
|
|
"loss": 0.5221,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26939704209328785,
|
||
|
|
"grad_norm": 1.3255509879738674,
|
||
|
|
"learning_rate": 6.861634219665117e-06,
|
||
|
|
"loss": 0.6086,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2712172923777019,
|
||
|
|
"grad_norm": 1.2271219658785495,
|
||
|
|
"learning_rate": 6.845080278838381e-06,
|
||
|
|
"loss": 0.5825,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27303754266211605,
|
||
|
|
"grad_norm": 1.1690589610793065,
|
||
|
|
"learning_rate": 6.82842712474619e-06,
|
||
|
|
"loss": 0.5807,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2748577929465301,
|
||
|
|
"grad_norm": 1.2888207762623227,
|
||
|
|
"learning_rate": 6.811675338115146e-06,
|
||
|
|
"loss": 0.6188,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27667804323094425,
|
||
|
|
"grad_norm": 1.2514369430789496,
|
||
|
|
"learning_rate": 6.7948255031113505e-06,
|
||
|
|
"loss": 0.5913,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2784982935153584,
|
||
|
|
"grad_norm": 1.3076941421175066,
|
||
|
|
"learning_rate": 6.777878207320034e-06,
|
||
|
|
"loss": 0.5054,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28031854379977245,
|
||
|
|
"grad_norm": 1.2943985087075844,
|
||
|
|
"learning_rate": 6.760834041725068e-06,
|
||
|
|
"loss": 0.4915,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2821387940841866,
|
||
|
|
"grad_norm": 1.2887581327542428,
|
||
|
|
"learning_rate": 6.743693600688352e-06,
|
||
|
|
"loss": 0.5538,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2839590443686007,
|
||
|
|
"grad_norm": 1.157113796843012,
|
||
|
|
"learning_rate": 6.726457481929095e-06,
|
||
|
|
"loss": 0.537,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28577929465301477,
|
||
|
|
"grad_norm": 1.1766314672266696,
|
||
|
|
"learning_rate": 6.7091262865029645e-06,
|
||
|
|
"loss": 0.5896,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2875995449374289,
|
||
|
|
"grad_norm": 1.2292202798354899,
|
||
|
|
"learning_rate": 6.691700618781126e-06,
|
||
|
|
"loss": 0.6347,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.289419795221843,
|
||
|
|
"grad_norm": 1.1513406588801496,
|
||
|
|
"learning_rate": 6.674181086429177e-06,
|
||
|
|
"loss": 0.4663,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2912400455062571,
|
||
|
|
"grad_norm": 1.2946840584006447,
|
||
|
|
"learning_rate": 6.656568300385944e-06,
|
||
|
|
"loss": 0.6247,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2930602957906712,
|
||
|
|
"grad_norm": 1.2952632442735728,
|
||
|
|
"learning_rate": 6.6388628748421895e-06,
|
||
|
|
"loss": 0.4728,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29488054607508535,
|
||
|
|
"grad_norm": 1.2684543190366842,
|
||
|
|
"learning_rate": 6.62106542721918e-06,
|
||
|
|
"loss": 0.5,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2967007963594994,
|
||
|
|
"grad_norm": 1.2434590334770437,
|
||
|
|
"learning_rate": 6.603176578147174e-06,
|
||
|
|
"loss": 0.552,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29852104664391355,
|
||
|
|
"grad_norm": 1.2853162665121605,
|
||
|
|
"learning_rate": 6.585196951443763e-06,
|
||
|
|
"loss": 0.5311,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3003412969283277,
|
||
|
|
"grad_norm": 1.3910181828529422,
|
||
|
|
"learning_rate": 6.5671271740921266e-06,
|
||
|
|
"loss": 0.5595,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30216154721274174,
|
||
|
|
"grad_norm": 1.318698152941268,
|
||
|
|
"learning_rate": 6.548967876219163e-06,
|
||
|
|
"loss": 0.5323,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30398179749715587,
|
||
|
|
"grad_norm": 1.259567167046916,
|
||
|
|
"learning_rate": 6.530719691073521e-06,
|
||
|
|
"loss": 0.5773,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30580204778156994,
|
||
|
|
"grad_norm": 1.3201679730014977,
|
||
|
|
"learning_rate": 6.5123832550035165e-06,
|
||
|
|
"loss": 0.5143,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30762229806598407,
|
||
|
|
"grad_norm": 1.3232034824966301,
|
||
|
|
"learning_rate": 6.493959207434934e-06,
|
||
|
|
"loss": 0.553,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3094425483503982,
|
||
|
|
"grad_norm": 1.3960220649200046,
|
||
|
|
"learning_rate": 6.47544819084874e-06,
|
||
|
|
"loss": 0.561,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31126279863481227,
|
||
|
|
"grad_norm": 1.3348060079340793,
|
||
|
|
"learning_rate": 6.4568508507586715e-06,
|
||
|
|
"loss": 0.5047,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3130830489192264,
|
||
|
|
"grad_norm": 1.2868651237482562,
|
||
|
|
"learning_rate": 6.438167835688725e-06,
|
||
|
|
"loss": 0.5094,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3149032992036405,
|
||
|
|
"grad_norm": 1.2603952904899627,
|
||
|
|
"learning_rate": 6.41939979715055e-06,
|
||
|
|
"loss": 0.5323,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3167235494880546,
|
||
|
|
"grad_norm": 1.2921556438401538,
|
||
|
|
"learning_rate": 6.400547389620716e-06,
|
||
|
|
"loss": 0.5554,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3185437997724687,
|
||
|
|
"grad_norm": 1.2590564886848532,
|
||
|
|
"learning_rate": 6.3816112705178984e-06,
|
||
|
|
"loss": 0.5288,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32036405005688284,
|
||
|
|
"grad_norm": 1.3084436554782835,
|
||
|
|
"learning_rate": 6.362592100179958e-06,
|
||
|
|
"loss": 0.6402,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3221843003412969,
|
||
|
|
"grad_norm": 1.4261334165831296,
|
||
|
|
"learning_rate": 6.343490541840899e-06,
|
||
|
|
"loss": 0.489,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32400455062571104,
|
||
|
|
"grad_norm": 1.4986503257367303,
|
||
|
|
"learning_rate": 6.3243072616077535e-06,
|
||
|
|
"loss": 0.5957,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32582480091012517,
|
||
|
|
"grad_norm": 1.2030242787629297,
|
||
|
|
"learning_rate": 6.3050429284373465e-06,
|
||
|
|
"loss": 0.4974,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32764505119453924,
|
||
|
|
"grad_norm": 1.3717989144113625,
|
||
|
|
"learning_rate": 6.285698214112974e-06,
|
||
|
|
"loss": 0.593,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32946530147895337,
|
||
|
|
"grad_norm": 1.2021644407962897,
|
||
|
|
"learning_rate": 6.2662737932209695e-06,
|
||
|
|
"loss": 0.616,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33128555176336744,
|
||
|
|
"grad_norm": 1.281410126238882,
|
||
|
|
"learning_rate": 6.246770343127185e-06,
|
||
|
|
"loss": 0.5598,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33310580204778156,
|
||
|
|
"grad_norm": 1.39438795848328,
|
||
|
|
"learning_rate": 6.227188543953368e-06,
|
||
|
|
"loss": 0.5932,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33310580204778156,
|
||
|
|
"eval_accuracy": 0.8092656088844726,
|
||
|
|
"eval_accuracy_first_token": 0.7541679610645128,
|
||
|
|
"eval_accuracy_first_token_<": 0.9296587926509187,
|
||
|
|
"eval_accuracy_first_token_<_total": 1905,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>": 0.8752515090543259,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>_total": 994,
|
||
|
|
"eval_accuracy_first_token_Certainly": 0.7024793388429752,
|
||
|
|
"eval_accuracy_first_token_Certainly_total": 363,
|
||
|
|
"eval_accuracy_first_token_The": 0.9059161873459326,
|
||
|
|
"eval_accuracy_first_token_The_total": 2434,
|
||
|
|
"eval_accuracy_first_token_To": 0.8237179487179487,
|
||
|
|
"eval_accuracy_first_token_To_total": 936,
|
||
|
|
"eval_loss": 0.5801064372062683,
|
||
|
|
"eval_perplexity": 1.1141803737974993,
|
||
|
|
"eval_runtime": 508.2626,
|
||
|
|
"eval_samples_per_second": 1.371,
|
||
|
|
"eval_steps_per_second": 0.687,
|
||
|
|
"eval_total_number_first_token": 9657,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3349260523321957,
|
||
|
|
"grad_norm": 1.3081524231573554,
|
||
|
|
"learning_rate": 6.207529078553444e-06,
|
||
|
|
"loss": 0.5457,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33674630261660976,
|
||
|
|
"grad_norm": 1.3716078335539046,
|
||
|
|
"learning_rate": 6.1877926324897085e-06,
|
||
|
|
"loss": 0.5473,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3385665529010239,
|
||
|
|
"grad_norm": 1.4145939624062198,
|
||
|
|
"learning_rate": 6.16797989400891e-06,
|
||
|
|
"loss": 0.5786,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.340386803185438,
|
||
|
|
"grad_norm": 1.2294721126569037,
|
||
|
|
"learning_rate": 6.148091554018264e-06,
|
||
|
|
"loss": 0.5902,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3422070534698521,
|
||
|
|
"grad_norm": 1.2925378088030424,
|
||
|
|
"learning_rate": 6.128128306061346e-06,
|
||
|
|
"loss": 0.5142,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3440273037542662,
|
||
|
|
"grad_norm": 1.2279588518524418,
|
||
|
|
"learning_rate": 6.108090846293915e-06,
|
||
|
|
"loss": 0.5135,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34584755403868034,
|
||
|
|
"grad_norm": 1.268146835786646,
|
||
|
|
"learning_rate": 6.087979873459634e-06,
|
||
|
|
"loss": 0.5447,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3476678043230944,
|
||
|
|
"grad_norm": 1.318995573559777,
|
||
|
|
"learning_rate": 6.0677960888657015e-06,
|
||
|
|
"loss": 0.6744,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34948805460750854,
|
||
|
|
"grad_norm": 1.231758642404661,
|
||
|
|
"learning_rate": 6.047540196358404e-06,
|
||
|
|
"loss": 0.5809,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35130830489192266,
|
||
|
|
"grad_norm": 1.2372891673165372,
|
||
|
|
"learning_rate": 6.02721290229856e-06,
|
||
|
|
"loss": 0.5807,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35312855517633673,
|
||
|
|
"grad_norm": 1.481210652387573,
|
||
|
|
"learning_rate": 6.006814915536894e-06,
|
||
|
|
"loss": 0.5936,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35494880546075086,
|
||
|
|
"grad_norm": 1.2305803524181071,
|
||
|
|
"learning_rate": 5.9863469473893225e-06,
|
||
|
|
"loss": 0.5438,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.356769055745165,
|
||
|
|
"grad_norm": 1.2206928256434937,
|
||
|
|
"learning_rate": 5.965809711612137e-06,
|
||
|
|
"loss": 0.5005,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35858930602957906,
|
||
|
|
"grad_norm": 1.086573502847394,
|
||
|
|
"learning_rate": 5.945203924377125e-06,
|
||
|
|
"loss": 0.4889,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3604095563139932,
|
||
|
|
"grad_norm": 1.3513470624112347,
|
||
|
|
"learning_rate": 5.92453030424659e-06,
|
||
|
|
"loss": 0.5599,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36222980659840726,
|
||
|
|
"grad_norm": 1.1113956980921844,
|
||
|
|
"learning_rate": 5.903789572148294e-06,
|
||
|
|
"loss": 0.5182,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3640500568828214,
|
||
|
|
"grad_norm": 1.4891052629080104,
|
||
|
|
"learning_rate": 5.88298245135032e-06,
|
||
|
|
"loss": 0.5716,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3658703071672355,
|
||
|
|
"grad_norm": 1.3005774833983796,
|
||
|
|
"learning_rate": 5.862109667435853e-06,
|
||
|
|
"loss": 0.5665,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3676905574516496,
|
||
|
|
"grad_norm": 1.2151067893045482,
|
||
|
|
"learning_rate": 5.8411719482778645e-06,
|
||
|
|
"loss": 0.4965,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3695108077360637,
|
||
|
|
"grad_norm": 1.5031392413729012,
|
||
|
|
"learning_rate": 5.820170024013746e-06,
|
||
|
|
"loss": 0.5398,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37133105802047783,
|
||
|
|
"grad_norm": 1.1627104663425107,
|
||
|
|
"learning_rate": 5.79910462701984e-06,
|
||
|
|
"loss": 0.4461,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3731513083048919,
|
||
|
|
"grad_norm": 1.3656640622390992,
|
||
|
|
"learning_rate": 5.777976491885903e-06,
|
||
|
|
"loss": 0.6048,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37497155858930603,
|
||
|
|
"grad_norm": 1.2327820864728312,
|
||
|
|
"learning_rate": 5.756786355389481e-06,
|
||
|
|
"loss": 0.5052,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37679180887372016,
|
||
|
|
"grad_norm": 1.3098660955959893,
|
||
|
|
"learning_rate": 5.735534956470232e-06,
|
||
|
|
"loss": 0.5507,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37861205915813423,
|
||
|
|
"grad_norm": 1.2828934352712993,
|
||
|
|
"learning_rate": 5.714223036204144e-06,
|
||
|
|
"loss": 0.5973,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38043230944254836,
|
||
|
|
"grad_norm": 1.1860097743128348,
|
||
|
|
"learning_rate": 5.6928513377777e-06,
|
||
|
|
"loss": 0.4965,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3822525597269625,
|
||
|
|
"grad_norm": 1.2517621268060033,
|
||
|
|
"learning_rate": 5.671420606461956e-06,
|
||
|
|
"loss": 0.5487,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38407281001137655,
|
||
|
|
"grad_norm": 1.271873202647325,
|
||
|
|
"learning_rate": 5.649931589586557e-06,
|
||
|
|
"loss": 0.5979,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3858930602957907,
|
||
|
|
"grad_norm": 1.2859574150365818,
|
||
|
|
"learning_rate": 5.628385036513676e-06,
|
||
|
|
"loss": 0.4776,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38771331058020475,
|
||
|
|
"grad_norm": 1.354537404525919,
|
||
|
|
"learning_rate": 5.606781698611878e-06,
|
||
|
|
"loss": 0.4877,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3895335608646189,
|
||
|
|
"grad_norm": 1.3284236870109494,
|
||
|
|
"learning_rate": 5.585122329229923e-06,
|
||
|
|
"loss": 0.5859,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.391353811149033,
|
||
|
|
"grad_norm": 1.2340130409038237,
|
||
|
|
"learning_rate": 5.56340768367049e-06,
|
||
|
|
"loss": 0.5305,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3931740614334471,
|
||
|
|
"grad_norm": 1.453487991090255,
|
||
|
|
"learning_rate": 5.541638519163849e-06,
|
||
|
|
"loss": 0.55,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3949943117178612,
|
||
|
|
"grad_norm": 1.298062739902415,
|
||
|
|
"learning_rate": 5.51981559484144e-06,
|
||
|
|
"loss": 0.5169,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39681456200227533,
|
||
|
|
"grad_norm": 1.311968473611326,
|
||
|
|
"learning_rate": 5.49793967170941e-06,
|
||
|
|
"loss": 0.5751,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3986348122866894,
|
||
|
|
"grad_norm": 1.2412899795871963,
|
||
|
|
"learning_rate": 5.476011512622076e-06,
|
||
|
|
"loss": 0.6166,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4004550625711035,
|
||
|
|
"grad_norm": 1.242116646999028,
|
||
|
|
"learning_rate": 5.454031882255319e-06,
|
||
|
|
"loss": 0.5578,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40227531285551765,
|
||
|
|
"grad_norm": 1.4168825327120473,
|
||
|
|
"learning_rate": 5.43200154707992e-06,
|
||
|
|
"loss": 0.5662,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4040955631399317,
|
||
|
|
"grad_norm": 1.426587220049501,
|
||
|
|
"learning_rate": 5.4099212753348294e-06,
|
||
|
|
"loss": 0.5169,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40591581342434585,
|
||
|
|
"grad_norm": 1.3014887756398712,
|
||
|
|
"learning_rate": 5.3877918370003806e-06,
|
||
|
|
"loss": 0.5117,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40773606370876,
|
||
|
|
"grad_norm": 1.3406748606110184,
|
||
|
|
"learning_rate": 5.365614003771439e-06,
|
||
|
|
"loss": 0.5549,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40955631399317405,
|
||
|
|
"grad_norm": 1.2661574936204552,
|
||
|
|
"learning_rate": 5.343388549030491e-06,
|
||
|
|
"loss": 0.5163,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4113765642775882,
|
||
|
|
"grad_norm": 1.085801335463159,
|
||
|
|
"learning_rate": 5.321116247820669e-06,
|
||
|
|
"loss": 0.5244,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4131968145620023,
|
||
|
|
"grad_norm": 1.1238570077454868,
|
||
|
|
"learning_rate": 5.298797876818734e-06,
|
||
|
|
"loss": 0.4877,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4150170648464164,
|
||
|
|
"grad_norm": 1.2501588033198834,
|
||
|
|
"learning_rate": 5.276434214307986e-06,
|
||
|
|
"loss": 0.5175,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4168373151308305,
|
||
|
|
"grad_norm": 1.1104132675236253,
|
||
|
|
"learning_rate": 5.2540260401511255e-06,
|
||
|
|
"loss": 0.4912,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41865756541524457,
|
||
|
|
"grad_norm": 1.3469476360006967,
|
||
|
|
"learning_rate": 5.231574135763052e-06,
|
||
|
|
"loss": 0.5119,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4204778156996587,
|
||
|
|
"grad_norm": 1.3917881004299013,
|
||
|
|
"learning_rate": 5.209079284083626e-06,
|
||
|
|
"loss": 0.5893,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4222980659840728,
|
||
|
|
"grad_norm": 1.3049155919134754,
|
||
|
|
"learning_rate": 5.186542269550359e-06,
|
||
|
|
"loss": 0.5863,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4241183162684869,
|
||
|
|
"grad_norm": 1.4688437185729748,
|
||
|
|
"learning_rate": 5.163963878071058e-06,
|
||
|
|
"loss": 0.6134,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.425938566552901,
|
||
|
|
"grad_norm": 1.334594097803803,
|
||
|
|
"learning_rate": 5.141344896996421e-06,
|
||
|
|
"loss": 0.4778,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42775881683731515,
|
||
|
|
"grad_norm": 1.349733087487026,
|
||
|
|
"learning_rate": 5.1186861150925844e-06,
|
||
|
|
"loss": 0.5989,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4295790671217292,
|
||
|
|
"grad_norm": 1.1945891754612503,
|
||
|
|
"learning_rate": 5.09598832251361e-06,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43139931740614335,
|
||
|
|
"grad_norm": 1.388728814777883,
|
||
|
|
"learning_rate": 5.073252310773939e-06,
|
||
|
|
"loss": 0.6193,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43321956769055747,
|
||
|
|
"grad_norm": 1.4284168159961905,
|
||
|
|
"learning_rate": 5.050478872720782e-06,
|
||
|
|
"loss": 0.5535,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43503981797497154,
|
||
|
|
"grad_norm": 1.3786811940208537,
|
||
|
|
"learning_rate": 5.027668802506476e-06,
|
||
|
|
"loss": 0.4974,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43686006825938567,
|
||
|
|
"grad_norm": 1.2927644778322318,
|
||
|
|
"learning_rate": 5.004822895560794e-06,
|
||
|
|
"loss": 0.5029,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4386803185437998,
|
||
|
|
"grad_norm": 1.252387489439096,
|
||
|
|
"learning_rate": 4.981941948563196e-06,
|
||
|
|
"loss": 0.5278,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44050056882821387,
|
||
|
|
"grad_norm": 1.339410297479304,
|
||
|
|
"learning_rate": 4.959026759415061e-06,
|
||
|
|
"loss": 0.4939,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.442320819112628,
|
||
|
|
"grad_norm": 1.364314280772671,
|
||
|
|
"learning_rate": 4.936078127211849e-06,
|
||
|
|
"loss": 0.5951,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4441410693970421,
|
||
|
|
"grad_norm": 1.3075257340817037,
|
||
|
|
"learning_rate": 4.913096852215248e-06,
|
||
|
|
"loss": 0.6049,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4459613196814562,
|
||
|
|
"grad_norm": 1.4449459168578944,
|
||
|
|
"learning_rate": 4.890083735825257e-06,
|
||
|
|
"loss": 0.6495,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4477815699658703,
|
||
|
|
"grad_norm": 1.250520396487667,
|
||
|
|
"learning_rate": 4.867039580552247e-06,
|
||
|
|
"loss": 0.5094,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4496018202502844,
|
||
|
|
"grad_norm": 1.3729253299629682,
|
||
|
|
"learning_rate": 4.843965189988969e-06,
|
||
|
|
"loss": 0.601,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4514220705346985,
|
||
|
|
"grad_norm": 1.3588267674574899,
|
||
|
|
"learning_rate": 4.820861368782537e-06,
|
||
|
|
"loss": 0.6282,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45324232081911264,
|
||
|
|
"grad_norm": 1.2605268123722788,
|
||
|
|
"learning_rate": 4.79772892260637e-06,
|
||
|
|
"loss": 0.5305,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4550625711035267,
|
||
|
|
"grad_norm": 1.2681522997617236,
|
||
|
|
"learning_rate": 4.774568658132086e-06,
|
||
|
|
"loss": 0.5748,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45688282138794084,
|
||
|
|
"grad_norm": 1.3533665486552986,
|
||
|
|
"learning_rate": 4.751381383001386e-06,
|
||
|
|
"loss": 0.4689,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45870307167235497,
|
||
|
|
"grad_norm": 1.267434752321521,
|
||
|
|
"learning_rate": 4.728167905797877e-06,
|
||
|
|
"loss": 0.534,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46052332195676904,
|
||
|
|
"grad_norm": 1.2225430366963792,
|
||
|
|
"learning_rate": 4.7049290360188875e-06,
|
||
|
|
"loss": 0.5003,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46234357224118316,
|
||
|
|
"grad_norm": 1.3388181688742944,
|
||
|
|
"learning_rate": 4.681665584047227e-06,
|
||
|
|
"loss": 0.5219,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4641638225255973,
|
||
|
|
"grad_norm": 1.3028544028541067,
|
||
|
|
"learning_rate": 4.658378361122936e-06,
|
||
|
|
"loss": 0.5452,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46598407281001136,
|
||
|
|
"grad_norm": 1.157965088527389,
|
||
|
|
"learning_rate": 4.6350681793149884e-06,
|
||
|
|
"loss": 0.5229,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4678043230944255,
|
||
|
|
"grad_norm": 1.3045701677810966,
|
||
|
|
"learning_rate": 4.611735851492984e-06,
|
||
|
|
"loss": 0.5728,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4696245733788396,
|
||
|
|
"grad_norm": 1.2850295730378811,
|
||
|
|
"learning_rate": 4.588382191298787e-06,
|
||
|
|
"loss": 0.5537,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4714448236632537,
|
||
|
|
"grad_norm": 1.1541773263319788,
|
||
|
|
"learning_rate": 4.5650080131181675e-06,
|
||
|
|
"loss": 0.538,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4732650739476678,
|
||
|
|
"grad_norm": 1.2910982881953428,
|
||
|
|
"learning_rate": 4.541614132052393e-06,
|
||
|
|
"loss": 0.5612,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4750853242320819,
|
||
|
|
"grad_norm": 1.3654547247389965,
|
||
|
|
"learning_rate": 4.51820136388981e-06,
|
||
|
|
"loss": 0.4475,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.476905574516496,
|
||
|
|
"grad_norm": 1.2698111645568326,
|
||
|
|
"learning_rate": 4.494770525077391e-06,
|
||
|
|
"loss": 0.5621,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47872582480091014,
|
||
|
|
"grad_norm": 1.3935553561279155,
|
||
|
|
"learning_rate": 4.4713224326922655e-06,
|
||
|
|
"loss": 0.599,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4805460750853242,
|
||
|
|
"grad_norm": 1.3156202456183232,
|
||
|
|
"learning_rate": 4.447857904413231e-06,
|
||
|
|
"loss": 0.532,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48236632536973834,
|
||
|
|
"grad_norm": 1.1337315757674442,
|
||
|
|
"learning_rate": 4.424377758492232e-06,
|
||
|
|
"loss": 0.5353,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48418657565415246,
|
||
|
|
"grad_norm": 1.4101220722381445,
|
||
|
|
"learning_rate": 4.40088281372583e-06,
|
||
|
|
"loss": 0.5006,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48600682593856653,
|
||
|
|
"grad_norm": 1.2300783759396936,
|
||
|
|
"learning_rate": 4.377373889426649e-06,
|
||
|
|
"loss": 0.5438,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48782707622298066,
|
||
|
|
"grad_norm": 1.17585511931677,
|
||
|
|
"learning_rate": 4.353851805394808e-06,
|
||
|
|
"loss": 0.5369,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4896473265073948,
|
||
|
|
"grad_norm": 1.2115284188121456,
|
||
|
|
"learning_rate": 4.33031738188933e-06,
|
||
|
|
"loss": 0.4524,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49146757679180886,
|
||
|
|
"grad_norm": 1.4317839486717954,
|
||
|
|
"learning_rate": 4.306771439599534e-06,
|
||
|
|
"loss": 0.6436,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.493287827076223,
|
||
|
|
"grad_norm": 1.4621543028546877,
|
||
|
|
"learning_rate": 4.283214799616428e-06,
|
||
|
|
"loss": 0.4368,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4951080773606371,
|
||
|
|
"grad_norm": 1.2936498719418335,
|
||
|
|
"learning_rate": 4.259648283404062e-06,
|
||
|
|
"loss": 0.5541,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4969283276450512,
|
||
|
|
"grad_norm": 1.120893056901481,
|
||
|
|
"learning_rate": 4.236072712770891e-06,
|
||
|
|
"loss": 0.5822,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4987485779294653,
|
||
|
|
"grad_norm": 1.429920580926913,
|
||
|
|
"learning_rate": 4.2124889098411175e-06,
|
||
|
|
"loss": 0.5302,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5005688282138794,
|
||
|
|
"grad_norm": 1.312546385183068,
|
||
|
|
"learning_rate": 4.1888976970260135e-06,
|
||
|
|
"loss": 0.4835,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5023890784982935,
|
||
|
|
"grad_norm": 1.3107231912852029,
|
||
|
|
"learning_rate": 4.165299896995252e-06,
|
||
|
|
"loss": 0.5421,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5042093287827076,
|
||
|
|
"grad_norm": 1.232720241327702,
|
||
|
|
"learning_rate": 4.141696332648216e-06,
|
||
|
|
"loss": 0.5012,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5060295790671218,
|
||
|
|
"grad_norm": 1.2132932647482422,
|
||
|
|
"learning_rate": 4.118087827085294e-06,
|
||
|
|
"loss": 0.5463,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5078498293515359,
|
||
|
|
"grad_norm": 1.270716742837696,
|
||
|
|
"learning_rate": 4.094475203579191e-06,
|
||
|
|
"loss": 0.5383,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5096700796359499,
|
||
|
|
"grad_norm": 1.2438093689244545,
|
||
|
|
"learning_rate": 4.070859285546209e-06,
|
||
|
|
"loss": 0.5556,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.511490329920364,
|
||
|
|
"grad_norm": 1.3516997697288733,
|
||
|
|
"learning_rate": 4.047240896517539e-06,
|
||
|
|
"loss": 0.6018,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5133105802047782,
|
||
|
|
"grad_norm": 1.2439174788361766,
|
||
|
|
"learning_rate": 4.023620860110533e-06,
|
||
|
|
"loss": 0.4133,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5151308304891923,
|
||
|
|
"grad_norm": 1.3778918541026397,
|
||
|
|
"learning_rate": 4e-06,
|
||
|
|
"loss": 0.6007,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5169510807736064,
|
||
|
|
"grad_norm": 1.2268835899477202,
|
||
|
|
"learning_rate": 3.976379139889467e-06,
|
||
|
|
"loss": 0.5331,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5187713310580204,
|
||
|
|
"grad_norm": 1.3429317349537357,
|
||
|
|
"learning_rate": 3.9527591034824616e-06,
|
||
|
|
"loss": 0.5311,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5205915813424346,
|
||
|
|
"grad_norm": 1.2134030361634403,
|
||
|
|
"learning_rate": 3.929140714453791e-06,
|
||
|
|
"loss": 0.481,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5224118316268487,
|
||
|
|
"grad_norm": 1.2462618006711519,
|
||
|
|
"learning_rate": 3.9055247964208075e-06,
|
||
|
|
"loss": 0.5273,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5242320819112628,
|
||
|
|
"grad_norm": 1.5559356945744065,
|
||
|
|
"learning_rate": 3.8819121729147055e-06,
|
||
|
|
"loss": 0.6021,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5260523321956769,
|
||
|
|
"grad_norm": 1.3832902722589653,
|
||
|
|
"learning_rate": 3.8583036673517845e-06,
|
||
|
|
"loss": 0.4454,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5278725824800911,
|
||
|
|
"grad_norm": 1.3978629990846738,
|
||
|
|
"learning_rate": 3.834700103004747e-06,
|
||
|
|
"loss": 0.5124,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5296928327645051,
|
||
|
|
"grad_norm": 1.2666994035299775,
|
||
|
|
"learning_rate": 3.8111023029739866e-06,
|
||
|
|
"loss": 0.4667,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5315130830489192,
|
||
|
|
"grad_norm": 1.3305625183535323,
|
||
|
|
"learning_rate": 3.787511090158884e-06,
|
||
|
|
"loss": 0.5368,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5333333333333333,
|
||
|
|
"grad_norm": 1.2554353067602742,
|
||
|
|
"learning_rate": 3.763927287229109e-06,
|
||
|
|
"loss": 0.5499,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5351535836177475,
|
||
|
|
"grad_norm": 1.307288214215709,
|
||
|
|
"learning_rate": 3.740351716595939e-06,
|
||
|
|
"loss": 0.5055,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5369738339021616,
|
||
|
|
"grad_norm": 1.3279514539943822,
|
||
|
|
"learning_rate": 3.7167852003835723e-06,
|
||
|
|
"loss": 0.511,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5387940841865757,
|
||
|
|
"grad_norm": 1.223828271060312,
|
||
|
|
"learning_rate": 3.6932285604004656e-06,
|
||
|
|
"loss": 0.4595,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5406143344709897,
|
||
|
|
"grad_norm": 1.332815030257366,
|
||
|
|
"learning_rate": 3.669682618110671e-06,
|
||
|
|
"loss": 0.6227,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5424345847554038,
|
||
|
|
"grad_norm": 1.2271598713957907,
|
||
|
|
"learning_rate": 3.646148194605191e-06,
|
||
|
|
"loss": 0.5925,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.544254835039818,
|
||
|
|
"grad_norm": 1.2856112828358344,
|
||
|
|
"learning_rate": 3.622626110573351e-06,
|
||
|
|
"loss": 0.4888,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5460750853242321,
|
||
|
|
"grad_norm": 1.2933732045646906,
|
||
|
|
"learning_rate": 3.5991171862741713e-06,
|
||
|
|
"loss": 0.5072,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5478953356086462,
|
||
|
|
"grad_norm": 1.928545669129003,
|
||
|
|
"learning_rate": 3.575622241507768e-06,
|
||
|
|
"loss": 0.525,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5497155858930602,
|
||
|
|
"grad_norm": 1.1113802518291283,
|
||
|
|
"learning_rate": 3.5521420955867683e-06,
|
||
|
|
"loss": 0.5977,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5515358361774744,
|
||
|
|
"grad_norm": 1.3617289948905469,
|
||
|
|
"learning_rate": 3.5286775673077332e-06,
|
||
|
|
"loss": 0.5839,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5533560864618885,
|
||
|
|
"grad_norm": 1.391130826033813,
|
||
|
|
"learning_rate": 3.505229474922609e-06,
|
||
|
|
"loss": 0.5181,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5551763367463026,
|
||
|
|
"grad_norm": 1.2316652646361441,
|
||
|
|
"learning_rate": 3.481798636110191e-06,
|
||
|
|
"loss": 0.4945,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5569965870307167,
|
||
|
|
"grad_norm": 1.2873086430494702,
|
||
|
|
"learning_rate": 3.458385867947607e-06,
|
||
|
|
"loss": 0.4924,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5588168373151309,
|
||
|
|
"grad_norm": 1.430369532857129,
|
||
|
|
"learning_rate": 3.434991986881833e-06,
|
||
|
|
"loss": 0.4821,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5606370875995449,
|
||
|
|
"grad_norm": 1.1646260708946579,
|
||
|
|
"learning_rate": 3.4116178087012136e-06,
|
||
|
|
"loss": 0.5052,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.562457337883959,
|
||
|
|
"grad_norm": 1.2327288356772756,
|
||
|
|
"learning_rate": 3.388264148507016e-06,
|
||
|
|
"loss": 0.5057,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5642775881683731,
|
||
|
|
"grad_norm": 1.3536008520463172,
|
||
|
|
"learning_rate": 3.3649318206850116e-06,
|
||
|
|
"loss": 0.5178,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5660978384527873,
|
||
|
|
"grad_norm": 1.1896041830424324,
|
||
|
|
"learning_rate": 3.3416216388770635e-06,
|
||
|
|
"loss": 0.5417,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5679180887372014,
|
||
|
|
"grad_norm": 1.2975919761075365,
|
||
|
|
"learning_rate": 3.3183344159527736e-06,
|
||
|
|
"loss": 0.5234,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5697383390216155,
|
||
|
|
"grad_norm": 1.1688323545338841,
|
||
|
|
"learning_rate": 3.2950709639811134e-06,
|
||
|
|
"loss": 0.4888,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5715585893060295,
|
||
|
|
"grad_norm": 1.3939258642019638,
|
||
|
|
"learning_rate": 3.271832094202123e-06,
|
||
|
|
"loss": 0.5183,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5733788395904437,
|
||
|
|
"grad_norm": 1.1897624823605304,
|
||
|
|
"learning_rate": 3.2486186169986153e-06,
|
||
|
|
"loss": 0.5454,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5751990898748578,
|
||
|
|
"grad_norm": 1.1944714828344472,
|
||
|
|
"learning_rate": 3.2254313418679154e-06,
|
||
|
|
"loss": 0.4807,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5770193401592719,
|
||
|
|
"grad_norm": 1.2256094296723554,
|
||
|
|
"learning_rate": 3.2022710773936304e-06,
|
||
|
|
"loss": 0.5223,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.578839590443686,
|
||
|
|
"grad_norm": 1.2296819521179183,
|
||
|
|
"learning_rate": 3.1791386312174633e-06,
|
||
|
|
"loss": 0.4951,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5806598407281001,
|
||
|
|
"grad_norm": 1.4525163093513003,
|
||
|
|
"learning_rate": 3.1560348100110315e-06,
|
||
|
|
"loss": 0.4874,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5824800910125142,
|
||
|
|
"grad_norm": 1.4766150215295402,
|
||
|
|
"learning_rate": 3.1329604194477535e-06,
|
||
|
|
"loss": 0.5186,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5843003412969283,
|
||
|
|
"grad_norm": 1.5620410502792905,
|
||
|
|
"learning_rate": 3.1099162641747427e-06,
|
||
|
|
"loss": 0.5542,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5861205915813424,
|
||
|
|
"grad_norm": 1.1792003162185065,
|
||
|
|
"learning_rate": 3.0869031477847507e-06,
|
||
|
|
"loss": 0.4751,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5879408418657566,
|
||
|
|
"grad_norm": 1.4371388899245734,
|
||
|
|
"learning_rate": 3.0639218727881508e-06,
|
||
|
|
"loss": 0.5066,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5897610921501707,
|
||
|
|
"grad_norm": 1.3321828422036859,
|
||
|
|
"learning_rate": 3.04097324058494e-06,
|
||
|
|
"loss": 0.418,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5915813424345847,
|
||
|
|
"grad_norm": 1.2642329608748821,
|
||
|
|
"learning_rate": 3.0180580514368034e-06,
|
||
|
|
"loss": 0.6167,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5934015927189988,
|
||
|
|
"grad_norm": 1.1538255682096556,
|
||
|
|
"learning_rate": 2.9951771044392066e-06,
|
||
|
|
"loss": 0.5297,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.595221843003413,
|
||
|
|
"grad_norm": 1.2453988044078719,
|
||
|
|
"learning_rate": 2.972331197493523e-06,
|
||
|
|
"loss": 0.4552,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5970420932878271,
|
||
|
|
"grad_norm": 1.3576907607149231,
|
||
|
|
"learning_rate": 2.949521127279218e-06,
|
||
|
|
"loss": 0.5003,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5988623435722412,
|
||
|
|
"grad_norm": 1.3497348777364608,
|
||
|
|
"learning_rate": 2.926747689226062e-06,
|
||
|
|
"loss": 0.5561,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6006825938566553,
|
||
|
|
"grad_norm": 1.259164100959422,
|
||
|
|
"learning_rate": 2.9040116774863896e-06,
|
||
|
|
"loss": 0.4856,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6025028441410694,
|
||
|
|
"grad_norm": 1.2529485220686642,
|
||
|
|
"learning_rate": 2.881313884907416e-06,
|
||
|
|
"loss": 0.5575,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6043230944254835,
|
||
|
|
"grad_norm": 1.0669208953569564,
|
||
|
|
"learning_rate": 2.8586551030035797e-06,
|
||
|
|
"loss": 0.4644,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6061433447098976,
|
||
|
|
"grad_norm": 1.3366563458096783,
|
||
|
|
"learning_rate": 2.836036121928942e-06,
|
||
|
|
"loss": 0.453,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6079635949943117,
|
||
|
|
"grad_norm": 1.1476061284968695,
|
||
|
|
"learning_rate": 2.813457730449641e-06,
|
||
|
|
"loss": 0.4207,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6097838452787259,
|
||
|
|
"grad_norm": 1.3411031407126155,
|
||
|
|
"learning_rate": 2.790920715916372e-06,
|
||
|
|
"loss": 0.5404,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6116040955631399,
|
||
|
|
"grad_norm": 1.1482077310699785,
|
||
|
|
"learning_rate": 2.7684258642369484e-06,
|
||
|
|
"loss": 0.5205,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.613424345847554,
|
||
|
|
"grad_norm": 1.42649538957105,
|
||
|
|
"learning_rate": 2.7459739598488762e-06,
|
||
|
|
"loss": 0.5013,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6152445961319681,
|
||
|
|
"grad_norm": 1.2243282105614175,
|
||
|
|
"learning_rate": 2.723565785692013e-06,
|
||
|
|
"loss": 0.5464,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6170648464163823,
|
||
|
|
"grad_norm": 1.2484628027395077,
|
||
|
|
"learning_rate": 2.701202123181266e-06,
|
||
|
|
"loss": 0.5519,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6188850967007964,
|
||
|
|
"grad_norm": 1.3240902412697022,
|
||
|
|
"learning_rate": 2.6788837521793328e-06,
|
||
|
|
"loss": 0.5205,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6207053469852105,
|
||
|
|
"grad_norm": 1.2873575493742448,
|
||
|
|
"learning_rate": 2.6566114509695096e-06,
|
||
|
|
"loss": 0.4761,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6225255972696245,
|
||
|
|
"grad_norm": 1.2093203753299095,
|
||
|
|
"learning_rate": 2.634385996228561e-06,
|
||
|
|
"loss": 0.4753,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6243458475540387,
|
||
|
|
"grad_norm": 1.300179853101682,
|
||
|
|
"learning_rate": 2.6122081629996195e-06,
|
||
|
|
"loss": 0.4934,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6261660978384528,
|
||
|
|
"grad_norm": 1.2890047749069995,
|
||
|
|
"learning_rate": 2.5900787246651715e-06,
|
||
|
|
"loss": 0.4873,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6279863481228669,
|
||
|
|
"grad_norm": 1.4341436462807016,
|
||
|
|
"learning_rate": 2.567998452920081e-06,
|
||
|
|
"loss": 0.5213,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.629806598407281,
|
||
|
|
"grad_norm": 1.299103431190263,
|
||
|
|
"learning_rate": 2.5459681177446797e-06,
|
||
|
|
"loss": 0.4783,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.631626848691695,
|
||
|
|
"grad_norm": 1.331377030608932,
|
||
|
|
"learning_rate": 2.523988487377924e-06,
|
||
|
|
"loss": 0.5045,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6334470989761092,
|
||
|
|
"grad_norm": 1.3571947037370755,
|
||
|
|
"learning_rate": 2.50206032829059e-06,
|
||
|
|
"loss": 0.5005,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6352673492605233,
|
||
|
|
"grad_norm": 1.5380704302051296,
|
||
|
|
"learning_rate": 2.4801844051585604e-06,
|
||
|
|
"loss": 0.5238,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6370875995449374,
|
||
|
|
"grad_norm": 1.3042046637167102,
|
||
|
|
"learning_rate": 2.4583614808361508e-06,
|
||
|
|
"loss": 0.4785,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6389078498293516,
|
||
|
|
"grad_norm": 1.1803375558623432,
|
||
|
|
"learning_rate": 2.4365923163295083e-06,
|
||
|
|
"loss": 0.5518,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6407281001137657,
|
||
|
|
"grad_norm": 1.1305498748692666,
|
||
|
|
"learning_rate": 2.4148776707700775e-06,
|
||
|
|
"loss": 0.4627,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6425483503981797,
|
||
|
|
"grad_norm": 1.3696351562191598,
|
||
|
|
"learning_rate": 2.393218301388123e-06,
|
||
|
|
"loss": 0.4714,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6443686006825938,
|
||
|
|
"grad_norm": 1.1646766669675297,
|
||
|
|
"learning_rate": 2.3716149634863244e-06,
|
||
|
|
"loss": 0.461,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.646188850967008,
|
||
|
|
"grad_norm": 1.432326181786707,
|
||
|
|
"learning_rate": 2.3500684104134433e-06,
|
||
|
|
"loss": 0.4775,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6480091012514221,
|
||
|
|
"grad_norm": 1.2811226649238618,
|
||
|
|
"learning_rate": 2.328579393538046e-06,
|
||
|
|
"loss": 0.4473,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6498293515358362,
|
||
|
|
"grad_norm": 1.2847418645420832,
|
||
|
|
"learning_rate": 2.3071486622223e-06,
|
||
|
|
"loss": 0.473,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6516496018202503,
|
||
|
|
"grad_norm": 1.1991535525500763,
|
||
|
|
"learning_rate": 2.2857769637958554e-06,
|
||
|
|
"loss": 0.4548,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6534698521046644,
|
||
|
|
"grad_norm": 1.3510869929117142,
|
||
|
|
"learning_rate": 2.2644650435297675e-06,
|
||
|
|
"loss": 0.474,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6552901023890785,
|
||
|
|
"grad_norm": 1.2247454838152558,
|
||
|
|
"learning_rate": 2.243213644610519e-06,
|
||
|
|
"loss": 0.4063,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6571103526734926,
|
||
|
|
"grad_norm": 1.224682187747472,
|
||
|
|
"learning_rate": 2.2220235081140985e-06,
|
||
|
|
"loss": 0.5137,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6589306029579067,
|
||
|
|
"grad_norm": 1.5257557938450914,
|
||
|
|
"learning_rate": 2.2008953729801583e-06,
|
||
|
|
"loss": 0.4591,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6607508532423209,
|
||
|
|
"grad_norm": 1.221121328273825,
|
||
|
|
"learning_rate": 2.1798299759862545e-06,
|
||
|
|
"loss": 0.5614,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6625711035267349,
|
||
|
|
"grad_norm": 1.2199344648755224,
|
||
|
|
"learning_rate": 2.158828051722137e-06,
|
||
|
|
"loss": 0.5104,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.664391353811149,
|
||
|
|
"grad_norm": 1.3052741269665118,
|
||
|
|
"learning_rate": 2.137890332564147e-06,
|
||
|
|
"loss": 0.4732,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6662116040955631,
|
||
|
|
"grad_norm": 1.1647023893060888,
|
||
|
|
"learning_rate": 2.117017548649678e-06,
|
||
|
|
"loss": 0.5229,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6662116040955631,
|
||
|
|
"eval_accuracy": 0.8129295561130228,
|
||
|
|
"eval_accuracy_first_token": 0.7684581132856995,
|
||
|
|
"eval_accuracy_first_token_<": 0.9595800524934384,
|
||
|
|
"eval_accuracy_first_token_<_total": 1905,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>": 0.9094567404426559,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>_total": 994,
|
||
|
|
"eval_accuracy_first_token_Certainly": 0.7741046831955923,
|
||
|
|
"eval_accuracy_first_token_Certainly_total": 363,
|
||
|
|
"eval_accuracy_first_token_The": 0.8948233360723089,
|
||
|
|
"eval_accuracy_first_token_The_total": 2434,
|
||
|
|
"eval_accuracy_first_token_To": 0.8044871794871795,
|
||
|
|
"eval_accuracy_first_token_To_total": 936,
|
||
|
|
"eval_loss": 0.5655013918876648,
|
||
|
|
"eval_perplexity": 1.1114110979501997,
|
||
|
|
"eval_runtime": 507.2948,
|
||
|
|
"eval_samples_per_second": 1.374,
|
||
|
|
"eval_steps_per_second": 0.688,
|
||
|
|
"eval_total_number_first_token": 9657,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6680318543799773,
|
||
|
|
"grad_norm": 1.2605323109478153,
|
||
|
|
"learning_rate": 2.0962104278517058e-06,
|
||
|
|
"loss": 0.4634,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6698521046643914,
|
||
|
|
"grad_norm": 1.153711484102447,
|
||
|
|
"learning_rate": 2.0754696957534105e-06,
|
||
|
|
"loss": 0.4578,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6716723549488055,
|
||
|
|
"grad_norm": 1.4112272127644152,
|
||
|
|
"learning_rate": 2.0547960756228746e-06,
|
||
|
|
"loss": 0.5903,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6734926052332195,
|
||
|
|
"grad_norm": 1.3058143917601592,
|
||
|
|
"learning_rate": 2.0341902883878626e-06,
|
||
|
|
"loss": 0.4261,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6753128555176336,
|
||
|
|
"grad_norm": 1.241032329122879,
|
||
|
|
"learning_rate": 2.013653052610678e-06,
|
||
|
|
"loss": 0.4901,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6771331058020478,
|
||
|
|
"grad_norm": 1.134116834066691,
|
||
|
|
"learning_rate": 1.993185084463106e-06,
|
||
|
|
"loss": 0.5478,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6789533560864619,
|
||
|
|
"grad_norm": 1.2621524843864569,
|
||
|
|
"learning_rate": 1.97278709770144e-06,
|
||
|
|
"loss": 0.4521,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.680773606370876,
|
||
|
|
"grad_norm": 1.2445963074217277,
|
||
|
|
"learning_rate": 1.952459803641597e-06,
|
||
|
|
"loss": 0.5048,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6825938566552902,
|
||
|
|
"grad_norm": 1.3367185945909759,
|
||
|
|
"learning_rate": 1.9322039111342977e-06,
|
||
|
|
"loss": 0.4859,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6844141069397042,
|
||
|
|
"grad_norm": 1.502198228897516,
|
||
|
|
"learning_rate": 1.912020126540366e-06,
|
||
|
|
"loss": 0.5483,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6862343572241183,
|
||
|
|
"grad_norm": 1.5682296957615942,
|
||
|
|
"learning_rate": 1.8919091537060847e-06,
|
||
|
|
"loss": 0.5403,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6880546075085324,
|
||
|
|
"grad_norm": 1.2186683041461865,
|
||
|
|
"learning_rate": 1.8718716939386541e-06,
|
||
|
|
"loss": 0.4953,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6898748577929465,
|
||
|
|
"grad_norm": 1.2104649746142353,
|
||
|
|
"learning_rate": 1.8519084459817362e-06,
|
||
|
|
"loss": 0.4599,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6916951080773607,
|
||
|
|
"grad_norm": 1.1390426306451955,
|
||
|
|
"learning_rate": 1.83202010599109e-06,
|
||
|
|
"loss": 0.4164,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6935153583617747,
|
||
|
|
"grad_norm": 1.2956325376708957,
|
||
|
|
"learning_rate": 1.8122073675102932e-06,
|
||
|
|
"loss": 0.5417,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6953356086461888,
|
||
|
|
"grad_norm": 1.1586136644085798,
|
||
|
|
"learning_rate": 1.792470921446557e-06,
|
||
|
|
"loss": 0.4365,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.697155858930603,
|
||
|
|
"grad_norm": 1.1975210529143365,
|
||
|
|
"learning_rate": 1.7728114560466324e-06,
|
||
|
|
"loss": 0.4956,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6989761092150171,
|
||
|
|
"grad_norm": 1.5675227569116297,
|
||
|
|
"learning_rate": 1.753229656872815e-06,
|
||
|
|
"loss": 0.4646,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7007963594994312,
|
||
|
|
"grad_norm": 1.1981622083221466,
|
||
|
|
"learning_rate": 1.7337262067790319e-06,
|
||
|
|
"loss": 0.5042,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7026166097838453,
|
||
|
|
"grad_norm": 1.291822326824022,
|
||
|
|
"learning_rate": 1.7143017858870259e-06,
|
||
|
|
"loss": 0.5786,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7044368600682593,
|
||
|
|
"grad_norm": 1.3381873610330526,
|
||
|
|
"learning_rate": 1.6949570715626532e-06,
|
||
|
|
"loss": 0.3987,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7062571103526735,
|
||
|
|
"grad_norm": 1.5233756050791378,
|
||
|
|
"learning_rate": 1.675692738392247e-06,
|
||
|
|
"loss": 0.5373,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7080773606370876,
|
||
|
|
"grad_norm": 1.405379762218711,
|
||
|
|
"learning_rate": 1.6565094581591015e-06,
|
||
|
|
"loss": 0.5151,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7098976109215017,
|
||
|
|
"grad_norm": 1.3827588130238773,
|
||
|
|
"learning_rate": 1.6374078998200424e-06,
|
||
|
|
"loss": 0.4868,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7117178612059158,
|
||
|
|
"grad_norm": 1.3281467896725871,
|
||
|
|
"learning_rate": 1.6183887294820995e-06,
|
||
|
|
"loss": 0.4892,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.71353811149033,
|
||
|
|
"grad_norm": 1.5562464103926885,
|
||
|
|
"learning_rate": 1.5994526103792852e-06,
|
||
|
|
"loss": 0.5977,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.715358361774744,
|
||
|
|
"grad_norm": 1.2645130650718202,
|
||
|
|
"learning_rate": 1.5806002028494509e-06,
|
||
|
|
"loss": 0.4245,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7171786120591581,
|
||
|
|
"grad_norm": 1.3281593922925885,
|
||
|
|
"learning_rate": 1.5618321643112738e-06,
|
||
|
|
"loss": 0.5813,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7189988623435722,
|
||
|
|
"grad_norm": 1.1215366227811656,
|
||
|
|
"learning_rate": 1.5431491492413286e-06,
|
||
|
|
"loss": 0.4276,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7208191126279864,
|
||
|
|
"grad_norm": 1.3212838118308114,
|
||
|
|
"learning_rate": 1.52455180915126e-06,
|
||
|
|
"loss": 0.5774,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7226393629124005,
|
||
|
|
"grad_norm": 1.2852914600481689,
|
||
|
|
"learning_rate": 1.506040792565066e-06,
|
||
|
|
"loss": 0.5057,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7244596131968145,
|
||
|
|
"grad_norm": 1.280275275618163,
|
||
|
|
"learning_rate": 1.487616744996484e-06,
|
||
|
|
"loss": 0.444,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7262798634812286,
|
||
|
|
"grad_norm": 1.1583238977099228,
|
||
|
|
"learning_rate": 1.4692803089264772e-06,
|
||
|
|
"loss": 0.5377,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7281001137656428,
|
||
|
|
"grad_norm": 1.435157708312753,
|
||
|
|
"learning_rate": 1.4510321237808377e-06,
|
||
|
|
"loss": 0.5444,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7299203640500569,
|
||
|
|
"grad_norm": 1.3208185752900872,
|
||
|
|
"learning_rate": 1.4328728259078746e-06,
|
||
|
|
"loss": 0.5566,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.731740614334471,
|
||
|
|
"grad_norm": 1.2130339190915678,
|
||
|
|
"learning_rate": 1.414803048556236e-06,
|
||
|
|
"loss": 0.4988,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7335608646188851,
|
||
|
|
"grad_norm": 1.1363530661008532,
|
||
|
|
"learning_rate": 1.396823421852825e-06,
|
||
|
|
"loss": 0.6129,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7353811149032992,
|
||
|
|
"grad_norm": 1.3222588910481998,
|
||
|
|
"learning_rate": 1.3789345727808207e-06,
|
||
|
|
"loss": 0.546,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7372013651877133,
|
||
|
|
"grad_norm": 1.3949194783709729,
|
||
|
|
"learning_rate": 1.3611371251578114e-06,
|
||
|
|
"loss": 0.5583,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7390216154721274,
|
||
|
|
"grad_norm": 1.2917335175784925,
|
||
|
|
"learning_rate": 1.3434316996140553e-06,
|
||
|
|
"loss": 0.5151,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7408418657565415,
|
||
|
|
"grad_norm": 1.2895735708732046,
|
||
|
|
"learning_rate": 1.3258189135708229e-06,
|
||
|
|
"loss": 0.5098,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7426621160409557,
|
||
|
|
"grad_norm": 1.2978294874532978,
|
||
|
|
"learning_rate": 1.3082993812188735e-06,
|
||
|
|
"loss": 0.5414,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7444823663253698,
|
||
|
|
"grad_norm": 1.2095221030821062,
|
||
|
|
"learning_rate": 1.2908737134970364e-06,
|
||
|
|
"loss": 0.5268,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7463026166097838,
|
||
|
|
"grad_norm": 1.3840563503977592,
|
||
|
|
"learning_rate": 1.2735425180709039e-06,
|
||
|
|
"loss": 0.479,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7481228668941979,
|
||
|
|
"grad_norm": 1.2789076883026242,
|
||
|
|
"learning_rate": 1.2563063993116482e-06,
|
||
|
|
"loss": 0.5503,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7499431171786121,
|
||
|
|
"grad_norm": 1.283587802005637,
|
||
|
|
"learning_rate": 1.239165958274933e-06,
|
||
|
|
"loss": 0.4113,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7517633674630262,
|
||
|
|
"grad_norm": 1.2909165266250262,
|
||
|
|
"learning_rate": 1.2221217926799652e-06,
|
||
|
|
"loss": 0.535,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7535836177474403,
|
||
|
|
"grad_norm": 1.3531455484884616,
|
||
|
|
"learning_rate": 1.2051744968886489e-06,
|
||
|
|
"loss": 0.5052,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7554038680318543,
|
||
|
|
"grad_norm": 1.2730404093480168,
|
||
|
|
"learning_rate": 1.1883246618848533e-06,
|
||
|
|
"loss": 0.4566,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7572241183162685,
|
||
|
|
"grad_norm": 1.37616764437592,
|
||
|
|
"learning_rate": 1.1715728752538101e-06,
|
||
|
|
"loss": 0.566,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7590443686006826,
|
||
|
|
"grad_norm": 1.1512441975212944,
|
||
|
|
"learning_rate": 1.1549197211616203e-06,
|
||
|
|
"loss": 0.5044,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7608646188850967,
|
||
|
|
"grad_norm": 1.2438970988598956,
|
||
|
|
"learning_rate": 1.1383657803348835e-06,
|
||
|
|
"loss": 0.5109,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7626848691695108,
|
||
|
|
"grad_norm": 1.5233735431446764,
|
||
|
|
"learning_rate": 1.1219116300404486e-06,
|
||
|
|
"loss": 0.507,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.764505119453925,
|
||
|
|
"grad_norm": 1.3253161212074762,
|
||
|
|
"learning_rate": 1.10555784406528e-06,
|
||
|
|
"loss": 0.5082,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.766325369738339,
|
||
|
|
"grad_norm": 1.1775521474516462,
|
||
|
|
"learning_rate": 1.089304992696455e-06,
|
||
|
|
"loss": 0.46,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7681456200227531,
|
||
|
|
"grad_norm": 1.2462962157301152,
|
||
|
|
"learning_rate": 1.0731536427012695e-06,
|
||
|
|
"loss": 0.5253,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7699658703071672,
|
||
|
|
"grad_norm": 1.3347631673740097,
|
||
|
|
"learning_rate": 1.0571043573074736e-06,
|
||
|
|
"loss": 0.4449,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7717861205915814,
|
||
|
|
"grad_norm": 1.292727758187721,
|
||
|
|
"learning_rate": 1.041157696183641e-06,
|
||
|
|
"loss": 0.441,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7736063708759955,
|
||
|
|
"grad_norm": 1.293278742294603,
|
||
|
|
"learning_rate": 1.0253142154196415e-06,
|
||
|
|
"loss": 0.4867,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7754266211604095,
|
||
|
|
"grad_norm": 1.2102494852297525,
|
||
|
|
"learning_rate": 1.0095744675072525e-06,
|
||
|
|
"loss": 0.4898,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7772468714448236,
|
||
|
|
"grad_norm": 1.224313028246693,
|
||
|
|
"learning_rate": 9.93939001320895e-07,
|
||
|
|
"loss": 0.4686,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7790671217292378,
|
||
|
|
"grad_norm": 1.3632517015375165,
|
||
|
|
"learning_rate": 9.784083620984884e-07,
|
||
|
|
"loss": 0.4639,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7808873720136519,
|
||
|
|
"grad_norm": 1.3987002856426751,
|
||
|
|
"learning_rate": 9.62983091422446e-07,
|
||
|
|
"loss": 0.4528,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.782707622298066,
|
||
|
|
"grad_norm": 1.329331750067852,
|
||
|
|
"learning_rate": 9.476637272007746e-07,
|
||
|
|
"loss": 0.4562,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7845278725824801,
|
||
|
|
"grad_norm": 1.4216744583623766,
|
||
|
|
"learning_rate": 9.324508036483303e-07,
|
||
|
|
"loss": 0.4622,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7863481228668942,
|
||
|
|
"grad_norm": 1.3060911776176307,
|
||
|
|
"learning_rate": 9.173448512681848e-07,
|
||
|
|
"loss": 0.5405,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7881683731513083,
|
||
|
|
"grad_norm": 1.3971532684012182,
|
||
|
|
"learning_rate": 9.023463968331238e-07,
|
||
|
|
"loss": 0.4642,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7899886234357224,
|
||
|
|
"grad_norm": 1.351332971443725,
|
||
|
|
"learning_rate": 8.874559633672754e-07,
|
||
|
|
"loss": 0.4146,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7918088737201365,
|
||
|
|
"grad_norm": 1.2506853747891504,
|
||
|
|
"learning_rate": 8.726740701278808e-07,
|
||
|
|
"loss": 0.5233,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7936291240045507,
|
||
|
|
"grad_norm": 1.2588296359051319,
|
||
|
|
"learning_rate": 8.580012325871773e-07,
|
||
|
|
"loss": 0.5196,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7954493742889648,
|
||
|
|
"grad_norm": 1.3656683873360818,
|
||
|
|
"learning_rate": 8.434379624144261e-07,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7972696245733788,
|
||
|
|
"grad_norm": 1.3415371986074633,
|
||
|
|
"learning_rate": 8.289847674580702e-07,
|
||
|
|
"loss": 0.5025,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7990898748577929,
|
||
|
|
"grad_norm": 1.210310044679145,
|
||
|
|
"learning_rate": 8.146421517280226e-07,
|
||
|
|
"loss": 0.4922,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.800910125142207,
|
||
|
|
"grad_norm": 1.675036054936253,
|
||
|
|
"learning_rate": 8.004106153780967e-07,
|
||
|
|
"loss": 0.4396,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8027303754266212,
|
||
|
|
"grad_norm": 1.1849449434556916,
|
||
|
|
"learning_rate": 7.862906546885559e-07,
|
||
|
|
"loss": 0.5348,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8045506257110353,
|
||
|
|
"grad_norm": 1.3294402423567042,
|
||
|
|
"learning_rate": 7.722827620488108e-07,
|
||
|
|
"loss": 0.4472,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8063708759954493,
|
||
|
|
"grad_norm": 1.2341888202472633,
|
||
|
|
"learning_rate": 7.583874259402545e-07,
|
||
|
|
"loss": 0.4926,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8081911262798634,
|
||
|
|
"grad_norm": 1.3727750069417188,
|
||
|
|
"learning_rate": 7.446051309192203e-07,
|
||
|
|
"loss": 0.5187,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8100113765642776,
|
||
|
|
"grad_norm": 1.1665673148184286,
|
||
|
|
"learning_rate": 7.30936357600088e-07,
|
||
|
|
"loss": 0.4459,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8118316268486917,
|
||
|
|
"grad_norm": 1.4461908262228584,
|
||
|
|
"learning_rate": 7.173815826385246e-07,
|
||
|
|
"loss": 0.5931,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8136518771331058,
|
||
|
|
"grad_norm": 1.2164762112018974,
|
||
|
|
"learning_rate": 7.039412787148586e-07,
|
||
|
|
"loss": 0.5769,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.81547212741752,
|
||
|
|
"grad_norm": 1.3268169931538385,
|
||
|
|
"learning_rate": 6.906159145176049e-07,
|
||
|
|
"loss": 0.4962,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.817292377701934,
|
||
|
|
"grad_norm": 1.3585533527783662,
|
||
|
|
"learning_rate": 6.774059547271087e-07,
|
||
|
|
"loss": 0.5011,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8191126279863481,
|
||
|
|
"grad_norm": 1.2715237655057547,
|
||
|
|
"learning_rate": 6.643118599993518e-07,
|
||
|
|
"loss": 0.4591,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8209328782707622,
|
||
|
|
"grad_norm": 1.1129340141314334,
|
||
|
|
"learning_rate": 6.513340869498858e-07,
|
||
|
|
"loss": 0.4818,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8227531285551763,
|
||
|
|
"grad_norm": 1.1993408265317198,
|
||
|
|
"learning_rate": 6.384730881379048e-07,
|
||
|
|
"loss": 0.4826,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8245733788395905,
|
||
|
|
"grad_norm": 1.305009025174831,
|
||
|
|
"learning_rate": 6.257293120504692e-07,
|
||
|
|
"loss": 0.4824,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8263936291240046,
|
||
|
|
"grad_norm": 1.2620160974509798,
|
||
|
|
"learning_rate": 6.131032030868635e-07,
|
||
|
|
"loss": 0.4479,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8282138794084186,
|
||
|
|
"grad_norm": 1.2693469945741236,
|
||
|
|
"learning_rate": 6.005952015430993e-07,
|
||
|
|
"loss": 0.5286,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8300341296928327,
|
||
|
|
"grad_norm": 1.2953927032105943,
|
||
|
|
"learning_rate": 5.882057435965619e-07,
|
||
|
|
"loss": 0.5802,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8318543799772469,
|
||
|
|
"grad_norm": 1.3055790274997285,
|
||
|
|
"learning_rate": 5.759352612907999e-07,
|
||
|
|
"loss": 0.5273,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.833674630261661,
|
||
|
|
"grad_norm": 1.3009913306704852,
|
||
|
|
"learning_rate": 5.637841825204588e-07,
|
||
|
|
"loss": 0.4434,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8354948805460751,
|
||
|
|
"grad_norm": 1.3010149850935786,
|
||
|
|
"learning_rate": 5.517529310163627e-07,
|
||
|
|
"loss": 0.5302,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8373151308304891,
|
||
|
|
"grad_norm": 1.1588504398899486,
|
||
|
|
"learning_rate": 5.398419263307281e-07,
|
||
|
|
"loss": 0.4898,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8391353811149033,
|
||
|
|
"grad_norm": 1.427556447905731,
|
||
|
|
"learning_rate": 5.280515838225477e-07,
|
||
|
|
"loss": 0.4583,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8409556313993174,
|
||
|
|
"grad_norm": 1.3382828189315212,
|
||
|
|
"learning_rate": 5.163823146430944e-07,
|
||
|
|
"loss": 0.4544,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8427758816837315,
|
||
|
|
"grad_norm": 1.3048820751365628,
|
||
|
|
"learning_rate": 5.048345257215892e-07,
|
||
|
|
"loss": 0.5348,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8445961319681456,
|
||
|
|
"grad_norm": 1.3464339683482869,
|
||
|
|
"learning_rate": 4.934086197510088e-07,
|
||
|
|
"loss": 0.4866,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8464163822525598,
|
||
|
|
"grad_norm": 1.3076973707605393,
|
||
|
|
"learning_rate": 4.821049951740441e-07,
|
||
|
|
"loss": 0.4374,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8482366325369738,
|
||
|
|
"grad_norm": 1.207783472984328,
|
||
|
|
"learning_rate": 4.7092404616920547e-07,
|
||
|
|
"loss": 0.5268,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8500568828213879,
|
||
|
|
"grad_norm": 1.3340034898150066,
|
||
|
|
"learning_rate": 4.59866162637077e-07,
|
||
|
|
"loss": 0.5163,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.851877133105802,
|
||
|
|
"grad_norm": 1.2793323359204207,
|
||
|
|
"learning_rate": 4.4893173018671816e-07,
|
||
|
|
"loss": 0.464,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8536973833902162,
|
||
|
|
"grad_norm": 1.3875887367624027,
|
||
|
|
"learning_rate": 4.3812113012222164e-07,
|
||
|
|
"loss": 0.5605,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8555176336746303,
|
||
|
|
"grad_norm": 1.2752397131609516,
|
||
|
|
"learning_rate": 4.2743473942941177e-07,
|
||
|
|
"loss": 0.5166,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8573378839590444,
|
||
|
|
"grad_norm": 1.361832548391048,
|
||
|
|
"learning_rate": 4.168729307626977e-07,
|
||
|
|
"loss": 0.4494,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8591581342434584,
|
||
|
|
"grad_norm": 1.3313280628055624,
|
||
|
|
"learning_rate": 4.0643607243208455e-07,
|
||
|
|
"loss": 0.4531,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8609783845278726,
|
||
|
|
"grad_norm": 1.238927541446331,
|
||
|
|
"learning_rate": 3.9612452839032384e-07,
|
||
|
|
"loss": 0.4629,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8627986348122867,
|
||
|
|
"grad_norm": 1.440299941933543,
|
||
|
|
"learning_rate": 3.859386582202231e-07,
|
||
|
|
"loss": 0.5238,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8646188850967008,
|
||
|
|
"grad_norm": 1.2998009060977955,
|
||
|
|
"learning_rate": 3.758788171221079e-07,
|
||
|
|
"loss": 0.4126,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8664391353811149,
|
||
|
|
"grad_norm": 1.2344313543035759,
|
||
|
|
"learning_rate": 3.659453559014345e-07,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.868259385665529,
|
||
|
|
"grad_norm": 1.3123945291493502,
|
||
|
|
"learning_rate": 3.561386209565582e-07,
|
||
|
|
"loss": 0.4354,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8700796359499431,
|
||
|
|
"grad_norm": 1.3385863981096489,
|
||
|
|
"learning_rate": 3.464589542666485e-07,
|
||
|
|
"loss": 0.5423,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8718998862343572,
|
||
|
|
"grad_norm": 1.4693361278099728,
|
||
|
|
"learning_rate": 3.3690669337976996e-07,
|
||
|
|
"loss": 0.5439,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8737201365187713,
|
||
|
|
"grad_norm": 1.219115488818529,
|
||
|
|
"learning_rate": 3.2748217140111e-07,
|
||
|
|
"loss": 0.55,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8755403868031855,
|
||
|
|
"grad_norm": 1.3943875753971013,
|
||
|
|
"learning_rate": 3.1818571698135976e-07,
|
||
|
|
"loss": 0.479,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8773606370875996,
|
||
|
|
"grad_norm": 1.31283354874802,
|
||
|
|
"learning_rate": 3.0901765430525337e-07,
|
||
|
|
"loss": 0.4546,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8791808873720136,
|
||
|
|
"grad_norm": 1.2760697675194013,
|
||
|
|
"learning_rate": 2.9997830308027003e-07,
|
||
|
|
"loss": 0.5241,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8810011376564277,
|
||
|
|
"grad_norm": 1.3057898765814404,
|
||
|
|
"learning_rate": 2.9106797852547483e-07,
|
||
|
|
"loss": 0.5045,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8828213879408419,
|
||
|
|
"grad_norm": 1.228721100779524,
|
||
|
|
"learning_rate": 2.8228699136053726e-07,
|
||
|
|
"loss": 0.4588,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.884641638225256,
|
||
|
|
"grad_norm": 1.4327825477254865,
|
||
|
|
"learning_rate": 2.7363564779488446e-07,
|
||
|
|
"loss": 0.4911,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8864618885096701,
|
||
|
|
"grad_norm": 1.1675697744027835,
|
||
|
|
"learning_rate": 2.6511424951703244e-07,
|
||
|
|
"loss": 0.4503,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8882821387940842,
|
||
|
|
"grad_norm": 1.3573793521283821,
|
||
|
|
"learning_rate": 2.567230936840632e-07,
|
||
|
|
"loss": 0.5537,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8901023890784983,
|
||
|
|
"grad_norm": 1.2385857779190943,
|
||
|
|
"learning_rate": 2.4846247291125897e-07,
|
||
|
|
"loss": 0.5261,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8919226393629124,
|
||
|
|
"grad_norm": 1.3747886513978498,
|
||
|
|
"learning_rate": 2.4033267526190057e-07,
|
||
|
|
"loss": 0.5116,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8937428896473265,
|
||
|
|
"grad_norm": 1.3015002806547666,
|
||
|
|
"learning_rate": 2.323339842372234e-07,
|
||
|
|
"loss": 0.501,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8955631399317406,
|
||
|
|
"grad_norm": 1.2282471393147485,
|
||
|
|
"learning_rate": 2.2446667876652968e-07,
|
||
|
|
"loss": 0.5615,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8973833902161548,
|
||
|
|
"grad_norm": 1.2246787300329813,
|
||
|
|
"learning_rate": 2.1673103319746146e-07,
|
||
|
|
"loss": 0.5847,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8992036405005688,
|
||
|
|
"grad_norm": 1.381507003520726,
|
||
|
|
"learning_rate": 2.0912731728643362e-07,
|
||
|
|
"loss": 0.4593,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9010238907849829,
|
||
|
|
"grad_norm": 1.2236872730147548,
|
||
|
|
"learning_rate": 2.0165579618922757e-07,
|
||
|
|
"loss": 0.426,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.902844141069397,
|
||
|
|
"grad_norm": 1.5642555968533283,
|
||
|
|
"learning_rate": 1.943167304517459e-07,
|
||
|
|
"loss": 0.4669,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9046643913538112,
|
||
|
|
"grad_norm": 1.4113035349877263,
|
||
|
|
"learning_rate": 1.871103760009234e-07,
|
||
|
|
"loss": 0.5189,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9064846416382253,
|
||
|
|
"grad_norm": 1.2945664446971985,
|
||
|
|
"learning_rate": 1.8003698413580427e-07,
|
||
|
|
"loss": 0.5331,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9083048919226394,
|
||
|
|
"grad_norm": 1.1216172834522593,
|
||
|
|
"learning_rate": 1.7309680151878126e-07,
|
||
|
|
"loss": 0.4596,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9101251422070534,
|
||
|
|
"grad_norm": 1.4490178400997769,
|
||
|
|
"learning_rate": 1.6629007016698916e-07,
|
||
|
|
"loss": 0.5719,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9119453924914676,
|
||
|
|
"grad_norm": 1.4927164965040023,
|
||
|
|
"learning_rate": 1.5961702744386973e-07,
|
||
|
|
"loss": 0.4637,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9137656427758817,
|
||
|
|
"grad_norm": 1.2926779903672145,
|
||
|
|
"learning_rate": 1.5307790605089045e-07,
|
||
|
|
"loss": 0.4931,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9155858930602958,
|
||
|
|
"grad_norm": 1.3434407972538571,
|
||
|
|
"learning_rate": 1.4667293401943393e-07,
|
||
|
|
"loss": 0.4843,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9174061433447099,
|
||
|
|
"grad_norm": 1.2627460036138376,
|
||
|
|
"learning_rate": 1.404023347028418e-07,
|
||
|
|
"loss": 0.4628,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.919226393629124,
|
||
|
|
"grad_norm": 1.1980109325087624,
|
||
|
|
"learning_rate": 1.342663267686297e-07,
|
||
|
|
"loss": 0.547,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9210466439135381,
|
||
|
|
"grad_norm": 1.4394748326258473,
|
||
|
|
"learning_rate": 1.2826512419085922e-07,
|
||
|
|
"loss": 0.4852,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9228668941979522,
|
||
|
|
"grad_norm": 1.1712799414971835,
|
||
|
|
"learning_rate": 1.223989362426785e-07,
|
||
|
|
"loss": 0.5027,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9246871444823663,
|
||
|
|
"grad_norm": 1.2917639503148088,
|
||
|
|
"learning_rate": 1.1666796748902142e-07,
|
||
|
|
"loss": 0.4318,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9265073947667805,
|
||
|
|
"grad_norm": 1.407559329871179,
|
||
|
|
"learning_rate": 1.1107241777947774e-07,
|
||
|
|
"loss": 0.452,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9283276450511946,
|
||
|
|
"grad_norm": 1.4395176866301798,
|
||
|
|
"learning_rate": 1.0561248224132091e-07,
|
||
|
|
"loss": 0.5792,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9301478953356086,
|
||
|
|
"grad_norm": 1.3107228117658043,
|
||
|
|
"learning_rate": 1.0028835127270552e-07,
|
||
|
|
"loss": 0.523,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9319681456200227,
|
||
|
|
"grad_norm": 1.319280624009732,
|
||
|
|
"learning_rate": 9.510021053602679e-08,
|
||
|
|
"loss": 0.4903,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9337883959044369,
|
||
|
|
"grad_norm": 1.2825750147020196,
|
||
|
|
"learning_rate": 9.004824095144581e-08,
|
||
|
|
"loss": 0.486,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.935608646188851,
|
||
|
|
"grad_norm": 1.3550036994824897,
|
||
|
|
"learning_rate": 8.513261869058209e-08,
|
||
|
|
"loss": 0.4342,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9374288964732651,
|
||
|
|
"grad_norm": 1.2912511428181583,
|
||
|
|
"learning_rate": 8.035351517036914e-08,
|
||
|
|
"loss": 0.4975,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9392491467576792,
|
||
|
|
"grad_norm": 1.2630516224119532,
|
||
|
|
"learning_rate": 7.571109704707623e-08,
|
||
|
|
"loss": 0.4942,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9410693970420932,
|
||
|
|
"grad_norm": 1.2630983628627157,
|
||
|
|
"learning_rate": 7.120552621049825e-08,
|
||
|
|
"loss": 0.4581,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9428896473265074,
|
||
|
|
"grad_norm": 1.184276479260659,
|
||
|
|
"learning_rate": 6.68369597783096e-08,
|
||
|
|
"loss": 0.4245,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9447098976109215,
|
||
|
|
"grad_norm": 1.3479750123046965,
|
||
|
|
"learning_rate": 6.260555009058288e-08,
|
||
|
|
"loss": 0.4734,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9465301478953356,
|
||
|
|
"grad_norm": 1.184265059530281,
|
||
|
|
"learning_rate": 5.851144470448144e-08,
|
||
|
|
"loss": 0.5263,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9483503981797498,
|
||
|
|
"grad_norm": 1.3131542129196199,
|
||
|
|
"learning_rate": 5.455478638911071e-08,
|
||
|
|
"loss": 0.369,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9501706484641638,
|
||
|
|
"grad_norm": 1.3396828056059393,
|
||
|
|
"learning_rate": 5.073571312053815e-08,
|
||
|
|
"loss": 0.5098,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9519908987485779,
|
||
|
|
"grad_norm": 1.3620509437765531,
|
||
|
|
"learning_rate": 4.705435807698555e-08,
|
||
|
|
"loss": 0.5595,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.953811149032992,
|
||
|
|
"grad_norm": 1.3476395824069989,
|
||
|
|
"learning_rate": 4.351084963418117e-08,
|
||
|
|
"loss": 0.5332,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9556313993174061,
|
||
|
|
"grad_norm": 1.4056028428746756,
|
||
|
|
"learning_rate": 4.010531136088691e-08,
|
||
|
|
"loss": 0.5135,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9574516496018203,
|
||
|
|
"grad_norm": 1.2931973314368226,
|
||
|
|
"learning_rate": 3.683786201458439e-08,
|
||
|
|
"loss": 0.4869,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9592718998862344,
|
||
|
|
"grad_norm": 1.3728587745363008,
|
||
|
|
"learning_rate": 3.370861553733784e-08,
|
||
|
|
"loss": 0.544,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9610921501706484,
|
||
|
|
"grad_norm": 1.379130929011516,
|
||
|
|
"learning_rate": 3.071768105181993e-08,
|
||
|
|
"loss": 0.4312,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9629124004550625,
|
||
|
|
"grad_norm": 1.4162454724368647,
|
||
|
|
"learning_rate": 2.786516285750373e-08,
|
||
|
|
"loss": 0.4464,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9647326507394767,
|
||
|
|
"grad_norm": 1.310107669303508,
|
||
|
|
"learning_rate": 2.5151160427029582e-08,
|
||
|
|
"loss": 0.4641,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9665529010238908,
|
||
|
|
"grad_norm": 1.3049449814100964,
|
||
|
|
"learning_rate": 2.2575768402733232e-08,
|
||
|
|
"loss": 0.5079,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9683731513083049,
|
||
|
|
"grad_norm": 1.301610299072927,
|
||
|
|
"learning_rate": 2.013907659334624e-08,
|
||
|
|
"loss": 0.4798,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.970193401592719,
|
||
|
|
"grad_norm": 1.2912511438851022,
|
||
|
|
"learning_rate": 1.7841169970866042e-08,
|
||
|
|
"loss": 0.4962,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9720136518771331,
|
||
|
|
"grad_norm": 1.3741948558886383,
|
||
|
|
"learning_rate": 1.5682128667589e-08,
|
||
|
|
"loss": 0.4556,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9738339021615472,
|
||
|
|
"grad_norm": 1.4394930710163565,
|
||
|
|
"learning_rate": 1.3662027973320612e-08,
|
||
|
|
"loss": 0.4808,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9756541524459613,
|
||
|
|
"grad_norm": 1.3189189784853037,
|
||
|
|
"learning_rate": 1.1780938332746515e-08,
|
||
|
|
"loss": 0.4601,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9774744027303754,
|
||
|
|
"grad_norm": 1.3675135301050803,
|
||
|
|
"learning_rate": 1.0038925342977122e-08,
|
||
|
|
"loss": 0.4862,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9792946530147896,
|
||
|
|
"grad_norm": 1.2496142970199702,
|
||
|
|
"learning_rate": 8.43604975126011e-09,
|
||
|
|
"loss": 0.4972,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9811149032992036,
|
||
|
|
"grad_norm": 1.2590220357743287,
|
||
|
|
"learning_rate": 6.972367452863004e-09,
|
||
|
|
"loss": 0.5048,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9829351535836177,
|
||
|
|
"grad_norm": 1.4321645089041766,
|
||
|
|
"learning_rate": 5.647929489122738e-09,
|
||
|
|
"loss": 0.5688,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9847554038680318,
|
||
|
|
"grad_norm": 1.3266758203446563,
|
||
|
|
"learning_rate": 4.462782045664859e-09,
|
||
|
|
"loss": 0.4745,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.986575654152446,
|
||
|
|
"grad_norm": 1.1860109568892805,
|
||
|
|
"learning_rate": 3.4169664507959216e-09,
|
||
|
|
"loss": 0.4616,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9883959044368601,
|
||
|
|
"grad_norm": 1.2475217679660848,
|
||
|
|
"learning_rate": 2.5105191740597553e-09,
|
||
|
|
"loss": 0.6489,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9902161547212742,
|
||
|
|
"grad_norm": 1.4270596684886099,
|
||
|
|
"learning_rate": 1.7434718249664803e-09,
|
||
|
|
"loss": 0.4712,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9920364050056882,
|
||
|
|
"grad_norm": 1.2999695109285117,
|
||
|
|
"learning_rate": 1.1158511518902791e-09,
|
||
|
|
"loss": 0.5143,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9938566552901024,
|
||
|
|
"grad_norm": 1.5041014788909566,
|
||
|
|
"learning_rate": 6.276790411372524e-10,
|
||
|
|
"loss": 0.4971,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9956769055745165,
|
||
|
|
"grad_norm": 1.151430673100721,
|
||
|
|
"learning_rate": 2.789725161806977e-10,
|
||
|
|
"loss": 0.5446,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9974971558589306,
|
||
|
|
"grad_norm": 1.2513093960410882,
|
||
|
|
"learning_rate": 6.974373706869486e-11,
|
||
|
|
"loss": 0.5494,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9993174061433447,
|
||
|
|
"grad_norm": 1.3143110421818924,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 0.4345,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9993174061433447,
|
||
|
|
"eval_accuracy": 0.8141491715694426,
|
||
|
|
"eval_accuracy_first_token": 0.7775706741223982,
|
||
|
|
"eval_accuracy_first_token_<": 0.9648293963254593,
|
||
|
|
"eval_accuracy_first_token_<_total": 1905,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>": 0.9014084507042254,
|
||
|
|
"eval_accuracy_first_token_<|python_tag|>_total": 994,
|
||
|
|
"eval_accuracy_first_token_Certainly": 0.743801652892562,
|
||
|
|
"eval_accuracy_first_token_Certainly_total": 363,
|
||
|
|
"eval_accuracy_first_token_The": 0.9030402629416598,
|
||
|
|
"eval_accuracy_first_token_The_total": 2434,
|
||
|
|
"eval_accuracy_first_token_To": 0.8076923076923077,
|
||
|
|
"eval_accuracy_first_token_To_total": 936,
|
||
|
|
"eval_loss": 0.5610479116439819,
|
||
|
|
"eval_perplexity": 1.110590475782418,
|
||
|
|
"eval_runtime": 507.5321,
|
||
|
|
"eval_samples_per_second": 1.373,
|
||
|
|
"eval_steps_per_second": 0.688,
|
||
|
|
"eval_total_number_first_token": 9657,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9993174061433447,
|
||
|
|
"step": 549,
|
||
|
|
"total_flos": 229846517022720.0,
|
||
|
|
"train_loss": 0.5347839987994544,
|
||
|
|
"train_runtime": 35925.1648,
|
||
|
|
"train_samples_per_second": 0.245,
|
||
|
|
"train_steps_per_second": 0.015
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1.0,
|
||
|
|
"max_steps": 549,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 5.0,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": false,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 229846517022720.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|