5067 lines
124 KiB
JSON
5067 lines
124 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.7502206531332745,
|
|
"eval_steps": 100,
|
|
"global_step": 3400,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0011032656663724624,
|
|
"grad_norm": 5.878592491149902,
|
|
"learning_rate": 8.810572687224672e-08,
|
|
"loss": 0.4689,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.002206531332744925,
|
|
"grad_norm": 5.2574687004089355,
|
|
"learning_rate": 1.982378854625551e-07,
|
|
"loss": 0.4759,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0033097969991173876,
|
|
"grad_norm": 5.635329723358154,
|
|
"learning_rate": 3.083700440528635e-07,
|
|
"loss": 0.466,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.00441306266548985,
|
|
"grad_norm": 5.272955417633057,
|
|
"learning_rate": 4.1850220264317185e-07,
|
|
"loss": 0.4468,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.005516328331862313,
|
|
"grad_norm": 5.219903945922852,
|
|
"learning_rate": 5.286343612334802e-07,
|
|
"loss": 0.4513,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.006619593998234775,
|
|
"grad_norm": 4.160714626312256,
|
|
"learning_rate": 6.387665198237886e-07,
|
|
"loss": 0.4372,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.007722859664607238,
|
|
"grad_norm": 3.6638221740722656,
|
|
"learning_rate": 7.48898678414097e-07,
|
|
"loss": 0.4449,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.0088261253309797,
|
|
"grad_norm": 3.3206021785736084,
|
|
"learning_rate": 8.590308370044054e-07,
|
|
"loss": 0.4125,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.009929390997352162,
|
|
"grad_norm": 2.711574077606201,
|
|
"learning_rate": 9.691629955947138e-07,
|
|
"loss": 0.4156,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.011032656663724626,
|
|
"grad_norm": 1.6887717247009277,
|
|
"learning_rate": 1.0792951541850223e-06,
|
|
"loss": 0.4075,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.012135922330097087,
|
|
"grad_norm": 1.177046775817871,
|
|
"learning_rate": 1.1894273127753305e-06,
|
|
"loss": 0.3647,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.01323918799646955,
|
|
"grad_norm": 0.8924531936645508,
|
|
"learning_rate": 1.299559471365639e-06,
|
|
"loss": 0.3854,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.014342453662842012,
|
|
"grad_norm": 1.1198736429214478,
|
|
"learning_rate": 1.4096916299559475e-06,
|
|
"loss": 0.3644,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.015445719329214475,
|
|
"grad_norm": 0.6698480248451233,
|
|
"learning_rate": 1.5198237885462555e-06,
|
|
"loss": 0.344,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.01654898499558694,
|
|
"grad_norm": 0.6314343214035034,
|
|
"learning_rate": 1.629955947136564e-06,
|
|
"loss": 0.3258,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.0176522506619594,
|
|
"grad_norm": 0.5537658929824829,
|
|
"learning_rate": 1.7400881057268722e-06,
|
|
"loss": 0.3236,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.01875551632833186,
|
|
"grad_norm": 0.6194472312927246,
|
|
"learning_rate": 1.8502202643171807e-06,
|
|
"loss": 0.3219,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.019858781994704325,
|
|
"grad_norm": 0.4850139915943146,
|
|
"learning_rate": 1.960352422907489e-06,
|
|
"loss": 0.3041,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.020962047661076788,
|
|
"grad_norm": 0.5193836092948914,
|
|
"learning_rate": 2.0704845814977977e-06,
|
|
"loss": 0.3198,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.02206531332744925,
|
|
"grad_norm": 0.49118679761886597,
|
|
"learning_rate": 2.180616740088106e-06,
|
|
"loss": 0.3266,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.02206531332744925,
|
|
"eval_loss": 0.3100859820842743,
|
|
"eval_runtime": 269.4841,
|
|
"eval_samples_per_second": 56.638,
|
|
"eval_steps_per_second": 7.08,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.02316857899382171,
|
|
"grad_norm": 0.47427237033843994,
|
|
"learning_rate": 2.290748898678414e-06,
|
|
"loss": 0.2955,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.024271844660194174,
|
|
"grad_norm": 0.49798524379730225,
|
|
"learning_rate": 2.400881057268723e-06,
|
|
"loss": 0.302,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.025375110326566638,
|
|
"grad_norm": 0.46623724699020386,
|
|
"learning_rate": 2.511013215859031e-06,
|
|
"loss": 0.2971,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.0264783759929391,
|
|
"grad_norm": 0.4659285247325897,
|
|
"learning_rate": 2.6211453744493394e-06,
|
|
"loss": 0.2918,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.02758164165931156,
|
|
"grad_norm": 0.46458956599235535,
|
|
"learning_rate": 2.731277533039648e-06,
|
|
"loss": 0.2819,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.028684907325684024,
|
|
"grad_norm": 0.5174154043197632,
|
|
"learning_rate": 2.841409691629956e-06,
|
|
"loss": 0.2953,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.029788172992056487,
|
|
"grad_norm": 0.49397629499435425,
|
|
"learning_rate": 2.9515418502202646e-06,
|
|
"loss": 0.2932,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.03089143865842895,
|
|
"grad_norm": 0.4400649964809418,
|
|
"learning_rate": 3.061674008810573e-06,
|
|
"loss": 0.2829,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.031994704324801414,
|
|
"grad_norm": 0.4720049798488617,
|
|
"learning_rate": 3.1718061674008815e-06,
|
|
"loss": 0.2905,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.03309796999117388,
|
|
"grad_norm": 0.46713733673095703,
|
|
"learning_rate": 3.2819383259911898e-06,
|
|
"loss": 0.2949,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.03420123565754634,
|
|
"grad_norm": 0.4691792130470276,
|
|
"learning_rate": 3.3920704845814985e-06,
|
|
"loss": 0.2845,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.0353045013239188,
|
|
"grad_norm": 0.507400393486023,
|
|
"learning_rate": 3.5022026431718063e-06,
|
|
"loss": 0.2929,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.03640776699029126,
|
|
"grad_norm": 0.5042280554771423,
|
|
"learning_rate": 3.6123348017621146e-06,
|
|
"loss": 0.2903,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.03751103265666372,
|
|
"grad_norm": 0.4863748252391815,
|
|
"learning_rate": 3.7224669603524232e-06,
|
|
"loss": 0.2871,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.038614298323036186,
|
|
"grad_norm": 0.510486364364624,
|
|
"learning_rate": 3.8325991189427315e-06,
|
|
"loss": 0.2995,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.03971756398940865,
|
|
"grad_norm": 0.5123398900032043,
|
|
"learning_rate": 3.94273127753304e-06,
|
|
"loss": 0.2794,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.04082082965578111,
|
|
"grad_norm": 0.45075932145118713,
|
|
"learning_rate": 4.052863436123348e-06,
|
|
"loss": 0.2827,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.041924095322153576,
|
|
"grad_norm": 0.4237598180770874,
|
|
"learning_rate": 4.162995594713657e-06,
|
|
"loss": 0.2806,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.04302736098852604,
|
|
"grad_norm": 0.43099313974380493,
|
|
"learning_rate": 4.273127753303965e-06,
|
|
"loss": 0.2667,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.0441306266548985,
|
|
"grad_norm": 0.5144179463386536,
|
|
"learning_rate": 4.383259911894274e-06,
|
|
"loss": 0.2748,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.0441306266548985,
|
|
"eval_loss": 0.28673937916755676,
|
|
"eval_runtime": 271.0828,
|
|
"eval_samples_per_second": 56.304,
|
|
"eval_steps_per_second": 7.038,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.04523389232127096,
|
|
"grad_norm": 0.4969826340675354,
|
|
"learning_rate": 4.493392070484582e-06,
|
|
"loss": 0.2763,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.04633715798764342,
|
|
"grad_norm": 0.471080482006073,
|
|
"learning_rate": 4.60352422907489e-06,
|
|
"loss": 0.2775,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.047440423654015886,
|
|
"grad_norm": 0.4287184476852417,
|
|
"learning_rate": 4.7136563876651984e-06,
|
|
"loss": 0.2694,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.04854368932038835,
|
|
"grad_norm": 0.5114341974258423,
|
|
"learning_rate": 4.823788546255507e-06,
|
|
"loss": 0.2803,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.04964695498676081,
|
|
"grad_norm": 0.5093077421188354,
|
|
"learning_rate": 4.933920704845816e-06,
|
|
"loss": 0.2802,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.050750220653133275,
|
|
"grad_norm": 0.5257371664047241,
|
|
"learning_rate": 5.044052863436124e-06,
|
|
"loss": 0.292,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.05185348631950574,
|
|
"grad_norm": 0.3984781503677368,
|
|
"learning_rate": 5.154185022026432e-06,
|
|
"loss": 0.2659,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.0529567519858782,
|
|
"grad_norm": 0.4864000082015991,
|
|
"learning_rate": 5.2643171806167406e-06,
|
|
"loss": 0.2722,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.054060017652250665,
|
|
"grad_norm": 0.592187762260437,
|
|
"learning_rate": 5.374449339207049e-06,
|
|
"loss": 0.2793,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.05516328331862312,
|
|
"grad_norm": 0.6081680655479431,
|
|
"learning_rate": 5.484581497797358e-06,
|
|
"loss": 0.2723,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.056266548984995585,
|
|
"grad_norm": 0.4736359715461731,
|
|
"learning_rate": 5.594713656387666e-06,
|
|
"loss": 0.2539,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.05736981465136805,
|
|
"grad_norm": 0.46948790550231934,
|
|
"learning_rate": 5.704845814977974e-06,
|
|
"loss": 0.2748,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.05847308031774051,
|
|
"grad_norm": 0.5093392729759216,
|
|
"learning_rate": 5.814977973568282e-06,
|
|
"loss": 0.2614,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.059576345984112974,
|
|
"grad_norm": 0.5078116655349731,
|
|
"learning_rate": 5.925110132158591e-06,
|
|
"loss": 0.2706,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.06067961165048544,
|
|
"grad_norm": 0.5074042677879333,
|
|
"learning_rate": 6.035242290748899e-06,
|
|
"loss": 0.2644,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.0617828773168579,
|
|
"grad_norm": 0.45398184657096863,
|
|
"learning_rate": 6.1453744493392075e-06,
|
|
"loss": 0.2612,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.06288614298323036,
|
|
"grad_norm": 0.47826600074768066,
|
|
"learning_rate": 6.255506607929516e-06,
|
|
"loss": 0.275,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.06398940864960283,
|
|
"grad_norm": 0.49216607213020325,
|
|
"learning_rate": 6.365638766519824e-06,
|
|
"loss": 0.2639,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.06509267431597529,
|
|
"grad_norm": 0.5131933093070984,
|
|
"learning_rate": 6.475770925110133e-06,
|
|
"loss": 0.286,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.06619593998234775,
|
|
"grad_norm": 0.4883180856704712,
|
|
"learning_rate": 6.585903083700441e-06,
|
|
"loss": 0.2813,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.06619593998234775,
|
|
"eval_loss": 0.2764524519443512,
|
|
"eval_runtime": 269.5554,
|
|
"eval_samples_per_second": 56.623,
|
|
"eval_steps_per_second": 7.078,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.06729920564872022,
|
|
"grad_norm": 0.47087526321411133,
|
|
"learning_rate": 6.69603524229075e-06,
|
|
"loss": 0.2724,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.06840247131509268,
|
|
"grad_norm": 0.4638593792915344,
|
|
"learning_rate": 6.806167400881057e-06,
|
|
"loss": 0.2572,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.06950573698146513,
|
|
"grad_norm": 0.5100425481796265,
|
|
"learning_rate": 6.916299559471367e-06,
|
|
"loss": 0.2814,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.0706090026478376,
|
|
"grad_norm": 0.505667507648468,
|
|
"learning_rate": 7.026431718061674e-06,
|
|
"loss": 0.2672,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.07171226831421006,
|
|
"grad_norm": 0.4753463864326477,
|
|
"learning_rate": 7.136563876651983e-06,
|
|
"loss": 0.2672,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.07281553398058252,
|
|
"grad_norm": 0.5588636994361877,
|
|
"learning_rate": 7.246696035242291e-06,
|
|
"loss": 0.2732,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.07391879964695498,
|
|
"grad_norm": 0.5111093521118164,
|
|
"learning_rate": 7.3568281938326e-06,
|
|
"loss": 0.2643,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.07502206531332745,
|
|
"grad_norm": 0.49439841508865356,
|
|
"learning_rate": 7.466960352422908e-06,
|
|
"loss": 0.2677,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.07612533097969991,
|
|
"grad_norm": 0.4980764389038086,
|
|
"learning_rate": 7.5770925110132166e-06,
|
|
"loss": 0.2577,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.07722859664607237,
|
|
"grad_norm": 0.5545366406440735,
|
|
"learning_rate": 7.687224669603525e-06,
|
|
"loss": 0.2735,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.07833186231244484,
|
|
"grad_norm": 0.45210951566696167,
|
|
"learning_rate": 7.797356828193832e-06,
|
|
"loss": 0.2544,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.0794351279788173,
|
|
"grad_norm": 0.4653448462486267,
|
|
"learning_rate": 7.907488986784141e-06,
|
|
"loss": 0.2815,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.08053839364518976,
|
|
"grad_norm": 0.4663446247577667,
|
|
"learning_rate": 8.01762114537445e-06,
|
|
"loss": 0.2647,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.08164165931156223,
|
|
"grad_norm": 0.5129761695861816,
|
|
"learning_rate": 8.127753303964758e-06,
|
|
"loss": 0.2561,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.08274492497793469,
|
|
"grad_norm": 0.5074746012687683,
|
|
"learning_rate": 8.237885462555067e-06,
|
|
"loss": 0.2642,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.08384819064430715,
|
|
"grad_norm": 0.48852622509002686,
|
|
"learning_rate": 8.348017621145376e-06,
|
|
"loss": 0.2484,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.08495145631067962,
|
|
"grad_norm": 0.46195775270462036,
|
|
"learning_rate": 8.458149779735683e-06,
|
|
"loss": 0.2432,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.08605472197705208,
|
|
"grad_norm": 0.5792168974876404,
|
|
"learning_rate": 8.568281938325993e-06,
|
|
"loss": 0.2711,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.08715798764342454,
|
|
"grad_norm": 0.57877516746521,
|
|
"learning_rate": 8.6784140969163e-06,
|
|
"loss": 0.2758,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.088261253309797,
|
|
"grad_norm": 0.4454537332057953,
|
|
"learning_rate": 8.788546255506607e-06,
|
|
"loss": 0.269,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.088261253309797,
|
|
"eval_loss": 0.27229130268096924,
|
|
"eval_runtime": 269.3494,
|
|
"eval_samples_per_second": 56.666,
|
|
"eval_steps_per_second": 7.084,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.08936451897616945,
|
|
"grad_norm": 0.5199728608131409,
|
|
"learning_rate": 8.898678414096917e-06,
|
|
"loss": 0.2594,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.09046778464254192,
|
|
"grad_norm": 0.4717728793621063,
|
|
"learning_rate": 9.008810572687226e-06,
|
|
"loss": 0.267,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.09157105030891438,
|
|
"grad_norm": 0.4550599157810211,
|
|
"learning_rate": 9.118942731277533e-06,
|
|
"loss": 0.254,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.09267431597528684,
|
|
"grad_norm": 0.5019270777702332,
|
|
"learning_rate": 9.229074889867842e-06,
|
|
"loss": 0.263,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.09377758164165931,
|
|
"grad_norm": 0.5012001991271973,
|
|
"learning_rate": 9.339207048458151e-06,
|
|
"loss": 0.2588,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.09488084730803177,
|
|
"grad_norm": 0.5489994883537292,
|
|
"learning_rate": 9.449339207048459e-06,
|
|
"loss": 0.2684,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.09598411297440423,
|
|
"grad_norm": 0.5029094219207764,
|
|
"learning_rate": 9.559471365638768e-06,
|
|
"loss": 0.2635,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.0970873786407767,
|
|
"grad_norm": 0.5363909006118774,
|
|
"learning_rate": 9.669603524229075e-06,
|
|
"loss": 0.2572,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.09819064430714916,
|
|
"grad_norm": 0.6020154356956482,
|
|
"learning_rate": 9.779735682819384e-06,
|
|
"loss": 0.2554,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.09929390997352162,
|
|
"grad_norm": 0.5789384841918945,
|
|
"learning_rate": 9.889867841409693e-06,
|
|
"loss": 0.2814,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.10039717563989409,
|
|
"grad_norm": 0.5141489505767822,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.2498,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.10150044130626655,
|
|
"grad_norm": 0.5430559515953064,
|
|
"learning_rate": 9.99999167904182e-06,
|
|
"loss": 0.276,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.10260370697263901,
|
|
"grad_norm": 0.5350551009178162,
|
|
"learning_rate": 9.999966716194973e-06,
|
|
"loss": 0.2364,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.10370697263901148,
|
|
"grad_norm": 0.5607656836509705,
|
|
"learning_rate": 9.999925111542544e-06,
|
|
"loss": 0.2599,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.10481023830538394,
|
|
"grad_norm": 0.4968941807746887,
|
|
"learning_rate": 9.99986686522301e-06,
|
|
"loss": 0.262,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.1059135039717564,
|
|
"grad_norm": 0.49971750378608704,
|
|
"learning_rate": 9.999791977430238e-06,
|
|
"loss": 0.2642,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.10701676963812887,
|
|
"grad_norm": 0.48582854866981506,
|
|
"learning_rate": 9.999700448413483e-06,
|
|
"loss": 0.252,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.10812003530450133,
|
|
"grad_norm": 0.5446631908416748,
|
|
"learning_rate": 9.999592278477389e-06,
|
|
"loss": 0.2652,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.10922330097087378,
|
|
"grad_norm": 0.4594772160053253,
|
|
"learning_rate": 9.999467467981984e-06,
|
|
"loss": 0.253,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.11032656663724624,
|
|
"grad_norm": 0.5003575682640076,
|
|
"learning_rate": 9.999326017342688e-06,
|
|
"loss": 0.2629,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.11032656663724624,
|
|
"eval_loss": 0.26893937587738037,
|
|
"eval_runtime": 268.2711,
|
|
"eval_samples_per_second": 56.894,
|
|
"eval_steps_per_second": 7.112,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.1114298323036187,
|
|
"grad_norm": 0.4908248484134674,
|
|
"learning_rate": 9.999167927030304e-06,
|
|
"loss": 0.2577,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.11253309796999117,
|
|
"grad_norm": 0.6282269358634949,
|
|
"learning_rate": 9.998993197571014e-06,
|
|
"loss": 0.2714,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.11363636363636363,
|
|
"grad_norm": 0.4572107195854187,
|
|
"learning_rate": 9.998801829546387e-06,
|
|
"loss": 0.2469,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.1147396293027361,
|
|
"grad_norm": 0.5121334195137024,
|
|
"learning_rate": 9.99859382359337e-06,
|
|
"loss": 0.2657,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.11584289496910856,
|
|
"grad_norm": 0.4956417679786682,
|
|
"learning_rate": 9.998369180404283e-06,
|
|
"loss": 0.2435,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.11694616063548102,
|
|
"grad_norm": 0.6356124877929688,
|
|
"learning_rate": 9.998127900726825e-06,
|
|
"loss": 0.2694,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.11804942630185349,
|
|
"grad_norm": 0.5022862553596497,
|
|
"learning_rate": 9.997869985364073e-06,
|
|
"loss": 0.2655,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.11915269196822595,
|
|
"grad_norm": 0.47873038053512573,
|
|
"learning_rate": 9.997595435174461e-06,
|
|
"loss": 0.2704,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.12025595763459841,
|
|
"grad_norm": 0.46941813826560974,
|
|
"learning_rate": 9.997304251071802e-06,
|
|
"loss": 0.2594,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.12135922330097088,
|
|
"grad_norm": 0.47806182503700256,
|
|
"learning_rate": 9.996996434025264e-06,
|
|
"loss": 0.2597,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.12246248896734334,
|
|
"grad_norm": 0.666239857673645,
|
|
"learning_rate": 9.996671985059384e-06,
|
|
"loss": 0.2722,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.1235657546337158,
|
|
"grad_norm": 0.40519818663597107,
|
|
"learning_rate": 9.99633090525405e-06,
|
|
"loss": 0.2483,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.12466902030008827,
|
|
"grad_norm": 0.5072190165519714,
|
|
"learning_rate": 9.99597319574451e-06,
|
|
"loss": 0.2528,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.12577228596646073,
|
|
"grad_norm": 0.4706517159938812,
|
|
"learning_rate": 9.995598857721354e-06,
|
|
"loss": 0.2628,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.12687555163283318,
|
|
"grad_norm": 0.46087875962257385,
|
|
"learning_rate": 9.995207892430525e-06,
|
|
"loss": 0.2537,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.12797881729920565,
|
|
"grad_norm": 0.5278568863868713,
|
|
"learning_rate": 9.994800301173303e-06,
|
|
"loss": 0.2687,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.1290820829655781,
|
|
"grad_norm": 0.5285748839378357,
|
|
"learning_rate": 9.994376085306309e-06,
|
|
"loss": 0.2647,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.13018534863195058,
|
|
"grad_norm": 0.5125618577003479,
|
|
"learning_rate": 9.9939352462415e-06,
|
|
"loss": 0.251,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.13128861429832303,
|
|
"grad_norm": 0.64532870054245,
|
|
"learning_rate": 9.993477785446151e-06,
|
|
"loss": 0.2574,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.1323918799646955,
|
|
"grad_norm": 0.49944519996643066,
|
|
"learning_rate": 9.99300370444287e-06,
|
|
"loss": 0.2495,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.1323918799646955,
|
|
"eval_loss": 0.2665667235851288,
|
|
"eval_runtime": 268.5635,
|
|
"eval_samples_per_second": 56.832,
|
|
"eval_steps_per_second": 7.104,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.13349514563106796,
|
|
"grad_norm": 0.4549316465854645,
|
|
"learning_rate": 9.99251300480958e-06,
|
|
"loss": 0.2765,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.13459841129744043,
|
|
"grad_norm": 0.5687413215637207,
|
|
"learning_rate": 9.992005688179518e-06,
|
|
"loss": 0.2729,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.13570167696381288,
|
|
"grad_norm": 0.5213468074798584,
|
|
"learning_rate": 9.991481756241228e-06,
|
|
"loss": 0.2536,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.13680494263018536,
|
|
"grad_norm": 0.5374130606651306,
|
|
"learning_rate": 9.990941210738553e-06,
|
|
"loss": 0.2629,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.1379082082965578,
|
|
"grad_norm": 0.4314231276512146,
|
|
"learning_rate": 9.99038405347064e-06,
|
|
"loss": 0.2538,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.13901147396293026,
|
|
"grad_norm": 0.4665580093860626,
|
|
"learning_rate": 9.989810286291923e-06,
|
|
"loss": 0.2538,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.14011473962930274,
|
|
"grad_norm": 0.5242295861244202,
|
|
"learning_rate": 9.989219911112114e-06,
|
|
"loss": 0.2633,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.1412180052956752,
|
|
"grad_norm": 0.49017295241355896,
|
|
"learning_rate": 9.988612929896211e-06,
|
|
"loss": 0.2678,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.14232127096204766,
|
|
"grad_norm": 0.5350978970527649,
|
|
"learning_rate": 9.987989344664479e-06,
|
|
"loss": 0.2686,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.1434245366284201,
|
|
"grad_norm": 0.5256524085998535,
|
|
"learning_rate": 9.98734915749245e-06,
|
|
"loss": 0.2623,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.1445278022947926,
|
|
"grad_norm": 0.5268611907958984,
|
|
"learning_rate": 9.98669237051091e-06,
|
|
"loss": 0.2545,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.14563106796116504,
|
|
"grad_norm": 0.4854044020175934,
|
|
"learning_rate": 9.986018985905901e-06,
|
|
"loss": 0.2624,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.14673433362753752,
|
|
"grad_norm": 0.6195608973503113,
|
|
"learning_rate": 9.985329005918702e-06,
|
|
"loss": 0.26,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.14783759929390997,
|
|
"grad_norm": 0.5011169910430908,
|
|
"learning_rate": 9.984622432845835e-06,
|
|
"loss": 0.2468,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.14894086496028244,
|
|
"grad_norm": 0.5306973457336426,
|
|
"learning_rate": 9.98389926903904e-06,
|
|
"loss": 0.2478,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.1500441306266549,
|
|
"grad_norm": 0.5306282043457031,
|
|
"learning_rate": 9.983159516905287e-06,
|
|
"loss": 0.2589,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.15114739629302737,
|
|
"grad_norm": 0.47503378987312317,
|
|
"learning_rate": 9.982403178906755e-06,
|
|
"loss": 0.2467,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.15225066195939982,
|
|
"grad_norm": 0.548812985420227,
|
|
"learning_rate": 9.981630257560825e-06,
|
|
"loss": 0.2649,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.1533539276257723,
|
|
"grad_norm": 0.5385993719100952,
|
|
"learning_rate": 9.980840755440075e-06,
|
|
"loss": 0.2507,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.15445719329214475,
|
|
"grad_norm": 0.5004397034645081,
|
|
"learning_rate": 9.980034675172274e-06,
|
|
"loss": 0.2648,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.15445719329214475,
|
|
"eval_loss": 0.2640049159526825,
|
|
"eval_runtime": 273.2064,
|
|
"eval_samples_per_second": 55.866,
|
|
"eval_steps_per_second": 6.984,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.15556045895851722,
|
|
"grad_norm": 0.5281751155853271,
|
|
"learning_rate": 9.979212019440364e-06,
|
|
"loss": 0.2598,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.15666372462488967,
|
|
"grad_norm": 0.41343384981155396,
|
|
"learning_rate": 9.978372790982457e-06,
|
|
"loss": 0.2502,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.15776699029126215,
|
|
"grad_norm": 0.4603078365325928,
|
|
"learning_rate": 9.977516992591832e-06,
|
|
"loss": 0.2716,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.1588702559576346,
|
|
"grad_norm": 0.47830113768577576,
|
|
"learning_rate": 9.976644627116906e-06,
|
|
"loss": 0.2532,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.15997352162400705,
|
|
"grad_norm": 0.4745546281337738,
|
|
"learning_rate": 9.975755697461254e-06,
|
|
"loss": 0.2602,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.16107678729037953,
|
|
"grad_norm": 0.47886019945144653,
|
|
"learning_rate": 9.97485020658357e-06,
|
|
"loss": 0.273,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.16218005295675197,
|
|
"grad_norm": 0.7469276189804077,
|
|
"learning_rate": 9.973928157497675e-06,
|
|
"loss": 0.2631,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.16328331862312445,
|
|
"grad_norm": 0.520545482635498,
|
|
"learning_rate": 9.972989553272501e-06,
|
|
"loss": 0.2506,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.1643865842894969,
|
|
"grad_norm": 0.428124338388443,
|
|
"learning_rate": 9.972034397032086e-06,
|
|
"loss": 0.2482,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.16548984995586938,
|
|
"grad_norm": 0.5379740595817566,
|
|
"learning_rate": 9.971062691955553e-06,
|
|
"loss": 0.2557,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.16659311562224183,
|
|
"grad_norm": 0.5184879899024963,
|
|
"learning_rate": 9.970074441277111e-06,
|
|
"loss": 0.2587,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.1676963812886143,
|
|
"grad_norm": 0.4643745422363281,
|
|
"learning_rate": 9.969069648286034e-06,
|
|
"loss": 0.2538,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.16879964695498675,
|
|
"grad_norm": 0.48147132992744446,
|
|
"learning_rate": 9.968048316326661e-06,
|
|
"loss": 0.2534,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.16990291262135923,
|
|
"grad_norm": 0.5775375366210938,
|
|
"learning_rate": 9.967010448798376e-06,
|
|
"loss": 0.2659,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.17100617828773168,
|
|
"grad_norm": 0.5497225522994995,
|
|
"learning_rate": 9.9659560491556e-06,
|
|
"loss": 0.2464,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.17210944395410416,
|
|
"grad_norm": 0.4842096269130707,
|
|
"learning_rate": 9.964885120907777e-06,
|
|
"loss": 0.2341,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.1732127096204766,
|
|
"grad_norm": 0.5398717522621155,
|
|
"learning_rate": 9.963797667619368e-06,
|
|
"loss": 0.2585,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.17431597528684908,
|
|
"grad_norm": 0.5121772289276123,
|
|
"learning_rate": 9.962693692909834e-06,
|
|
"loss": 0.2677,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.17541924095322153,
|
|
"grad_norm": 0.5982369184494019,
|
|
"learning_rate": 9.961573200453627e-06,
|
|
"loss": 0.2572,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.176522506619594,
|
|
"grad_norm": 0.527195394039154,
|
|
"learning_rate": 9.960436193980175e-06,
|
|
"loss": 0.2503,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.176522506619594,
|
|
"eval_loss": 0.2625011205673218,
|
|
"eval_runtime": 269.8781,
|
|
"eval_samples_per_second": 56.555,
|
|
"eval_steps_per_second": 7.07,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.17762577228596646,
|
|
"grad_norm": 0.5573858022689819,
|
|
"learning_rate": 9.959282677273869e-06,
|
|
"loss": 0.266,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.1787290379523389,
|
|
"grad_norm": 0.4883844554424286,
|
|
"learning_rate": 9.958112654174058e-06,
|
|
"loss": 0.2572,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.1798323036187114,
|
|
"grad_norm": 0.46676844358444214,
|
|
"learning_rate": 9.956926128575026e-06,
|
|
"loss": 0.2459,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.18093556928508384,
|
|
"grad_norm": 0.4161823093891144,
|
|
"learning_rate": 9.955723104425986e-06,
|
|
"loss": 0.2411,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.1820388349514563,
|
|
"grad_norm": 0.45280569791793823,
|
|
"learning_rate": 9.954503585731061e-06,
|
|
"loss": 0.2586,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.18314210061782876,
|
|
"grad_norm": 0.43848279118537903,
|
|
"learning_rate": 9.953267576549279e-06,
|
|
"loss": 0.2464,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.18424536628420124,
|
|
"grad_norm": 0.48168620467185974,
|
|
"learning_rate": 9.95201508099455e-06,
|
|
"loss": 0.2512,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.1853486319505737,
|
|
"grad_norm": 0.5931830406188965,
|
|
"learning_rate": 9.950746103235663e-06,
|
|
"loss": 0.2526,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.18645189761694617,
|
|
"grad_norm": 0.4570105969905853,
|
|
"learning_rate": 9.949460647496258e-06,
|
|
"loss": 0.2457,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.18755516328331862,
|
|
"grad_norm": 0.4142732322216034,
|
|
"learning_rate": 9.948158718054828e-06,
|
|
"loss": 0.2441,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.1886584289496911,
|
|
"grad_norm": 0.48708921670913696,
|
|
"learning_rate": 9.94684031924469e-06,
|
|
"loss": 0.2577,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.18976169461606354,
|
|
"grad_norm": 0.5016698241233826,
|
|
"learning_rate": 9.945505455453983e-06,
|
|
"loss": 0.2562,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.19086496028243602,
|
|
"grad_norm": 0.5526902675628662,
|
|
"learning_rate": 9.944154131125643e-06,
|
|
"loss": 0.255,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.19196822594880847,
|
|
"grad_norm": 0.526472806930542,
|
|
"learning_rate": 9.942786350757398e-06,
|
|
"loss": 0.2659,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.19307149161518095,
|
|
"grad_norm": 0.5003820061683655,
|
|
"learning_rate": 9.941402118901743e-06,
|
|
"loss": 0.2565,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.1941747572815534,
|
|
"grad_norm": 0.5030418038368225,
|
|
"learning_rate": 9.940001440165934e-06,
|
|
"loss": 0.2628,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.19527802294792587,
|
|
"grad_norm": 0.47498998045921326,
|
|
"learning_rate": 9.938584319211965e-06,
|
|
"loss": 0.2744,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.19638128861429832,
|
|
"grad_norm": 0.5270429253578186,
|
|
"learning_rate": 9.93715076075656e-06,
|
|
"loss": 0.2561,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.1974845542806708,
|
|
"grad_norm": 0.5205044150352478,
|
|
"learning_rate": 9.935700769571148e-06,
|
|
"loss": 0.2449,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.19858781994704325,
|
|
"grad_norm": 0.41483354568481445,
|
|
"learning_rate": 9.934234350481856e-06,
|
|
"loss": 0.2595,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.19858781994704325,
|
|
"eval_loss": 0.26145488023757935,
|
|
"eval_runtime": 273.4791,
|
|
"eval_samples_per_second": 55.81,
|
|
"eval_steps_per_second": 6.977,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.1996910856134157,
|
|
"grad_norm": 0.5100669860839844,
|
|
"learning_rate": 9.932751508369492e-06,
|
|
"loss": 0.2485,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.20079435127978817,
|
|
"grad_norm": 0.4988202154636383,
|
|
"learning_rate": 9.931252248169518e-06,
|
|
"loss": 0.2555,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.20189761694616062,
|
|
"grad_norm": 0.49361705780029297,
|
|
"learning_rate": 9.929736574872052e-06,
|
|
"loss": 0.2579,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.2030008826125331,
|
|
"grad_norm": 0.4196428060531616,
|
|
"learning_rate": 9.92820449352183e-06,
|
|
"loss": 0.2456,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.20410414827890555,
|
|
"grad_norm": 0.425731897354126,
|
|
"learning_rate": 9.926656009218208e-06,
|
|
"loss": 0.2457,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.20520741394527803,
|
|
"grad_norm": 0.4996449649333954,
|
|
"learning_rate": 9.925091127115139e-06,
|
|
"loss": 0.2689,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.20631067961165048,
|
|
"grad_norm": 0.4646408259868622,
|
|
"learning_rate": 9.923509852421144e-06,
|
|
"loss": 0.2429,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.20741394527802295,
|
|
"grad_norm": 0.5681747794151306,
|
|
"learning_rate": 9.921912190399317e-06,
|
|
"loss": 0.2581,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.2085172109443954,
|
|
"grad_norm": 0.45096200704574585,
|
|
"learning_rate": 9.920298146367287e-06,
|
|
"loss": 0.2465,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.20962047661076788,
|
|
"grad_norm": 0.4459875226020813,
|
|
"learning_rate": 9.91866772569721e-06,
|
|
"loss": 0.2593,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.21072374227714033,
|
|
"grad_norm": 0.4605613946914673,
|
|
"learning_rate": 9.917020933815753e-06,
|
|
"loss": 0.2646,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.2118270079435128,
|
|
"grad_norm": 0.5199949741363525,
|
|
"learning_rate": 9.91535777620407e-06,
|
|
"loss": 0.2572,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.21293027360988526,
|
|
"grad_norm": 0.42653989791870117,
|
|
"learning_rate": 9.913678258397785e-06,
|
|
"loss": 0.2547,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.21403353927625773,
|
|
"grad_norm": 0.5204625725746155,
|
|
"learning_rate": 9.91198238598698e-06,
|
|
"loss": 0.2407,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.21513680494263018,
|
|
"grad_norm": 0.4516238272190094,
|
|
"learning_rate": 9.910270164616168e-06,
|
|
"loss": 0.2531,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.21624007060900266,
|
|
"grad_norm": 0.4373854696750641,
|
|
"learning_rate": 9.908541599984276e-06,
|
|
"loss": 0.2495,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.2173433362753751,
|
|
"grad_norm": 0.5143552422523499,
|
|
"learning_rate": 9.90679669784463e-06,
|
|
"loss": 0.2496,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.21844660194174756,
|
|
"grad_norm": 0.41742590069770813,
|
|
"learning_rate": 9.905035464004935e-06,
|
|
"loss": 0.2481,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.21954986760812004,
|
|
"grad_norm": 0.46620362997055054,
|
|
"learning_rate": 9.90325790432725e-06,
|
|
"loss": 0.2625,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.22065313327449249,
|
|
"grad_norm": 0.4866413176059723,
|
|
"learning_rate": 9.901464024727976e-06,
|
|
"loss": 0.247,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.22065313327449249,
|
|
"eval_loss": 0.25996777415275574,
|
|
"eval_runtime": 273.2634,
|
|
"eval_samples_per_second": 55.855,
|
|
"eval_steps_per_second": 6.982,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.22175639894086496,
|
|
"grad_norm": 0.4647798240184784,
|
|
"learning_rate": 9.899653831177831e-06,
|
|
"loss": 0.2528,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.2228596646072374,
|
|
"grad_norm": 0.4932115972042084,
|
|
"learning_rate": 9.897827329701834e-06,
|
|
"loss": 0.2544,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.2239629302736099,
|
|
"grad_norm": 0.4925852417945862,
|
|
"learning_rate": 9.895984526379282e-06,
|
|
"loss": 0.2621,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.22506619593998234,
|
|
"grad_norm": 0.5298591256141663,
|
|
"learning_rate": 9.89412542734373e-06,
|
|
"loss": 0.2542,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.22616946160635482,
|
|
"grad_norm": 0.5149207711219788,
|
|
"learning_rate": 9.892250038782972e-06,
|
|
"loss": 0.2579,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 0.45972946286201477,
|
|
"learning_rate": 9.890358366939021e-06,
|
|
"loss": 0.2534,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.22837599293909974,
|
|
"grad_norm": 0.4157005846500397,
|
|
"learning_rate": 9.888450418108085e-06,
|
|
"loss": 0.243,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.2294792586054722,
|
|
"grad_norm": 0.39558079838752747,
|
|
"learning_rate": 9.88652619864055e-06,
|
|
"loss": 0.2461,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.23058252427184467,
|
|
"grad_norm": 0.47637176513671875,
|
|
"learning_rate": 9.884585714940953e-06,
|
|
"loss": 0.2353,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.23168578993821712,
|
|
"grad_norm": 0.5233368277549744,
|
|
"learning_rate": 9.882628973467972e-06,
|
|
"loss": 0.2536,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.2327890556045896,
|
|
"grad_norm": 0.4879682660102844,
|
|
"learning_rate": 9.880655980734391e-06,
|
|
"loss": 0.2611,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.23389232127096204,
|
|
"grad_norm": 0.4481244385242462,
|
|
"learning_rate": 9.878666743307083e-06,
|
|
"loss": 0.2549,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.23499558693733452,
|
|
"grad_norm": 0.43410834670066833,
|
|
"learning_rate": 9.876661267806995e-06,
|
|
"loss": 0.2589,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.23609885260370697,
|
|
"grad_norm": 0.6263585686683655,
|
|
"learning_rate": 9.874639560909118e-06,
|
|
"loss": 0.2471,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.23720211827007945,
|
|
"grad_norm": 0.47275134921073914,
|
|
"learning_rate": 9.872601629342468e-06,
|
|
"loss": 0.2575,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.2383053839364519,
|
|
"grad_norm": 0.5037277340888977,
|
|
"learning_rate": 9.870547479890062e-06,
|
|
"loss": 0.2549,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.23940864960282435,
|
|
"grad_norm": 0.5256139039993286,
|
|
"learning_rate": 9.868477119388897e-06,
|
|
"loss": 0.2574,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.24051191526919682,
|
|
"grad_norm": 0.46220555901527405,
|
|
"learning_rate": 9.866390554729923e-06,
|
|
"loss": 0.257,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.24161518093556927,
|
|
"grad_norm": 0.48123809695243835,
|
|
"learning_rate": 9.864287792858032e-06,
|
|
"loss": 0.2437,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.24271844660194175,
|
|
"grad_norm": 0.5462665557861328,
|
|
"learning_rate": 9.862168840772018e-06,
|
|
"loss": 0.2454,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.24271844660194175,
|
|
"eval_loss": 0.25873637199401855,
|
|
"eval_runtime": 271.9106,
|
|
"eval_samples_per_second": 56.132,
|
|
"eval_steps_per_second": 7.017,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.2438217122683142,
|
|
"grad_norm": 0.5108821392059326,
|
|
"learning_rate": 9.860033705524566e-06,
|
|
"loss": 0.247,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.24492497793468668,
|
|
"grad_norm": 0.47107893228530884,
|
|
"learning_rate": 9.857882394222225e-06,
|
|
"loss": 0.2546,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.24602824360105913,
|
|
"grad_norm": 0.4935952425003052,
|
|
"learning_rate": 9.855714914025386e-06,
|
|
"loss": 0.247,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.2471315092674316,
|
|
"grad_norm": 0.5136795043945312,
|
|
"learning_rate": 9.853531272148248e-06,
|
|
"loss": 0.2615,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.24823477493380405,
|
|
"grad_norm": 0.5249958634376526,
|
|
"learning_rate": 9.851331475858813e-06,
|
|
"loss": 0.2619,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.24933804060017653,
|
|
"grad_norm": 0.4954059422016144,
|
|
"learning_rate": 9.849115532478848e-06,
|
|
"loss": 0.2473,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.250441306266549,
|
|
"grad_norm": 0.47944945096969604,
|
|
"learning_rate": 9.846883449383854e-06,
|
|
"loss": 0.2566,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.25154457193292146,
|
|
"grad_norm": 0.5183018445968628,
|
|
"learning_rate": 9.844635234003067e-06,
|
|
"loss": 0.2629,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.25264783759929393,
|
|
"grad_norm": 0.4572855830192566,
|
|
"learning_rate": 9.842370893819404e-06,
|
|
"loss": 0.2593,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.25375110326566636,
|
|
"grad_norm": 0.4775985777378082,
|
|
"learning_rate": 9.840090436369458e-06,
|
|
"loss": 0.2354,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.25485436893203883,
|
|
"grad_norm": 0.48503291606903076,
|
|
"learning_rate": 9.837793869243468e-06,
|
|
"loss": 0.2483,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.2559576345984113,
|
|
"grad_norm": 0.46030426025390625,
|
|
"learning_rate": 9.83548120008529e-06,
|
|
"loss": 0.2616,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.2570609002647838,
|
|
"grad_norm": 0.5037588477134705,
|
|
"learning_rate": 9.83315243659237e-06,
|
|
"loss": 0.2488,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.2581641659311562,
|
|
"grad_norm": 0.508270263671875,
|
|
"learning_rate": 9.830807586515726e-06,
|
|
"loss": 0.2579,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.2592674315975287,
|
|
"grad_norm": 0.4799206554889679,
|
|
"learning_rate": 9.828446657659919e-06,
|
|
"loss": 0.25,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.26037069726390116,
|
|
"grad_norm": 0.531873881816864,
|
|
"learning_rate": 9.826069657883027e-06,
|
|
"loss": 0.2467,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.2614739629302736,
|
|
"grad_norm": 0.5633664727210999,
|
|
"learning_rate": 9.823676595096612e-06,
|
|
"loss": 0.2595,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.26257722859664606,
|
|
"grad_norm": 0.5257665514945984,
|
|
"learning_rate": 9.821267477265705e-06,
|
|
"loss": 0.2662,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.26368049426301854,
|
|
"grad_norm": 0.5463647246360779,
|
|
"learning_rate": 9.818842312408776e-06,
|
|
"loss": 0.2478,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.264783759929391,
|
|
"grad_norm": 0.4790140986442566,
|
|
"learning_rate": 9.816401108597704e-06,
|
|
"loss": 0.2516,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.264783759929391,
|
|
"eval_loss": 0.25740158557891846,
|
|
"eval_runtime": 274.3768,
|
|
"eval_samples_per_second": 55.628,
|
|
"eval_steps_per_second": 6.954,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.26588702559576344,
|
|
"grad_norm": 0.44939637184143066,
|
|
"learning_rate": 9.813943873957748e-06,
|
|
"loss": 0.2568,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.2669902912621359,
|
|
"grad_norm": 0.44032007455825806,
|
|
"learning_rate": 9.811470616667525e-06,
|
|
"loss": 0.2598,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.2680935569285084,
|
|
"grad_norm": 0.4683074951171875,
|
|
"learning_rate": 9.808981344958988e-06,
|
|
"loss": 0.2468,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.26919682259488087,
|
|
"grad_norm": 0.46099165081977844,
|
|
"learning_rate": 9.806476067117384e-06,
|
|
"loss": 0.2597,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.2703000882612533,
|
|
"grad_norm": 0.47137463092803955,
|
|
"learning_rate": 9.803954791481239e-06,
|
|
"loss": 0.2564,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.27140335392762577,
|
|
"grad_norm": 0.41110455989837646,
|
|
"learning_rate": 9.801417526442326e-06,
|
|
"loss": 0.256,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.27250661959399824,
|
|
"grad_norm": 0.4750699996948242,
|
|
"learning_rate": 9.798864280445633e-06,
|
|
"loss": 0.2461,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.2736098852603707,
|
|
"grad_norm": 0.4262714684009552,
|
|
"learning_rate": 9.79629506198934e-06,
|
|
"loss": 0.2611,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.27471315092674314,
|
|
"grad_norm": 0.4699675738811493,
|
|
"learning_rate": 9.793709879624797e-06,
|
|
"loss": 0.2454,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.2758164165931156,
|
|
"grad_norm": 0.4742600619792938,
|
|
"learning_rate": 9.791108741956476e-06,
|
|
"loss": 0.2583,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.2769196822594881,
|
|
"grad_norm": 0.4389561414718628,
|
|
"learning_rate": 9.78849165764196e-06,
|
|
"loss": 0.24,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.2780229479258605,
|
|
"grad_norm": 0.4927821457386017,
|
|
"learning_rate": 9.785858635391913e-06,
|
|
"loss": 0.2527,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.279126213592233,
|
|
"grad_norm": 0.38495415449142456,
|
|
"learning_rate": 9.78320968397004e-06,
|
|
"loss": 0.2411,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.2802294792586055,
|
|
"grad_norm": 0.4532706141471863,
|
|
"learning_rate": 9.780544812193065e-06,
|
|
"loss": 0.234,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.28133274492497795,
|
|
"grad_norm": 0.48407411575317383,
|
|
"learning_rate": 9.777864028930705e-06,
|
|
"loss": 0.2599,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.2824360105913504,
|
|
"grad_norm": 0.47091105580329895,
|
|
"learning_rate": 9.77516734310563e-06,
|
|
"loss": 0.2445,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.28353927625772285,
|
|
"grad_norm": 0.5425460934638977,
|
|
"learning_rate": 9.772454763693453e-06,
|
|
"loss": 0.2499,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.2846425419240953,
|
|
"grad_norm": 0.43206480145454407,
|
|
"learning_rate": 9.769726299722668e-06,
|
|
"loss": 0.2539,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.2857458075904678,
|
|
"grad_norm": 0.49715983867645264,
|
|
"learning_rate": 9.766981960274653e-06,
|
|
"loss": 0.2526,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.2868490732568402,
|
|
"grad_norm": 0.4886232018470764,
|
|
"learning_rate": 9.764221754483623e-06,
|
|
"loss": 0.2496,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.2868490732568402,
|
|
"eval_loss": 0.2564772367477417,
|
|
"eval_runtime": 269.851,
|
|
"eval_samples_per_second": 56.561,
|
|
"eval_steps_per_second": 7.071,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.2879523389232127,
|
|
"grad_norm": 0.4968324601650238,
|
|
"learning_rate": 9.761445691536598e-06,
|
|
"loss": 0.2526,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.2890556045895852,
|
|
"grad_norm": 0.48738226294517517,
|
|
"learning_rate": 9.758653780673381e-06,
|
|
"loss": 0.243,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.29015887025595766,
|
|
"grad_norm": 0.45023027062416077,
|
|
"learning_rate": 9.755846031186521e-06,
|
|
"loss": 0.2463,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.2912621359223301,
|
|
"grad_norm": 0.5096351504325867,
|
|
"learning_rate": 9.753022452421286e-06,
|
|
"loss": 0.2522,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.29236540158870256,
|
|
"grad_norm": 0.4321053922176361,
|
|
"learning_rate": 9.750183053775625e-06,
|
|
"loss": 0.2482,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.29346866725507503,
|
|
"grad_norm": 0.48243576288223267,
|
|
"learning_rate": 9.747327844700147e-06,
|
|
"loss": 0.2583,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.2945719329214475,
|
|
"grad_norm": 0.5312182903289795,
|
|
"learning_rate": 9.744456834698083e-06,
|
|
"loss": 0.2437,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.29567519858781993,
|
|
"grad_norm": 0.46811169385910034,
|
|
"learning_rate": 9.741570033325254e-06,
|
|
"loss": 0.2387,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.2967784642541924,
|
|
"grad_norm": 0.4737708568572998,
|
|
"learning_rate": 9.738667450190041e-06,
|
|
"loss": 0.2715,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.2978817299205649,
|
|
"grad_norm": 0.4285770058631897,
|
|
"learning_rate": 9.73574909495335e-06,
|
|
"loss": 0.2318,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.2989849955869373,
|
|
"grad_norm": 0.42456915974617004,
|
|
"learning_rate": 9.732814977328593e-06,
|
|
"loss": 0.2534,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.3000882612533098,
|
|
"grad_norm": 0.4388004243373871,
|
|
"learning_rate": 9.729865107081631e-06,
|
|
"loss": 0.2494,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.30119152691968226,
|
|
"grad_norm": 0.48463258147239685,
|
|
"learning_rate": 9.726899494030768e-06,
|
|
"loss": 0.2542,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.30229479258605474,
|
|
"grad_norm": 0.4798240661621094,
|
|
"learning_rate": 9.723918148046696e-06,
|
|
"loss": 0.2485,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.30339805825242716,
|
|
"grad_norm": 0.5145127177238464,
|
|
"learning_rate": 9.720921079052483e-06,
|
|
"loss": 0.2463,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.30450132391879964,
|
|
"grad_norm": 0.4174281358718872,
|
|
"learning_rate": 9.717908297023517e-06,
|
|
"loss": 0.2394,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.3056045895851721,
|
|
"grad_norm": 0.4736640155315399,
|
|
"learning_rate": 9.714879811987496e-06,
|
|
"loss": 0.2474,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.3067078552515446,
|
|
"grad_norm": 0.46315228939056396,
|
|
"learning_rate": 9.711835634024378e-06,
|
|
"loss": 0.2482,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.307811120917917,
|
|
"grad_norm": 0.541100800037384,
|
|
"learning_rate": 9.708775773266353e-06,
|
|
"loss": 0.25,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.3089143865842895,
|
|
"grad_norm": 0.4666937589645386,
|
|
"learning_rate": 9.705700239897809e-06,
|
|
"loss": 0.239,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.3089143865842895,
|
|
"eval_loss": 0.25553634762763977,
|
|
"eval_runtime": 271.0024,
|
|
"eval_samples_per_second": 56.321,
|
|
"eval_steps_per_second": 7.041,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.31001765225066197,
|
|
"grad_norm": 0.49646076560020447,
|
|
"learning_rate": 9.702609044155303e-06,
|
|
"loss": 0.2436,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.31112091791703445,
|
|
"grad_norm": 0.48308032751083374,
|
|
"learning_rate": 9.699502196327515e-06,
|
|
"loss": 0.2517,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.31222418358340687,
|
|
"grad_norm": 0.6409610509872437,
|
|
"learning_rate": 9.69637970675523e-06,
|
|
"loss": 0.2509,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.31332744924977934,
|
|
"grad_norm": 0.5959620475769043,
|
|
"learning_rate": 9.69324158583129e-06,
|
|
"loss": 0.256,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.3144307149161518,
|
|
"grad_norm": 0.5620144009590149,
|
|
"learning_rate": 9.69008784400056e-06,
|
|
"loss": 0.2569,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.3155339805825243,
|
|
"grad_norm": 0.5051830410957336,
|
|
"learning_rate": 9.686918491759904e-06,
|
|
"loss": 0.2471,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.3166372462488967,
|
|
"grad_norm": 0.49281784892082214,
|
|
"learning_rate": 9.68373353965814e-06,
|
|
"loss": 0.2301,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.3177405119152692,
|
|
"grad_norm": 0.4283227324485779,
|
|
"learning_rate": 9.68053299829601e-06,
|
|
"loss": 0.2344,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.3188437775816417,
|
|
"grad_norm": 0.4529547095298767,
|
|
"learning_rate": 9.677316878326144e-06,
|
|
"loss": 0.2557,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.3199470432480141,
|
|
"grad_norm": 0.40247344970703125,
|
|
"learning_rate": 9.67408519045302e-06,
|
|
"loss": 0.2486,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.3210503089143866,
|
|
"grad_norm": 0.43372419476509094,
|
|
"learning_rate": 9.670837945432934e-06,
|
|
"loss": 0.2453,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.32215357458075905,
|
|
"grad_norm": 0.4570685625076294,
|
|
"learning_rate": 9.667575154073962e-06,
|
|
"loss": 0.2617,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.3232568402471315,
|
|
"grad_norm": 0.5153756141662598,
|
|
"learning_rate": 9.664296827235924e-06,
|
|
"loss": 0.2564,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.32436010591350395,
|
|
"grad_norm": 0.47910332679748535,
|
|
"learning_rate": 9.66100297583035e-06,
|
|
"loss": 0.2503,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.3254633715798764,
|
|
"grad_norm": 0.4647476077079773,
|
|
"learning_rate": 9.657693610820437e-06,
|
|
"loss": 0.2367,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.3265666372462489,
|
|
"grad_norm": 0.5447574257850647,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.2547,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.3276699029126214,
|
|
"grad_norm": 0.493915319442749,
|
|
"learning_rate": 9.651028384098538e-06,
|
|
"loss": 0.2386,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.3287731685789938,
|
|
"grad_norm": 0.4700816869735718,
|
|
"learning_rate": 9.647672544570981e-06,
|
|
"loss": 0.2537,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.3298764342453663,
|
|
"grad_norm": 0.38883256912231445,
|
|
"learning_rate": 9.644301235807872e-06,
|
|
"loss": 0.233,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.33097969991173876,
|
|
"grad_norm": 0.4903203547000885,
|
|
"learning_rate": 9.640914469030216e-06,
|
|
"loss": 0.2415,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.33097969991173876,
|
|
"eval_loss": 0.2547125220298767,
|
|
"eval_runtime": 274.3354,
|
|
"eval_samples_per_second": 55.636,
|
|
"eval_steps_per_second": 6.955,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.33208296557811123,
|
|
"grad_norm": 0.4478644132614136,
|
|
"learning_rate": 9.637512255510475e-06,
|
|
"loss": 0.236,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.33318623124448365,
|
|
"grad_norm": 0.44624054431915283,
|
|
"learning_rate": 9.634094606572515e-06,
|
|
"loss": 0.2526,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.33428949691085613,
|
|
"grad_norm": 0.4568576514720917,
|
|
"learning_rate": 9.630661533591584e-06,
|
|
"loss": 0.2353,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.3353927625772286,
|
|
"grad_norm": 0.427226722240448,
|
|
"learning_rate": 9.627213047994265e-06,
|
|
"loss": 0.2532,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.3364960282436011,
|
|
"grad_norm": 0.4701986610889435,
|
|
"learning_rate": 9.623749161258437e-06,
|
|
"loss": 0.2349,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.3375992939099735,
|
|
"grad_norm": 0.5643903017044067,
|
|
"learning_rate": 9.620269884913247e-06,
|
|
"loss": 0.259,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.338702559576346,
|
|
"grad_norm": 0.49091801047325134,
|
|
"learning_rate": 9.616775230539057e-06,
|
|
"loss": 0.2512,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.33980582524271846,
|
|
"grad_norm": 0.5190874338150024,
|
|
"learning_rate": 9.613265209767417e-06,
|
|
"loss": 0.245,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.3409090909090909,
|
|
"grad_norm": 0.6141373515129089,
|
|
"learning_rate": 9.609739834281023e-06,
|
|
"loss": 0.2742,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.34201235657546336,
|
|
"grad_norm": 0.5128368139266968,
|
|
"learning_rate": 9.606199115813672e-06,
|
|
"loss": 0.2559,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.34311562224183584,
|
|
"grad_norm": 0.441245436668396,
|
|
"learning_rate": 9.602643066150235e-06,
|
|
"loss": 0.2515,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.3442188879082083,
|
|
"grad_norm": 0.4743674397468567,
|
|
"learning_rate": 9.599071697126608e-06,
|
|
"loss": 0.2541,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.34532215357458074,
|
|
"grad_norm": 0.5153236389160156,
|
|
"learning_rate": 9.595485020629676e-06,
|
|
"loss": 0.2578,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.3464254192409532,
|
|
"grad_norm": 0.4311087131500244,
|
|
"learning_rate": 9.591883048597273e-06,
|
|
"loss": 0.2548,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.3475286849073257,
|
|
"grad_norm": 0.494365930557251,
|
|
"learning_rate": 9.588265793018141e-06,
|
|
"loss": 0.256,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.34863195057369817,
|
|
"grad_norm": 0.426740825176239,
|
|
"learning_rate": 9.584633265931894e-06,
|
|
"loss": 0.2547,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.3497352162400706,
|
|
"grad_norm": 0.4335707426071167,
|
|
"learning_rate": 9.580985479428975e-06,
|
|
"loss": 0.241,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.35083848190644307,
|
|
"grad_norm": 0.47340667247772217,
|
|
"learning_rate": 9.577322445650616e-06,
|
|
"loss": 0.2437,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.35194174757281554,
|
|
"grad_norm": 0.48211535811424255,
|
|
"learning_rate": 9.573644176788795e-06,
|
|
"loss": 0.238,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.353045013239188,
|
|
"grad_norm": 0.515032172203064,
|
|
"learning_rate": 9.569950685086202e-06,
|
|
"loss": 0.2646,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.353045013239188,
|
|
"eval_loss": 0.2541050612926483,
|
|
"eval_runtime": 272.7736,
|
|
"eval_samples_per_second": 55.955,
|
|
"eval_steps_per_second": 6.995,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.35414827890556044,
|
|
"grad_norm": 0.4244433641433716,
|
|
"learning_rate": 9.566241982836193e-06,
|
|
"loss": 0.2487,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.3552515445719329,
|
|
"grad_norm": 0.4410102367401123,
|
|
"learning_rate": 9.562518082382751e-06,
|
|
"loss": 0.2385,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.3563548102383054,
|
|
"grad_norm": 0.5115966200828552,
|
|
"learning_rate": 9.558778996120443e-06,
|
|
"loss": 0.2484,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.3574580759046778,
|
|
"grad_norm": 0.4943847954273224,
|
|
"learning_rate": 9.555024736494382e-06,
|
|
"loss": 0.2575,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.3585613415710503,
|
|
"grad_norm": 0.4769156277179718,
|
|
"learning_rate": 9.551255316000183e-06,
|
|
"loss": 0.2432,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.3596646072374228,
|
|
"grad_norm": 0.43486344814300537,
|
|
"learning_rate": 9.54747074718392e-06,
|
|
"loss": 0.2594,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.36076787290379525,
|
|
"grad_norm": 0.45673149824142456,
|
|
"learning_rate": 9.54367104264209e-06,
|
|
"loss": 0.2513,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.36187113857016767,
|
|
"grad_norm": 0.48159259557724,
|
|
"learning_rate": 9.539856215021568e-06,
|
|
"loss": 0.2467,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.36297440423654015,
|
|
"grad_norm": 0.4502279460430145,
|
|
"learning_rate": 9.536026277019562e-06,
|
|
"loss": 0.2485,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.3640776699029126,
|
|
"grad_norm": 0.5324723124504089,
|
|
"learning_rate": 9.53218124138357e-06,
|
|
"loss": 0.2417,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.3651809355692851,
|
|
"grad_norm": 0.48323342204093933,
|
|
"learning_rate": 9.528321120911345e-06,
|
|
"loss": 0.253,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.3662842012356575,
|
|
"grad_norm": 0.5192784667015076,
|
|
"learning_rate": 9.524445928450851e-06,
|
|
"loss": 0.2301,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.36738746690203,
|
|
"grad_norm": 0.5197545886039734,
|
|
"learning_rate": 9.520555676900214e-06,
|
|
"loss": 0.2443,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.3684907325684025,
|
|
"grad_norm": 0.45566871762275696,
|
|
"learning_rate": 9.516650379207677e-06,
|
|
"loss": 0.2447,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.36959399823477496,
|
|
"grad_norm": 0.5340574383735657,
|
|
"learning_rate": 9.51273004837157e-06,
|
|
"loss": 0.2477,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.3706972639011474,
|
|
"grad_norm": 0.4383482336997986,
|
|
"learning_rate": 9.508794697440257e-06,
|
|
"loss": 0.2335,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.37180052956751986,
|
|
"grad_norm": 0.5311030745506287,
|
|
"learning_rate": 9.504844339512096e-06,
|
|
"loss": 0.2474,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.37290379523389233,
|
|
"grad_norm": 0.5349487662315369,
|
|
"learning_rate": 9.50087898773539e-06,
|
|
"loss": 0.2625,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.3740070609002648,
|
|
"grad_norm": 0.42293423414230347,
|
|
"learning_rate": 9.49689865530835e-06,
|
|
"loss": 0.2428,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.37511032656663723,
|
|
"grad_norm": 0.4599260985851288,
|
|
"learning_rate": 9.492903355479047e-06,
|
|
"loss": 0.2497,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.37511032656663723,
|
|
"eval_loss": 0.25350308418273926,
|
|
"eval_runtime": 270.259,
|
|
"eval_samples_per_second": 56.475,
|
|
"eval_steps_per_second": 7.06,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.3762135922330097,
|
|
"grad_norm": 0.46413329243659973,
|
|
"learning_rate": 9.488893101545372e-06,
|
|
"loss": 0.2409,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.3773168578993822,
|
|
"grad_norm": 0.45214733481407166,
|
|
"learning_rate": 9.484867906854986e-06,
|
|
"loss": 0.2427,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.3784201235657546,
|
|
"grad_norm": 0.49880728125572205,
|
|
"learning_rate": 9.480827784805278e-06,
|
|
"loss": 0.2404,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.3795233892321271,
|
|
"grad_norm": 0.516257107257843,
|
|
"learning_rate": 9.476772748843327e-06,
|
|
"loss": 0.2531,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.38062665489849956,
|
|
"grad_norm": 0.4441586434841156,
|
|
"learning_rate": 9.472702812465843e-06,
|
|
"loss": 0.2339,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.38172992056487204,
|
|
"grad_norm": 0.4590930938720703,
|
|
"learning_rate": 9.468617989219136e-06,
|
|
"loss": 0.2465,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.38283318623124446,
|
|
"grad_norm": 0.43926405906677246,
|
|
"learning_rate": 9.46451829269906e-06,
|
|
"loss": 0.2475,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.38393645189761694,
|
|
"grad_norm": 0.4270091950893402,
|
|
"learning_rate": 9.460403736550982e-06,
|
|
"loss": 0.2404,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.3850397175639894,
|
|
"grad_norm": 0.4161515235900879,
|
|
"learning_rate": 9.45627433446972e-06,
|
|
"loss": 0.2428,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.3861429832303619,
|
|
"grad_norm": 0.4878949820995331,
|
|
"learning_rate": 9.452130100199504e-06,
|
|
"loss": 0.2636,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.3872462488967343,
|
|
"grad_norm": 0.4900050759315491,
|
|
"learning_rate": 9.447971047533936e-06,
|
|
"loss": 0.2415,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.3883495145631068,
|
|
"grad_norm": 0.43371209502220154,
|
|
"learning_rate": 9.443797190315938e-06,
|
|
"loss": 0.2469,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.38945278022947927,
|
|
"grad_norm": 0.43596795201301575,
|
|
"learning_rate": 9.439608542437704e-06,
|
|
"loss": 0.2394,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.39055604589585174,
|
|
"grad_norm": 0.4555245637893677,
|
|
"learning_rate": 9.435405117840662e-06,
|
|
"loss": 0.2435,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.39165931156222417,
|
|
"grad_norm": 0.46150678396224976,
|
|
"learning_rate": 9.431186930515419e-06,
|
|
"loss": 0.2585,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.39276257722859664,
|
|
"grad_norm": 0.42505866289138794,
|
|
"learning_rate": 9.42695399450172e-06,
|
|
"loss": 0.2386,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.3938658428949691,
|
|
"grad_norm": 0.49516651034355164,
|
|
"learning_rate": 9.422706323888398e-06,
|
|
"loss": 0.235,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.3949691085613416,
|
|
"grad_norm": 0.48143908381462097,
|
|
"learning_rate": 9.418443932813328e-06,
|
|
"loss": 0.2495,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.396072374227714,
|
|
"grad_norm": 0.5001795887947083,
|
|
"learning_rate": 9.414166835463383e-06,
|
|
"loss": 0.247,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.3971756398940865,
|
|
"grad_norm": 0.47970953583717346,
|
|
"learning_rate": 9.409875046074379e-06,
|
|
"loss": 0.2486,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.3971756398940865,
|
|
"eval_loss": 0.2526043653488159,
|
|
"eval_runtime": 269.6648,
|
|
"eval_samples_per_second": 56.6,
|
|
"eval_steps_per_second": 7.075,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.398278905560459,
|
|
"grad_norm": 0.5398975610733032,
|
|
"learning_rate": 9.405568578931042e-06,
|
|
"loss": 0.257,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.3993821712268314,
|
|
"grad_norm": 0.4145001769065857,
|
|
"learning_rate": 9.401247448366937e-06,
|
|
"loss": 0.2305,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.40048543689320387,
|
|
"grad_norm": 0.49223729968070984,
|
|
"learning_rate": 9.39691166876445e-06,
|
|
"loss": 0.2385,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.40158870255957635,
|
|
"grad_norm": 0.5020371675491333,
|
|
"learning_rate": 9.392561254554712e-06,
|
|
"loss": 0.2507,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.4026919682259488,
|
|
"grad_norm": 0.4438912868499756,
|
|
"learning_rate": 9.388196220217574e-06,
|
|
"loss": 0.2442,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.40379523389232125,
|
|
"grad_norm": 0.5784342288970947,
|
|
"learning_rate": 9.383816580281539e-06,
|
|
"loss": 0.2434,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.4048984995586937,
|
|
"grad_norm": 0.4573621451854706,
|
|
"learning_rate": 9.379422349323728e-06,
|
|
"loss": 0.2348,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.4060017652250662,
|
|
"grad_norm": 0.5133495926856995,
|
|
"learning_rate": 9.375013541969828e-06,
|
|
"loss": 0.2474,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.4071050308914387,
|
|
"grad_norm": 0.5082767605781555,
|
|
"learning_rate": 9.370590172894037e-06,
|
|
"loss": 0.2424,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.4082082965578111,
|
|
"grad_norm": 0.41318631172180176,
|
|
"learning_rate": 9.366152256819025e-06,
|
|
"loss": 0.2459,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.4093115622241836,
|
|
"grad_norm": 0.48783794045448303,
|
|
"learning_rate": 9.361699808515877e-06,
|
|
"loss": 0.2332,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.41041482789055606,
|
|
"grad_norm": 0.46912387013435364,
|
|
"learning_rate": 9.357232842804045e-06,
|
|
"loss": 0.2362,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.41151809355692853,
|
|
"grad_norm": 0.5062457323074341,
|
|
"learning_rate": 9.352751374551305e-06,
|
|
"loss": 0.2479,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.41262135922330095,
|
|
"grad_norm": 0.45189908146858215,
|
|
"learning_rate": 9.348255418673702e-06,
|
|
"loss": 0.2597,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.41372462488967343,
|
|
"grad_norm": 0.43714070320129395,
|
|
"learning_rate": 9.3437449901355e-06,
|
|
"loss": 0.2447,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.4148278905560459,
|
|
"grad_norm": 0.44575101137161255,
|
|
"learning_rate": 9.339220103949132e-06,
|
|
"loss": 0.2572,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.4159311562224184,
|
|
"grad_norm": 0.4869813024997711,
|
|
"learning_rate": 9.334680775175154e-06,
|
|
"loss": 0.2469,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.4170344218887908,
|
|
"grad_norm": 0.4805983901023865,
|
|
"learning_rate": 9.330127018922195e-06,
|
|
"loss": 0.2427,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.4181376875551633,
|
|
"grad_norm": 0.47126686573028564,
|
|
"learning_rate": 9.325558850346897e-06,
|
|
"loss": 0.2448,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.41924095322153576,
|
|
"grad_norm": 0.5163640975952148,
|
|
"learning_rate": 9.320976284653877e-06,
|
|
"loss": 0.2289,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.41924095322153576,
|
|
"eval_loss": 0.25212275981903076,
|
|
"eval_runtime": 270.6499,
|
|
"eval_samples_per_second": 56.394,
|
|
"eval_steps_per_second": 7.05,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.4203442188879082,
|
|
"grad_norm": 0.46562832593917847,
|
|
"learning_rate": 9.316379337095671e-06,
|
|
"loss": 0.255,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.42144748455428066,
|
|
"grad_norm": 0.3981192708015442,
|
|
"learning_rate": 9.311768022972682e-06,
|
|
"loss": 0.2455,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.42255075022065314,
|
|
"grad_norm": 0.4480000138282776,
|
|
"learning_rate": 9.307142357633132e-06,
|
|
"loss": 0.2437,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.4236540158870256,
|
|
"grad_norm": 0.4353036880493164,
|
|
"learning_rate": 9.302502356473006e-06,
|
|
"loss": 0.2435,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.42475728155339804,
|
|
"grad_norm": 0.42388132214546204,
|
|
"learning_rate": 9.297848034936007e-06,
|
|
"loss": 0.2458,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.4258605472197705,
|
|
"grad_norm": 0.5140712261199951,
|
|
"learning_rate": 9.293179408513501e-06,
|
|
"loss": 0.2469,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.426963812886143,
|
|
"grad_norm": 0.5060368180274963,
|
|
"learning_rate": 9.288496492744466e-06,
|
|
"loss": 0.2499,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.42806707855251547,
|
|
"grad_norm": 0.43134334683418274,
|
|
"learning_rate": 9.283799303215442e-06,
|
|
"loss": 0.233,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.4291703442188879,
|
|
"grad_norm": 0.48315203189849854,
|
|
"learning_rate": 9.279087855560474e-06,
|
|
"loss": 0.2457,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.43027360988526037,
|
|
"grad_norm": 0.4424877166748047,
|
|
"learning_rate": 9.274362165461064e-06,
|
|
"loss": 0.2402,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.43137687555163284,
|
|
"grad_norm": 0.47104790806770325,
|
|
"learning_rate": 9.269622248646124e-06,
|
|
"loss": 0.2419,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.4324801412180053,
|
|
"grad_norm": 0.4866120517253876,
|
|
"learning_rate": 9.264868120891913e-06,
|
|
"loss": 0.2428,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.43358340688437774,
|
|
"grad_norm": 0.47548824548721313,
|
|
"learning_rate": 9.260099798021988e-06,
|
|
"loss": 0.2355,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 0.4346866725507502,
|
|
"grad_norm": 0.4570111930370331,
|
|
"learning_rate": 9.255317295907158e-06,
|
|
"loss": 0.2509,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.4357899382171227,
|
|
"grad_norm": 0.5114912986755371,
|
|
"learning_rate": 9.250520630465419e-06,
|
|
"loss": 0.2409,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 0.4368932038834951,
|
|
"grad_norm": 0.38849082589149475,
|
|
"learning_rate": 9.245709817661917e-06,
|
|
"loss": 0.2413,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.4379964695498676,
|
|
"grad_norm": 0.5250911712646484,
|
|
"learning_rate": 9.240884873508876e-06,
|
|
"loss": 0.2416,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 0.4390997352162401,
|
|
"grad_norm": 0.43927446007728577,
|
|
"learning_rate": 9.236045814065563e-06,
|
|
"loss": 0.2399,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.44020300088261255,
|
|
"grad_norm": 0.5229560136795044,
|
|
"learning_rate": 9.231192655438222e-06,
|
|
"loss": 0.2536,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 0.44130626654898497,
|
|
"grad_norm": 0.5083780884742737,
|
|
"learning_rate": 9.226325413780021e-06,
|
|
"loss": 0.2324,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.44130626654898497,
|
|
"eval_loss": 0.25146690011024475,
|
|
"eval_runtime": 270.7254,
|
|
"eval_samples_per_second": 56.378,
|
|
"eval_steps_per_second": 7.048,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.44240953221535745,
|
|
"grad_norm": 0.4707069396972656,
|
|
"learning_rate": 9.221444105291013e-06,
|
|
"loss": 0.2594,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 0.4435127978817299,
|
|
"grad_norm": 0.4856661260128021,
|
|
"learning_rate": 9.216548746218056e-06,
|
|
"loss": 0.2493,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.4446160635481024,
|
|
"grad_norm": 0.4883829951286316,
|
|
"learning_rate": 9.211639352854786e-06,
|
|
"loss": 0.2468,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 0.4457193292144748,
|
|
"grad_norm": 0.4665009379386902,
|
|
"learning_rate": 9.206715941541547e-06,
|
|
"loss": 0.2519,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.4468225948808473,
|
|
"grad_norm": 0.45250222086906433,
|
|
"learning_rate": 9.201778528665333e-06,
|
|
"loss": 0.2436,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 0.4479258605472198,
|
|
"grad_norm": 0.457640677690506,
|
|
"learning_rate": 9.196827130659752e-06,
|
|
"loss": 0.2575,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.44902912621359226,
|
|
"grad_norm": 0.3947480320930481,
|
|
"learning_rate": 9.19186176400495e-06,
|
|
"loss": 0.2521,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 0.4501323918799647,
|
|
"grad_norm": 0.5039179921150208,
|
|
"learning_rate": 9.186882445227572e-06,
|
|
"loss": 0.2464,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.45123565754633715,
|
|
"grad_norm": 0.46842432022094727,
|
|
"learning_rate": 9.181889190900702e-06,
|
|
"loss": 0.2603,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 0.45233892321270963,
|
|
"grad_norm": 0.40255504846572876,
|
|
"learning_rate": 9.1768820176438e-06,
|
|
"loss": 0.242,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.4534421888790821,
|
|
"grad_norm": 0.4471176564693451,
|
|
"learning_rate": 9.17186094212266e-06,
|
|
"loss": 0.2349,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 0.4603224992752075,
|
|
"learning_rate": 9.166825981049345e-06,
|
|
"loss": 0.2449,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.455648720211827,
|
|
"grad_norm": 0.4398849308490753,
|
|
"learning_rate": 9.161777151182137e-06,
|
|
"loss": 0.2346,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 0.4567519858781995,
|
|
"grad_norm": 0.54154372215271,
|
|
"learning_rate": 9.156714469325474e-06,
|
|
"loss": 0.2469,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.4578552515445719,
|
|
"grad_norm": 0.5123459696769714,
|
|
"learning_rate": 9.151637952329903e-06,
|
|
"loss": 0.253,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 0.4589585172109444,
|
|
"grad_norm": 0.40907183289527893,
|
|
"learning_rate": 9.14654761709202e-06,
|
|
"loss": 0.2467,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.46006178287731686,
|
|
"grad_norm": 0.47456130385398865,
|
|
"learning_rate": 9.141443480554408e-06,
|
|
"loss": 0.2492,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 0.46116504854368934,
|
|
"grad_norm": 0.4863825738430023,
|
|
"learning_rate": 9.136325559705593e-06,
|
|
"loss": 0.2416,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.46226831421006176,
|
|
"grad_norm": 0.41943055391311646,
|
|
"learning_rate": 9.131193871579975e-06,
|
|
"loss": 0.2352,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 0.46337157987643424,
|
|
"grad_norm": 0.49511662125587463,
|
|
"learning_rate": 9.12604843325778e-06,
|
|
"loss": 0.2425,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.46337157987643424,
|
|
"eval_loss": 0.25082722306251526,
|
|
"eval_runtime": 268.229,
|
|
"eval_samples_per_second": 56.903,
|
|
"eval_steps_per_second": 7.113,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.4644748455428067,
|
|
"grad_norm": 0.4573848247528076,
|
|
"learning_rate": 9.120889261864999e-06,
|
|
"loss": 0.2586,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 0.4655781112091792,
|
|
"grad_norm": 0.48330816626548767,
|
|
"learning_rate": 9.11571637457333e-06,
|
|
"loss": 0.2471,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.4666813768755516,
|
|
"grad_norm": 0.5316815376281738,
|
|
"learning_rate": 9.110529788600127e-06,
|
|
"loss": 0.2398,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 0.4677846425419241,
|
|
"grad_norm": 0.45236679911613464,
|
|
"learning_rate": 9.105329521208334e-06,
|
|
"loss": 0.2471,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.46888790820829657,
|
|
"grad_norm": 0.4722297787666321,
|
|
"learning_rate": 9.100115589706436e-06,
|
|
"loss": 0.2428,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 0.46999117387466904,
|
|
"grad_norm": 0.4543675482273102,
|
|
"learning_rate": 9.094888011448391e-06,
|
|
"loss": 0.2516,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.47109443954104147,
|
|
"grad_norm": 0.4152880907058716,
|
|
"learning_rate": 9.089646803833589e-06,
|
|
"loss": 0.225,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 0.47219770520741394,
|
|
"grad_norm": 0.44709253311157227,
|
|
"learning_rate": 9.084391984306775e-06,
|
|
"loss": 0.2456,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.4733009708737864,
|
|
"grad_norm": 0.5279027819633484,
|
|
"learning_rate": 9.079123570358e-06,
|
|
"loss": 0.2415,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 0.4744042365401589,
|
|
"grad_norm": 0.4792356491088867,
|
|
"learning_rate": 9.073841579522571e-06,
|
|
"loss": 0.2543,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.4755075022065313,
|
|
"grad_norm": 0.45700347423553467,
|
|
"learning_rate": 9.068546029380971e-06,
|
|
"loss": 0.2593,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 0.4766107678729038,
|
|
"grad_norm": 0.5032837986946106,
|
|
"learning_rate": 9.063236937558826e-06,
|
|
"loss": 0.2528,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.4777140335392763,
|
|
"grad_norm": 0.48134273290634155,
|
|
"learning_rate": 9.057914321726824e-06,
|
|
"loss": 0.2553,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 0.4788172992056487,
|
|
"grad_norm": 0.45645344257354736,
|
|
"learning_rate": 9.052578199600675e-06,
|
|
"loss": 0.2387,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.47992056487202117,
|
|
"grad_norm": 0.4026988744735718,
|
|
"learning_rate": 9.047228588941034e-06,
|
|
"loss": 0.228,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 0.48102383053839365,
|
|
"grad_norm": 0.4304678440093994,
|
|
"learning_rate": 9.041865507553458e-06,
|
|
"loss": 0.2513,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.4821270962047661,
|
|
"grad_norm": 0.4108814299106598,
|
|
"learning_rate": 9.036488973288339e-06,
|
|
"loss": 0.238,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 0.48323036187113855,
|
|
"grad_norm": 0.4733142852783203,
|
|
"learning_rate": 9.031099004040841e-06,
|
|
"loss": 0.2506,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.484333627537511,
|
|
"grad_norm": 0.4324648380279541,
|
|
"learning_rate": 9.025695617750848e-06,
|
|
"loss": 0.2243,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 0.4854368932038835,
|
|
"grad_norm": 0.4637969136238098,
|
|
"learning_rate": 9.020278832402902e-06,
|
|
"loss": 0.2545,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.4854368932038835,
|
|
"eval_loss": 0.25019994378089905,
|
|
"eval_runtime": 271.9518,
|
|
"eval_samples_per_second": 56.124,
|
|
"eval_steps_per_second": 7.016,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.486540158870256,
|
|
"grad_norm": 0.4881959855556488,
|
|
"learning_rate": 9.014848666026138e-06,
|
|
"loss": 0.2467,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 0.4876434245366284,
|
|
"grad_norm": 0.44622328877449036,
|
|
"learning_rate": 9.009405136694234e-06,
|
|
"loss": 0.2512,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.4887466902030009,
|
|
"grad_norm": 0.4718511700630188,
|
|
"learning_rate": 9.003948262525341e-06,
|
|
"loss": 0.247,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 0.48984995586937335,
|
|
"grad_norm": 0.4678577780723572,
|
|
"learning_rate": 8.998478061682025e-06,
|
|
"loss": 0.2301,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.49095322153574583,
|
|
"grad_norm": 0.44579043984413147,
|
|
"learning_rate": 8.992994552371217e-06,
|
|
"loss": 0.2513,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 0.49205648720211825,
|
|
"grad_norm": 0.4663316309452057,
|
|
"learning_rate": 8.987497752844132e-06,
|
|
"loss": 0.2568,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.49315975286849073,
|
|
"grad_norm": 0.5495996475219727,
|
|
"learning_rate": 8.981987681396226e-06,
|
|
"loss": 0.2626,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 0.4942630185348632,
|
|
"grad_norm": 0.4634556174278259,
|
|
"learning_rate": 8.976464356367133e-06,
|
|
"loss": 0.2523,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.4953662842012357,
|
|
"grad_norm": 0.4804018437862396,
|
|
"learning_rate": 8.970927796140592e-06,
|
|
"loss": 0.2323,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 0.4964695498676081,
|
|
"grad_norm": 0.4309288561344147,
|
|
"learning_rate": 8.965378019144397e-06,
|
|
"loss": 0.2432,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.4975728155339806,
|
|
"grad_norm": 0.4046125113964081,
|
|
"learning_rate": 8.959815043850336e-06,
|
|
"loss": 0.228,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 0.49867608120035306,
|
|
"grad_norm": 0.4361225366592407,
|
|
"learning_rate": 8.95423888877412e-06,
|
|
"loss": 0.2398,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.4997793468667255,
|
|
"grad_norm": 0.4338124096393585,
|
|
"learning_rate": 8.948649572475332e-06,
|
|
"loss": 0.2471,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 0.500882612533098,
|
|
"grad_norm": 0.4460661709308624,
|
|
"learning_rate": 8.943047113557358e-06,
|
|
"loss": 0.2525,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.5019858781994704,
|
|
"grad_norm": 0.43851226568222046,
|
|
"learning_rate": 8.937431530667329e-06,
|
|
"loss": 0.2412,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 0.5030891438658429,
|
|
"grad_norm": 0.4404292702674866,
|
|
"learning_rate": 8.931802842496056e-06,
|
|
"loss": 0.2467,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.5041924095322153,
|
|
"grad_norm": 0.5048953890800476,
|
|
"learning_rate": 8.926161067777973e-06,
|
|
"loss": 0.2503,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 0.5052956751985879,
|
|
"grad_norm": 0.48402130603790283,
|
|
"learning_rate": 8.920506225291067e-06,
|
|
"loss": 0.2441,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.5063989408649603,
|
|
"grad_norm": 0.409432977437973,
|
|
"learning_rate": 8.914838333856822e-06,
|
|
"loss": 0.2388,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 0.5075022065313327,
|
|
"grad_norm": 0.41323322057724,
|
|
"learning_rate": 8.90915741234015e-06,
|
|
"loss": 0.2372,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.5075022065313327,
|
|
"eval_loss": 0.24960678815841675,
|
|
"eval_runtime": 272.2362,
|
|
"eval_samples_per_second": 56.065,
|
|
"eval_steps_per_second": 7.009,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.5086054721977052,
|
|
"grad_norm": 0.5443533062934875,
|
|
"learning_rate": 8.90346347964934e-06,
|
|
"loss": 0.2311,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 0.5097087378640777,
|
|
"grad_norm": 0.42860737442970276,
|
|
"learning_rate": 8.897756554735976e-06,
|
|
"loss": 0.2537,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.5108120035304501,
|
|
"grad_norm": 0.4304381012916565,
|
|
"learning_rate": 8.892036656594898e-06,
|
|
"loss": 0.2366,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 0.5119152691968226,
|
|
"grad_norm": 0.5057708024978638,
|
|
"learning_rate": 8.886303804264117e-06,
|
|
"loss": 0.2362,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.513018534863195,
|
|
"grad_norm": 0.48017001152038574,
|
|
"learning_rate": 8.88055801682476e-06,
|
|
"loss": 0.2493,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 0.5141218005295676,
|
|
"grad_norm": 0.441488653421402,
|
|
"learning_rate": 8.874799313401014e-06,
|
|
"loss": 0.2413,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.51522506619594,
|
|
"grad_norm": 0.5098276138305664,
|
|
"learning_rate": 8.86902771316005e-06,
|
|
"loss": 0.2496,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 0.5163283318623124,
|
|
"grad_norm": 0.43526649475097656,
|
|
"learning_rate": 8.863243235311964e-06,
|
|
"loss": 0.2452,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.517431597528685,
|
|
"grad_norm": 0.48061615228652954,
|
|
"learning_rate": 8.857445899109716e-06,
|
|
"loss": 0.2521,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 0.5185348631950574,
|
|
"grad_norm": 0.425627201795578,
|
|
"learning_rate": 8.851635723849062e-06,
|
|
"loss": 0.251,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.5196381288614298,
|
|
"grad_norm": 0.39612120389938354,
|
|
"learning_rate": 8.845812728868496e-06,
|
|
"loss": 0.2366,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 0.5207413945278023,
|
|
"grad_norm": 0.43580201268196106,
|
|
"learning_rate": 8.839976933549173e-06,
|
|
"loss": 0.2501,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.5218446601941747,
|
|
"grad_norm": 0.3925994038581848,
|
|
"learning_rate": 8.834128357314856e-06,
|
|
"loss": 0.2356,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 0.5229479258605472,
|
|
"grad_norm": 0.4675627648830414,
|
|
"learning_rate": 8.828267019631852e-06,
|
|
"loss": 0.2439,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.5240511915269197,
|
|
"grad_norm": 0.5115921497344971,
|
|
"learning_rate": 8.822392940008937e-06,
|
|
"loss": 0.2434,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 0.5251544571932921,
|
|
"grad_norm": 0.5380107760429382,
|
|
"learning_rate": 8.8165061379973e-06,
|
|
"loss": 0.2476,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.5262577228596647,
|
|
"grad_norm": 0.541187047958374,
|
|
"learning_rate": 8.810606633190475e-06,
|
|
"loss": 0.2397,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 0.5273609885260371,
|
|
"grad_norm": 0.49486243724823,
|
|
"learning_rate": 8.804694445224274e-06,
|
|
"loss": 0.2548,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.5284642541924095,
|
|
"grad_norm": 0.5872311592102051,
|
|
"learning_rate": 8.798769593776723e-06,
|
|
"loss": 0.239,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 0.529567519858782,
|
|
"grad_norm": 0.48262667655944824,
|
|
"learning_rate": 8.792832098568002e-06,
|
|
"loss": 0.2328,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.529567519858782,
|
|
"eval_loss": 0.24928364157676697,
|
|
"eval_runtime": 271.3099,
|
|
"eval_samples_per_second": 56.257,
|
|
"eval_steps_per_second": 7.033,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.5306707855251545,
|
|
"grad_norm": 0.40170180797576904,
|
|
"learning_rate": 8.786881979360368e-06,
|
|
"loss": 0.2564,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 0.5317740511915269,
|
|
"grad_norm": 0.44170036911964417,
|
|
"learning_rate": 8.7809192559581e-06,
|
|
"loss": 0.2413,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.5328773168578994,
|
|
"grad_norm": 0.4831240177154541,
|
|
"learning_rate": 8.774943948207427e-06,
|
|
"loss": 0.2391,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 0.5339805825242718,
|
|
"grad_norm": 0.39944949746131897,
|
|
"learning_rate": 8.76895607599646e-06,
|
|
"loss": 0.2361,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.5350838481906444,
|
|
"grad_norm": 0.4743267595767975,
|
|
"learning_rate": 8.762955659255137e-06,
|
|
"loss": 0.2516,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 0.5361871138570168,
|
|
"grad_norm": 0.4756656289100647,
|
|
"learning_rate": 8.756942717955142e-06,
|
|
"loss": 0.2565,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.5372903795233892,
|
|
"grad_norm": 0.45802560448646545,
|
|
"learning_rate": 8.750917272109849e-06,
|
|
"loss": 0.2386,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 0.5383936451897617,
|
|
"grad_norm": 0.45499125123023987,
|
|
"learning_rate": 8.744879341774251e-06,
|
|
"loss": 0.2397,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.5394969108561342,
|
|
"grad_norm": 0.3336021900177002,
|
|
"learning_rate": 8.738828947044895e-06,
|
|
"loss": 0.236,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 0.5406001765225066,
|
|
"grad_norm": 0.4355071485042572,
|
|
"learning_rate": 8.732766108059814e-06,
|
|
"loss": 0.2363,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.5417034421888791,
|
|
"grad_norm": 0.4942583441734314,
|
|
"learning_rate": 8.726690844998457e-06,
|
|
"loss": 0.2301,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 0.5428067078552515,
|
|
"grad_norm": 0.4470270574092865,
|
|
"learning_rate": 8.720603178081632e-06,
|
|
"loss": 0.2357,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.543909973521624,
|
|
"grad_norm": 0.4445681571960449,
|
|
"learning_rate": 8.714503127571425e-06,
|
|
"loss": 0.2558,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 0.5450132391879965,
|
|
"grad_norm": 0.5611670613288879,
|
|
"learning_rate": 8.708390713771145e-06,
|
|
"loss": 0.2444,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.5461165048543689,
|
|
"grad_norm": 0.37664106488227844,
|
|
"learning_rate": 8.702265957025241e-06,
|
|
"loss": 0.2511,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 0.5472197705207414,
|
|
"grad_norm": 0.5034042596817017,
|
|
"learning_rate": 8.696128877719258e-06,
|
|
"loss": 0.2483,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.5483230361871139,
|
|
"grad_norm": 0.4550327956676483,
|
|
"learning_rate": 8.689979496279747e-06,
|
|
"loss": 0.2404,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 0.5494263018534863,
|
|
"grad_norm": 0.4192756712436676,
|
|
"learning_rate": 8.683817833174204e-06,
|
|
"loss": 0.2272,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.5505295675198588,
|
|
"grad_norm": 0.49941694736480713,
|
|
"learning_rate": 8.677643908911007e-06,
|
|
"loss": 0.2461,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 0.5516328331862312,
|
|
"grad_norm": 0.48723432421684265,
|
|
"learning_rate": 8.67145774403934e-06,
|
|
"loss": 0.2359,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.5516328331862312,
|
|
"eval_loss": 0.24847546219825745,
|
|
"eval_runtime": 268.7693,
|
|
"eval_samples_per_second": 56.788,
|
|
"eval_steps_per_second": 7.099,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.5527360988526037,
|
|
"grad_norm": 0.4447115957736969,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.244,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 0.5538393645189762,
|
|
"grad_norm": 0.46144431829452515,
|
|
"learning_rate": 8.659048774870986e-06,
|
|
"loss": 0.2509,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.5549426301853486,
|
|
"grad_norm": 0.41772812604904175,
|
|
"learning_rate": 8.652826011876104e-06,
|
|
"loss": 0.2422,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 0.556045895851721,
|
|
"grad_norm": 0.45326176285743713,
|
|
"learning_rate": 8.646591090876225e-06,
|
|
"loss": 0.241,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.5571491615180936,
|
|
"grad_norm": 0.4441646337509155,
|
|
"learning_rate": 8.64034403262356e-06,
|
|
"loss": 0.2445,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 0.558252427184466,
|
|
"grad_norm": 0.5038093328475952,
|
|
"learning_rate": 8.634084857910709e-06,
|
|
"loss": 0.2478,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.5593556928508385,
|
|
"grad_norm": 0.4078108072280884,
|
|
"learning_rate": 8.627813587570609e-06,
|
|
"loss": 0.255,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 0.560458958517211,
|
|
"grad_norm": 0.4333765506744385,
|
|
"learning_rate": 8.621530242476446e-06,
|
|
"loss": 0.2438,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.5615622241835834,
|
|
"grad_norm": 0.5008811354637146,
|
|
"learning_rate": 8.615234843541606e-06,
|
|
"loss": 0.2388,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 0.5626654898499559,
|
|
"grad_norm": 0.3749838173389435,
|
|
"learning_rate": 8.608927411719585e-06,
|
|
"loss": 0.2422,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.5637687555163283,
|
|
"grad_norm": 0.5079739093780518,
|
|
"learning_rate": 8.602607968003935e-06,
|
|
"loss": 0.2367,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 0.5648720211827007,
|
|
"grad_norm": 0.40866804122924805,
|
|
"learning_rate": 8.59627653342819e-06,
|
|
"loss": 0.2598,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.5659752868490733,
|
|
"grad_norm": 0.3951939642429352,
|
|
"learning_rate": 8.589933129065786e-06,
|
|
"loss": 0.2316,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 0.5670785525154457,
|
|
"grad_norm": 0.41789600253105164,
|
|
"learning_rate": 8.583577776030005e-06,
|
|
"loss": 0.2412,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.5681818181818182,
|
|
"grad_norm": 0.5892974138259888,
|
|
"learning_rate": 8.5772104954739e-06,
|
|
"loss": 0.2441,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 0.5692850838481907,
|
|
"grad_norm": 0.46684080362319946,
|
|
"learning_rate": 8.570831308590219e-06,
|
|
"loss": 0.2437,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.5703883495145631,
|
|
"grad_norm": 0.5170934796333313,
|
|
"learning_rate": 8.564440236611344e-06,
|
|
"loss": 0.2436,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 0.5714916151809356,
|
|
"grad_norm": 0.5239847302436829,
|
|
"learning_rate": 8.558037300809209e-06,
|
|
"loss": 0.2458,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.572594880847308,
|
|
"grad_norm": 0.4109562933444977,
|
|
"learning_rate": 8.551622522495238e-06,
|
|
"loss": 0.2492,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 0.5736981465136805,
|
|
"grad_norm": 0.40857604146003723,
|
|
"learning_rate": 8.545195923020273e-06,
|
|
"loss": 0.24,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.5736981465136805,
|
|
"eval_loss": 0.2479788064956665,
|
|
"eval_runtime": 270.4896,
|
|
"eval_samples_per_second": 56.427,
|
|
"eval_steps_per_second": 7.054,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.574801412180053,
|
|
"grad_norm": 0.47509685158729553,
|
|
"learning_rate": 8.538757523774503e-06,
|
|
"loss": 0.2835,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 0.5759046778464254,
|
|
"grad_norm": 0.47140100598335266,
|
|
"learning_rate": 8.532307346187384e-06,
|
|
"loss": 0.2372,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.5770079435127978,
|
|
"grad_norm": 0.4311586618423462,
|
|
"learning_rate": 8.525845411727581e-06,
|
|
"loss": 0.2446,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 0.5781112091791704,
|
|
"grad_norm": 0.44329634308815,
|
|
"learning_rate": 8.519371741902888e-06,
|
|
"loss": 0.2419,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.5792144748455428,
|
|
"grad_norm": 0.4547870457172394,
|
|
"learning_rate": 8.512886358260162e-06,
|
|
"loss": 0.2398,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 0.5803177405119153,
|
|
"grad_norm": 0.47059762477874756,
|
|
"learning_rate": 8.506389282385242e-06,
|
|
"loss": 0.2512,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.5814210061782877,
|
|
"grad_norm": 0.49984219670295715,
|
|
"learning_rate": 8.499880535902885e-06,
|
|
"loss": 0.2575,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 0.5825242718446602,
|
|
"grad_norm": 0.4017459452152252,
|
|
"learning_rate": 8.493360140476699e-06,
|
|
"loss": 0.2352,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.5836275375110327,
|
|
"grad_norm": 0.3862455487251282,
|
|
"learning_rate": 8.486828117809057e-06,
|
|
"loss": 0.2317,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 0.5847308031774051,
|
|
"grad_norm": 0.4500201642513275,
|
|
"learning_rate": 8.480284489641034e-06,
|
|
"loss": 0.2385,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.5858340688437775,
|
|
"grad_norm": 0.5192285776138306,
|
|
"learning_rate": 8.473729277752331e-06,
|
|
"loss": 0.2426,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 0.5869373345101501,
|
|
"grad_norm": 0.41023924946784973,
|
|
"learning_rate": 8.467162503961209e-06,
|
|
"loss": 0.2346,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.5880406001765225,
|
|
"grad_norm": 0.44286391139030457,
|
|
"learning_rate": 8.460584190124405e-06,
|
|
"loss": 0.246,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 0.589143865842895,
|
|
"grad_norm": 0.45593389868736267,
|
|
"learning_rate": 8.45399435813707e-06,
|
|
"loss": 0.2371,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.5902471315092674,
|
|
"grad_norm": 0.4817209839820862,
|
|
"learning_rate": 8.447393029932692e-06,
|
|
"loss": 0.2376,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 0.5913503971756399,
|
|
"grad_norm": 0.4856320917606354,
|
|
"learning_rate": 8.440780227483016e-06,
|
|
"loss": 0.2451,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.5924536628420124,
|
|
"grad_norm": 0.42935821413993835,
|
|
"learning_rate": 8.43415597279799e-06,
|
|
"loss": 0.2422,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 0.5935569285083848,
|
|
"grad_norm": 0.45911160111427307,
|
|
"learning_rate": 8.427520287925669e-06,
|
|
"loss": 0.2397,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.5946601941747572,
|
|
"grad_norm": 0.4541209638118744,
|
|
"learning_rate": 8.420873194952153e-06,
|
|
"loss": 0.2392,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 0.5957634598411298,
|
|
"grad_norm": 0.4879801273345947,
|
|
"learning_rate": 8.414214716001519e-06,
|
|
"loss": 0.2479,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.5957634598411298,
|
|
"eval_loss": 0.24747207760810852,
|
|
"eval_runtime": 270.41,
|
|
"eval_samples_per_second": 56.444,
|
|
"eval_steps_per_second": 7.056,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.5968667255075022,
|
|
"grad_norm": 0.4193342328071594,
|
|
"learning_rate": 8.407544873235736e-06,
|
|
"loss": 0.2482,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 0.5979699911738746,
|
|
"grad_norm": 0.45270466804504395,
|
|
"learning_rate": 8.400863688854598e-06,
|
|
"loss": 0.2469,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.5990732568402471,
|
|
"grad_norm": 0.4806990623474121,
|
|
"learning_rate": 8.394171185095646e-06,
|
|
"loss": 0.2442,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 0.6001765225066196,
|
|
"grad_norm": 0.4336373209953308,
|
|
"learning_rate": 8.387467384234096e-06,
|
|
"loss": 0.2335,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.6012797881729921,
|
|
"grad_norm": 0.4600653648376465,
|
|
"learning_rate": 8.38075230858277e-06,
|
|
"loss": 0.2365,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 0.6023830538393645,
|
|
"grad_norm": 0.5015227198600769,
|
|
"learning_rate": 8.37402598049201e-06,
|
|
"loss": 0.2502,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.603486319505737,
|
|
"grad_norm": 0.552075207233429,
|
|
"learning_rate": 8.367288422349617e-06,
|
|
"loss": 0.2403,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 0.6045895851721095,
|
|
"grad_norm": 0.4628050923347473,
|
|
"learning_rate": 8.360539656580768e-06,
|
|
"loss": 0.2294,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.6056928508384819,
|
|
"grad_norm": 0.48730120062828064,
|
|
"learning_rate": 8.353779705647936e-06,
|
|
"loss": 0.2397,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 0.6067961165048543,
|
|
"grad_norm": 0.40887823700904846,
|
|
"learning_rate": 8.347008592050834e-06,
|
|
"loss": 0.2491,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.6078993821712269,
|
|
"grad_norm": 0.4346201419830322,
|
|
"learning_rate": 8.340226338326321e-06,
|
|
"loss": 0.2436,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 0.6090026478375993,
|
|
"grad_norm": 0.38802462816238403,
|
|
"learning_rate": 8.333432967048339e-06,
|
|
"loss": 0.2379,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.6101059135039718,
|
|
"grad_norm": 0.3992108404636383,
|
|
"learning_rate": 8.326628500827826e-06,
|
|
"loss": 0.2338,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 0.6112091791703442,
|
|
"grad_norm": 0.44411781430244446,
|
|
"learning_rate": 8.319812962312662e-06,
|
|
"loss": 0.2301,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.6123124448367167,
|
|
"grad_norm": 0.42220309376716614,
|
|
"learning_rate": 8.312986374187563e-06,
|
|
"loss": 0.238,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 0.6134157105030892,
|
|
"grad_norm": 0.47081899642944336,
|
|
"learning_rate": 8.306148759174036e-06,
|
|
"loss": 0.2536,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.6145189761694616,
|
|
"grad_norm": 0.4740568995475769,
|
|
"learning_rate": 8.299300140030283e-06,
|
|
"loss": 0.2494,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 0.615622241835834,
|
|
"grad_norm": 0.41311606764793396,
|
|
"learning_rate": 8.292440539551132e-06,
|
|
"loss": 0.2443,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.6167255075022066,
|
|
"grad_norm": 0.43672001361846924,
|
|
"learning_rate": 8.285569980567965e-06,
|
|
"loss": 0.2386,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 0.617828773168579,
|
|
"grad_norm": 0.45961281657218933,
|
|
"learning_rate": 8.278688485948634e-06,
|
|
"loss": 0.2471,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.617828773168579,
|
|
"eval_loss": 0.24707245826721191,
|
|
"eval_runtime": 271.8779,
|
|
"eval_samples_per_second": 56.139,
|
|
"eval_steps_per_second": 7.018,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.6189320388349514,
|
|
"grad_norm": 0.41084882616996765,
|
|
"learning_rate": 8.27179607859739e-06,
|
|
"loss": 0.2353,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 0.6200353045013239,
|
|
"grad_norm": 0.40242356061935425,
|
|
"learning_rate": 8.264892781454807e-06,
|
|
"loss": 0.2259,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.6211385701676964,
|
|
"grad_norm": 0.43410569429397583,
|
|
"learning_rate": 8.257978617497706e-06,
|
|
"loss": 0.2375,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 0.6222418358340689,
|
|
"grad_norm": 0.49452638626098633,
|
|
"learning_rate": 8.25105360973907e-06,
|
|
"loss": 0.2472,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.6233451015004413,
|
|
"grad_norm": 0.5670908689498901,
|
|
"learning_rate": 8.244117781227982e-06,
|
|
"loss": 0.2434,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 0.6244483671668137,
|
|
"grad_norm": 0.4820459485054016,
|
|
"learning_rate": 8.237171155049539e-06,
|
|
"loss": 0.2393,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.6255516328331863,
|
|
"grad_norm": 0.455879271030426,
|
|
"learning_rate": 8.230213754324773e-06,
|
|
"loss": 0.2269,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 0.6266548984995587,
|
|
"grad_norm": 0.41106078028678894,
|
|
"learning_rate": 8.22324560221058e-06,
|
|
"loss": 0.2291,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.6277581641659311,
|
|
"grad_norm": 0.39285704493522644,
|
|
"learning_rate": 8.216266721899642e-06,
|
|
"loss": 0.2357,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 0.6288614298323036,
|
|
"grad_norm": 0.3971237242221832,
|
|
"learning_rate": 8.209277136620348e-06,
|
|
"loss": 0.2444,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.6299646954986761,
|
|
"grad_norm": 0.47871458530426025,
|
|
"learning_rate": 8.202276869636713e-06,
|
|
"loss": 0.2357,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 0.6310679611650486,
|
|
"grad_norm": 0.4452441930770874,
|
|
"learning_rate": 8.195265944248315e-06,
|
|
"loss": 0.237,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.632171226831421,
|
|
"grad_norm": 0.5275673866271973,
|
|
"learning_rate": 8.188244383790196e-06,
|
|
"loss": 0.2536,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 0.6332744924977934,
|
|
"grad_norm": 0.46344441175460815,
|
|
"learning_rate": 8.1812122116328e-06,
|
|
"loss": 0.2437,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.634377758164166,
|
|
"grad_norm": 0.4381890892982483,
|
|
"learning_rate": 8.174169451181893e-06,
|
|
"loss": 0.2488,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 0.6354810238305384,
|
|
"grad_norm": 0.41612380743026733,
|
|
"learning_rate": 8.167116125878483e-06,
|
|
"loss": 0.239,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.6365842894969108,
|
|
"grad_norm": 0.4481007754802704,
|
|
"learning_rate": 8.160052259198737e-06,
|
|
"loss": 0.2395,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 0.6376875551632833,
|
|
"grad_norm": 0.42456308007240295,
|
|
"learning_rate": 8.152977874653909e-06,
|
|
"loss": 0.2303,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.6387908208296558,
|
|
"grad_norm": 0.4462544322013855,
|
|
"learning_rate": 8.145892995790269e-06,
|
|
"loss": 0.2476,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 0.6398940864960282,
|
|
"grad_norm": 0.37666749954223633,
|
|
"learning_rate": 8.138797646189e-06,
|
|
"loss": 0.2326,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.6398940864960282,
|
|
"eval_loss": 0.24663805961608887,
|
|
"eval_runtime": 271.9733,
|
|
"eval_samples_per_second": 56.119,
|
|
"eval_steps_per_second": 7.015,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.6409973521624007,
|
|
"grad_norm": 0.4413769841194153,
|
|
"learning_rate": 8.131691849466154e-06,
|
|
"loss": 0.2347,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 0.6421006178287731,
|
|
"grad_norm": 0.5462684035301208,
|
|
"learning_rate": 8.12457562927254e-06,
|
|
"loss": 0.2491,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.6432038834951457,
|
|
"grad_norm": 0.47332948446273804,
|
|
"learning_rate": 8.117449009293668e-06,
|
|
"loss": 0.2387,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 0.6443071491615181,
|
|
"grad_norm": 0.4280896484851837,
|
|
"learning_rate": 8.11031201324966e-06,
|
|
"loss": 0.2347,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.6454104148278905,
|
|
"grad_norm": 0.46633732318878174,
|
|
"learning_rate": 8.103164664895179e-06,
|
|
"loss": 0.2528,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 0.646513680494263,
|
|
"grad_norm": 0.45853593945503235,
|
|
"learning_rate": 8.096006988019331e-06,
|
|
"loss": 0.2329,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.6476169461606355,
|
|
"grad_norm": 0.4461853802204132,
|
|
"learning_rate": 8.088839006445615e-06,
|
|
"loss": 0.2436,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 0.6487202118270079,
|
|
"grad_norm": 0.4614443778991699,
|
|
"learning_rate": 8.081660744031818e-06,
|
|
"loss": 0.2442,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.6498234774933804,
|
|
"grad_norm": 0.4097602367401123,
|
|
"learning_rate": 8.074472224669952e-06,
|
|
"loss": 0.2398,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 0.6509267431597529,
|
|
"grad_norm": 0.5019506216049194,
|
|
"learning_rate": 8.067273472286158e-06,
|
|
"loss": 0.2488,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.6520300088261254,
|
|
"grad_norm": 0.4480745196342468,
|
|
"learning_rate": 8.060064510840648e-06,
|
|
"loss": 0.2268,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"epoch": 0.6531332744924978,
|
|
"grad_norm": 0.44799327850341797,
|
|
"learning_rate": 8.052845364327609e-06,
|
|
"loss": 0.2407,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.6542365401588702,
|
|
"grad_norm": 0.4316900670528412,
|
|
"learning_rate": 8.045616056775124e-06,
|
|
"loss": 0.2449,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"epoch": 0.6553398058252428,
|
|
"grad_norm": 0.42375341057777405,
|
|
"learning_rate": 8.038376612245104e-06,
|
|
"loss": 0.2363,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.6564430714916152,
|
|
"grad_norm": 0.48923903703689575,
|
|
"learning_rate": 8.031127054833192e-06,
|
|
"loss": 0.2409,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"epoch": 0.6575463371579876,
|
|
"grad_norm": 0.41415655612945557,
|
|
"learning_rate": 8.023867408668692e-06,
|
|
"loss": 0.2335,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.6586496028243601,
|
|
"grad_norm": 0.47558680176734924,
|
|
"learning_rate": 8.016597697914492e-06,
|
|
"loss": 0.2485,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"epoch": 0.6597528684907326,
|
|
"grad_norm": 0.4345654845237732,
|
|
"learning_rate": 8.009317946766975e-06,
|
|
"loss": 0.2445,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.660856134157105,
|
|
"grad_norm": 0.4340679347515106,
|
|
"learning_rate": 8.002028179455941e-06,
|
|
"loss": 0.2403,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"epoch": 0.6619593998234775,
|
|
"grad_norm": 0.45394837856292725,
|
|
"learning_rate": 7.994728420244533e-06,
|
|
"loss": 0.2516,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.6619593998234775,
|
|
"eval_loss": 0.24612218141555786,
|
|
"eval_runtime": 271.5712,
|
|
"eval_samples_per_second": 56.203,
|
|
"eval_steps_per_second": 7.026,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.6630626654898499,
|
|
"grad_norm": 0.4266812205314636,
|
|
"learning_rate": 7.987418693429145e-06,
|
|
"loss": 0.2421,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"epoch": 0.6641659311562225,
|
|
"grad_norm": 0.44489166140556335,
|
|
"learning_rate": 7.98009902333935e-06,
|
|
"loss": 0.2249,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.6652691968225949,
|
|
"grad_norm": 0.3864096701145172,
|
|
"learning_rate": 7.972769434337815e-06,
|
|
"loss": 0.238,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"epoch": 0.6663724624889673,
|
|
"grad_norm": 0.5136005878448486,
|
|
"learning_rate": 7.965429950820222e-06,
|
|
"loss": 0.233,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.6674757281553398,
|
|
"grad_norm": 0.4214404225349426,
|
|
"learning_rate": 7.958080597215187e-06,
|
|
"loss": 0.2382,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"epoch": 0.6685789938217123,
|
|
"grad_norm": 0.3963940441608429,
|
|
"learning_rate": 7.95072139798417e-06,
|
|
"loss": 0.2413,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.6696822594880847,
|
|
"grad_norm": 0.46489056944847107,
|
|
"learning_rate": 7.943352377621414e-06,
|
|
"loss": 0.2405,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"epoch": 0.6707855251544572,
|
|
"grad_norm": 0.47178593277931213,
|
|
"learning_rate": 7.935973560653838e-06,
|
|
"loss": 0.2347,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.6718887908208296,
|
|
"grad_norm": 0.43894341588020325,
|
|
"learning_rate": 7.928584971640974e-06,
|
|
"loss": 0.2443,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"epoch": 0.6729920564872022,
|
|
"grad_norm": 0.39241307973861694,
|
|
"learning_rate": 7.92118663517488e-06,
|
|
"loss": 0.249,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.6740953221535746,
|
|
"grad_norm": 0.4197879731655121,
|
|
"learning_rate": 7.913778575880054e-06,
|
|
"loss": 0.2229,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"epoch": 0.675198587819947,
|
|
"grad_norm": 0.4493717551231384,
|
|
"learning_rate": 7.906360818413354e-06,
|
|
"loss": 0.2385,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.6763018534863195,
|
|
"grad_norm": 0.4284787178039551,
|
|
"learning_rate": 7.898933387463924e-06,
|
|
"loss": 0.2398,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"epoch": 0.677405119152692,
|
|
"grad_norm": 0.4555470943450928,
|
|
"learning_rate": 7.891496307753099e-06,
|
|
"loss": 0.2395,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.6785083848190644,
|
|
"grad_norm": 0.4759092628955841,
|
|
"learning_rate": 7.884049604034331e-06,
|
|
"loss": 0.2441,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"epoch": 0.6796116504854369,
|
|
"grad_norm": 0.421332448720932,
|
|
"learning_rate": 7.876593301093104e-06,
|
|
"loss": 0.2416,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.6807149161518093,
|
|
"grad_norm": 0.5813837051391602,
|
|
"learning_rate": 7.869127423746852e-06,
|
|
"loss": 0.2387,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 0.4229418933391571,
|
|
"learning_rate": 7.861651996844877e-06,
|
|
"loss": 0.2359,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.6829214474845543,
|
|
"grad_norm": 0.4013502597808838,
|
|
"learning_rate": 7.854167045268265e-06,
|
|
"loss": 0.2408,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"epoch": 0.6840247131509267,
|
|
"grad_norm": 0.46940287947654724,
|
|
"learning_rate": 7.8466725939298e-06,
|
|
"loss": 0.2254,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.6840247131509267,
|
|
"eval_loss": 0.24580919742584229,
|
|
"eval_runtime": 268.9469,
|
|
"eval_samples_per_second": 56.751,
|
|
"eval_steps_per_second": 7.094,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.6851279788172993,
|
|
"grad_norm": 0.43689653277397156,
|
|
"learning_rate": 7.839168667773891e-06,
|
|
"loss": 0.248,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"epoch": 0.6862312444836717,
|
|
"grad_norm": 0.4291225075721741,
|
|
"learning_rate": 7.831655291776484e-06,
|
|
"loss": 0.2554,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.6873345101500441,
|
|
"grad_norm": 0.3945559859275818,
|
|
"learning_rate": 7.824132490944968e-06,
|
|
"loss": 0.229,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"epoch": 0.6884377758164166,
|
|
"grad_norm": 0.4158150553703308,
|
|
"learning_rate": 7.81660029031811e-06,
|
|
"loss": 0.2422,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.689541041482789,
|
|
"grad_norm": 0.4768913984298706,
|
|
"learning_rate": 7.809058714965962e-06,
|
|
"loss": 0.2384,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"epoch": 0.6906443071491615,
|
|
"grad_norm": 0.4020480811595917,
|
|
"learning_rate": 7.801507789989775e-06,
|
|
"loss": 0.2327,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.691747572815534,
|
|
"grad_norm": 0.4478599429130554,
|
|
"learning_rate": 7.793947540521922e-06,
|
|
"loss": 0.2507,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"epoch": 0.6928508384819064,
|
|
"grad_norm": 0.4232751727104187,
|
|
"learning_rate": 7.786377991725813e-06,
|
|
"loss": 0.2451,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.693954104148279,
|
|
"grad_norm": 0.4200434982776642,
|
|
"learning_rate": 7.778799168795804e-06,
|
|
"loss": 0.2394,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"epoch": 0.6950573698146514,
|
|
"grad_norm": 0.5268550515174866,
|
|
"learning_rate": 7.771211096957125e-06,
|
|
"loss": 0.2328,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.6961606354810238,
|
|
"grad_norm": 0.48702338337898254,
|
|
"learning_rate": 7.763613801465785e-06,
|
|
"loss": 0.2417,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"epoch": 0.6972639011473963,
|
|
"grad_norm": 0.4694213569164276,
|
|
"learning_rate": 7.756007307608498e-06,
|
|
"loss": 0.2505,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.6983671668137688,
|
|
"grad_norm": 0.40955081582069397,
|
|
"learning_rate": 7.748391640702588e-06,
|
|
"loss": 0.2401,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"epoch": 0.6994704324801412,
|
|
"grad_norm": 0.45171940326690674,
|
|
"learning_rate": 7.740766826095918e-06,
|
|
"loss": 0.23,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.7005736981465137,
|
|
"grad_norm": 0.4558034837245941,
|
|
"learning_rate": 7.733132889166788e-06,
|
|
"loss": 0.2417,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"epoch": 0.7016769638128861,
|
|
"grad_norm": 0.4197918474674225,
|
|
"learning_rate": 7.725489855323869e-06,
|
|
"loss": 0.2432,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.7027802294792586,
|
|
"grad_norm": 0.4640055000782013,
|
|
"learning_rate": 7.717837750006106e-06,
|
|
"loss": 0.2387,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"epoch": 0.7038834951456311,
|
|
"grad_norm": 0.41201338171958923,
|
|
"learning_rate": 7.710176598682639e-06,
|
|
"loss": 0.2253,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.7049867608120035,
|
|
"grad_norm": 0.4629385769367218,
|
|
"learning_rate": 7.702506426852715e-06,
|
|
"loss": 0.2473,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"epoch": 0.706090026478376,
|
|
"grad_norm": 0.41098853945732117,
|
|
"learning_rate": 7.694827260045608e-06,
|
|
"loss": 0.2454,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.706090026478376,
|
|
"eval_loss": 0.24525980651378632,
|
|
"eval_runtime": 270.0878,
|
|
"eval_samples_per_second": 56.511,
|
|
"eval_steps_per_second": 7.064,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.7071932921447485,
|
|
"grad_norm": 0.43668264150619507,
|
|
"learning_rate": 7.687139123820526e-06,
|
|
"loss": 0.2469,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"epoch": 0.7082965578111209,
|
|
"grad_norm": 0.3744730055332184,
|
|
"learning_rate": 7.679442043766534e-06,
|
|
"loss": 0.2336,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.7093998234774934,
|
|
"grad_norm": 0.41323620080947876,
|
|
"learning_rate": 7.671736045502462e-06,
|
|
"loss": 0.2459,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"epoch": 0.7105030891438658,
|
|
"grad_norm": 0.4456137716770172,
|
|
"learning_rate": 7.664021154676828e-06,
|
|
"loss": 0.2491,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.7116063548102383,
|
|
"grad_norm": 0.4377335011959076,
|
|
"learning_rate": 7.656297396967747e-06,
|
|
"loss": 0.2395,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"epoch": 0.7127096204766108,
|
|
"grad_norm": 0.4323022961616516,
|
|
"learning_rate": 7.648564798082842e-06,
|
|
"loss": 0.2403,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.7138128861429832,
|
|
"grad_norm": 0.4660840630531311,
|
|
"learning_rate": 7.640823383759169e-06,
|
|
"loss": 0.2532,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"epoch": 0.7149161518093556,
|
|
"grad_norm": 0.4233240485191345,
|
|
"learning_rate": 7.63307317976312e-06,
|
|
"loss": 0.24,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.7160194174757282,
|
|
"grad_norm": 0.48266658186912537,
|
|
"learning_rate": 7.625314211890342e-06,
|
|
"loss": 0.2426,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"epoch": 0.7171226831421006,
|
|
"grad_norm": 0.3970603346824646,
|
|
"learning_rate": 7.617546505965658e-06,
|
|
"loss": 0.2278,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.7182259488084731,
|
|
"grad_norm": 0.4572686553001404,
|
|
"learning_rate": 7.609770087842969e-06,
|
|
"loss": 0.242,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"epoch": 0.7193292144748455,
|
|
"grad_norm": 0.40155845880508423,
|
|
"learning_rate": 7.601984983405173e-06,
|
|
"loss": 0.2285,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.720432480141218,
|
|
"grad_norm": 0.4204133152961731,
|
|
"learning_rate": 7.594191218564084e-06,
|
|
"loss": 0.2353,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"epoch": 0.7215357458075905,
|
|
"grad_norm": 0.4386295676231384,
|
|
"learning_rate": 7.586388819260338e-06,
|
|
"loss": 0.2228,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.7226390114739629,
|
|
"grad_norm": 0.43028250336647034,
|
|
"learning_rate": 7.57857781146331e-06,
|
|
"loss": 0.2328,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"epoch": 0.7237422771403353,
|
|
"grad_norm": 0.4137963056564331,
|
|
"learning_rate": 7.5707582211710265e-06,
|
|
"loss": 0.2406,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.7248455428067079,
|
|
"grad_norm": 0.3798144459724426,
|
|
"learning_rate": 7.562930074410084e-06,
|
|
"loss": 0.2425,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"epoch": 0.7259488084730803,
|
|
"grad_norm": 0.3895861804485321,
|
|
"learning_rate": 7.555093397235553e-06,
|
|
"loss": 0.2312,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.7270520741394528,
|
|
"grad_norm": 0.44680020213127136,
|
|
"learning_rate": 7.5472482157308975e-06,
|
|
"loss": 0.23,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"epoch": 0.7281553398058253,
|
|
"grad_norm": 0.4767349064350128,
|
|
"learning_rate": 7.539394556007892e-06,
|
|
"loss": 0.2482,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.7281553398058253,
|
|
"eval_loss": 0.24486024677753448,
|
|
"eval_runtime": 269.7229,
|
|
"eval_samples_per_second": 56.588,
|
|
"eval_steps_per_second": 7.074,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.7292586054721977,
|
|
"grad_norm": 0.41484782099723816,
|
|
"learning_rate": 7.531532444206524e-06,
|
|
"loss": 0.2333,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"epoch": 0.7303618711385702,
|
|
"grad_norm": 0.4434278607368469,
|
|
"learning_rate": 7.523661906494913e-06,
|
|
"loss": 0.2393,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.7314651368049426,
|
|
"grad_norm": 0.44898828864097595,
|
|
"learning_rate": 7.515782969069229e-06,
|
|
"loss": 0.2342,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 0.732568402471315,
|
|
"grad_norm": 0.4551832973957062,
|
|
"learning_rate": 7.507895658153594e-06,
|
|
"loss": 0.2459,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.7336716681376876,
|
|
"grad_norm": 0.464346706867218,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.2178,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"epoch": 0.73477493380406,
|
|
"grad_norm": 0.43785154819488525,
|
|
"learning_rate": 7.492096020888227e-06,
|
|
"loss": 0.2378,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.7358781994704324,
|
|
"grad_norm": 0.4068206548690796,
|
|
"learning_rate": 7.484183747125743e-06,
|
|
"loss": 0.2302,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"epoch": 0.736981465136805,
|
|
"grad_norm": 0.40867307782173157,
|
|
"learning_rate": 7.476263205047629e-06,
|
|
"loss": 0.2403,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.7380847308031774,
|
|
"grad_norm": 0.4066210687160492,
|
|
"learning_rate": 7.468334421016486e-06,
|
|
"loss": 0.2334,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"epoch": 0.7391879964695499,
|
|
"grad_norm": 0.43838587403297424,
|
|
"learning_rate": 7.460397421422346e-06,
|
|
"loss": 0.231,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.7402912621359223,
|
|
"grad_norm": 0.4866897761821747,
|
|
"learning_rate": 7.452452232682585e-06,
|
|
"loss": 0.2513,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"epoch": 0.7413945278022948,
|
|
"grad_norm": 0.5071978569030762,
|
|
"learning_rate": 7.444498881241835e-06,
|
|
"loss": 0.2495,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.7424977934686673,
|
|
"grad_norm": 0.5105838775634766,
|
|
"learning_rate": 7.4365373935719e-06,
|
|
"loss": 0.242,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"epoch": 0.7436010591350397,
|
|
"grad_norm": 0.513029932975769,
|
|
"learning_rate": 7.428567796171662e-06,
|
|
"loss": 0.2468,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.7447043248014121,
|
|
"grad_norm": 0.396241158246994,
|
|
"learning_rate": 7.420590115566995e-06,
|
|
"loss": 0.2283,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"epoch": 0.7458075904677847,
|
|
"grad_norm": 0.43795159459114075,
|
|
"learning_rate": 7.412604378310677e-06,
|
|
"loss": 0.2304,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.7469108561341571,
|
|
"grad_norm": 0.42680585384368896,
|
|
"learning_rate": 7.4046106109823045e-06,
|
|
"loss": 0.2294,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"epoch": 0.7480141218005296,
|
|
"grad_norm": 0.5321316123008728,
|
|
"learning_rate": 7.3966088401881975e-06,
|
|
"loss": 0.2378,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.749117387466902,
|
|
"grad_norm": 0.4479999244213104,
|
|
"learning_rate": 7.388599092561315e-06,
|
|
"loss": 0.2333,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"epoch": 0.7502206531332745,
|
|
"grad_norm": 0.5410358309745789,
|
|
"learning_rate": 7.380581394761169e-06,
|
|
"loss": 0.2429,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.7502206531332745,
|
|
"eval_loss": 0.24444225430488586,
|
|
"eval_runtime": 273.7667,
|
|
"eval_samples_per_second": 55.752,
|
|
"eval_steps_per_second": 6.969,
|
|
"step": 3400
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 9064,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.662905189123752e+18,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|