Files
Qwen3-1.7B-SFT/trainer_state.json
ModelHub XC 0a5beb0149 初始化项目,由ModelHub XC社区提供模型
Model: lllyx/Qwen3-1.7B-SFT
Source: Original Platform
2026-05-10 05:35:48 +08:00

5067 lines
124 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7502206531332745,
"eval_steps": 100,
"global_step": 3400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011032656663724624,
"grad_norm": 5.878592491149902,
"learning_rate": 8.810572687224672e-08,
"loss": 0.4689,
"step": 5
},
{
"epoch": 0.002206531332744925,
"grad_norm": 5.2574687004089355,
"learning_rate": 1.982378854625551e-07,
"loss": 0.4759,
"step": 10
},
{
"epoch": 0.0033097969991173876,
"grad_norm": 5.635329723358154,
"learning_rate": 3.083700440528635e-07,
"loss": 0.466,
"step": 15
},
{
"epoch": 0.00441306266548985,
"grad_norm": 5.272955417633057,
"learning_rate": 4.1850220264317185e-07,
"loss": 0.4468,
"step": 20
},
{
"epoch": 0.005516328331862313,
"grad_norm": 5.219903945922852,
"learning_rate": 5.286343612334802e-07,
"loss": 0.4513,
"step": 25
},
{
"epoch": 0.006619593998234775,
"grad_norm": 4.160714626312256,
"learning_rate": 6.387665198237886e-07,
"loss": 0.4372,
"step": 30
},
{
"epoch": 0.007722859664607238,
"grad_norm": 3.6638221740722656,
"learning_rate": 7.48898678414097e-07,
"loss": 0.4449,
"step": 35
},
{
"epoch": 0.0088261253309797,
"grad_norm": 3.3206021785736084,
"learning_rate": 8.590308370044054e-07,
"loss": 0.4125,
"step": 40
},
{
"epoch": 0.009929390997352162,
"grad_norm": 2.711574077606201,
"learning_rate": 9.691629955947138e-07,
"loss": 0.4156,
"step": 45
},
{
"epoch": 0.011032656663724626,
"grad_norm": 1.6887717247009277,
"learning_rate": 1.0792951541850223e-06,
"loss": 0.4075,
"step": 50
},
{
"epoch": 0.012135922330097087,
"grad_norm": 1.177046775817871,
"learning_rate": 1.1894273127753305e-06,
"loss": 0.3647,
"step": 55
},
{
"epoch": 0.01323918799646955,
"grad_norm": 0.8924531936645508,
"learning_rate": 1.299559471365639e-06,
"loss": 0.3854,
"step": 60
},
{
"epoch": 0.014342453662842012,
"grad_norm": 1.1198736429214478,
"learning_rate": 1.4096916299559475e-06,
"loss": 0.3644,
"step": 65
},
{
"epoch": 0.015445719329214475,
"grad_norm": 0.6698480248451233,
"learning_rate": 1.5198237885462555e-06,
"loss": 0.344,
"step": 70
},
{
"epoch": 0.01654898499558694,
"grad_norm": 0.6314343214035034,
"learning_rate": 1.629955947136564e-06,
"loss": 0.3258,
"step": 75
},
{
"epoch": 0.0176522506619594,
"grad_norm": 0.5537658929824829,
"learning_rate": 1.7400881057268722e-06,
"loss": 0.3236,
"step": 80
},
{
"epoch": 0.01875551632833186,
"grad_norm": 0.6194472312927246,
"learning_rate": 1.8502202643171807e-06,
"loss": 0.3219,
"step": 85
},
{
"epoch": 0.019858781994704325,
"grad_norm": 0.4850139915943146,
"learning_rate": 1.960352422907489e-06,
"loss": 0.3041,
"step": 90
},
{
"epoch": 0.020962047661076788,
"grad_norm": 0.5193836092948914,
"learning_rate": 2.0704845814977977e-06,
"loss": 0.3198,
"step": 95
},
{
"epoch": 0.02206531332744925,
"grad_norm": 0.49118679761886597,
"learning_rate": 2.180616740088106e-06,
"loss": 0.3266,
"step": 100
},
{
"epoch": 0.02206531332744925,
"eval_loss": 0.3100859820842743,
"eval_runtime": 269.4841,
"eval_samples_per_second": 56.638,
"eval_steps_per_second": 7.08,
"step": 100
},
{
"epoch": 0.02316857899382171,
"grad_norm": 0.47427237033843994,
"learning_rate": 2.290748898678414e-06,
"loss": 0.2955,
"step": 105
},
{
"epoch": 0.024271844660194174,
"grad_norm": 0.49798524379730225,
"learning_rate": 2.400881057268723e-06,
"loss": 0.302,
"step": 110
},
{
"epoch": 0.025375110326566638,
"grad_norm": 0.46623724699020386,
"learning_rate": 2.511013215859031e-06,
"loss": 0.2971,
"step": 115
},
{
"epoch": 0.0264783759929391,
"grad_norm": 0.4659285247325897,
"learning_rate": 2.6211453744493394e-06,
"loss": 0.2918,
"step": 120
},
{
"epoch": 0.02758164165931156,
"grad_norm": 0.46458956599235535,
"learning_rate": 2.731277533039648e-06,
"loss": 0.2819,
"step": 125
},
{
"epoch": 0.028684907325684024,
"grad_norm": 0.5174154043197632,
"learning_rate": 2.841409691629956e-06,
"loss": 0.2953,
"step": 130
},
{
"epoch": 0.029788172992056487,
"grad_norm": 0.49397629499435425,
"learning_rate": 2.9515418502202646e-06,
"loss": 0.2932,
"step": 135
},
{
"epoch": 0.03089143865842895,
"grad_norm": 0.4400649964809418,
"learning_rate": 3.061674008810573e-06,
"loss": 0.2829,
"step": 140
},
{
"epoch": 0.031994704324801414,
"grad_norm": 0.4720049798488617,
"learning_rate": 3.1718061674008815e-06,
"loss": 0.2905,
"step": 145
},
{
"epoch": 0.03309796999117388,
"grad_norm": 0.46713733673095703,
"learning_rate": 3.2819383259911898e-06,
"loss": 0.2949,
"step": 150
},
{
"epoch": 0.03420123565754634,
"grad_norm": 0.4691792130470276,
"learning_rate": 3.3920704845814985e-06,
"loss": 0.2845,
"step": 155
},
{
"epoch": 0.0353045013239188,
"grad_norm": 0.507400393486023,
"learning_rate": 3.5022026431718063e-06,
"loss": 0.2929,
"step": 160
},
{
"epoch": 0.03640776699029126,
"grad_norm": 0.5042280554771423,
"learning_rate": 3.6123348017621146e-06,
"loss": 0.2903,
"step": 165
},
{
"epoch": 0.03751103265666372,
"grad_norm": 0.4863748252391815,
"learning_rate": 3.7224669603524232e-06,
"loss": 0.2871,
"step": 170
},
{
"epoch": 0.038614298323036186,
"grad_norm": 0.510486364364624,
"learning_rate": 3.8325991189427315e-06,
"loss": 0.2995,
"step": 175
},
{
"epoch": 0.03971756398940865,
"grad_norm": 0.5123398900032043,
"learning_rate": 3.94273127753304e-06,
"loss": 0.2794,
"step": 180
},
{
"epoch": 0.04082082965578111,
"grad_norm": 0.45075932145118713,
"learning_rate": 4.052863436123348e-06,
"loss": 0.2827,
"step": 185
},
{
"epoch": 0.041924095322153576,
"grad_norm": 0.4237598180770874,
"learning_rate": 4.162995594713657e-06,
"loss": 0.2806,
"step": 190
},
{
"epoch": 0.04302736098852604,
"grad_norm": 0.43099313974380493,
"learning_rate": 4.273127753303965e-06,
"loss": 0.2667,
"step": 195
},
{
"epoch": 0.0441306266548985,
"grad_norm": 0.5144179463386536,
"learning_rate": 4.383259911894274e-06,
"loss": 0.2748,
"step": 200
},
{
"epoch": 0.0441306266548985,
"eval_loss": 0.28673937916755676,
"eval_runtime": 271.0828,
"eval_samples_per_second": 56.304,
"eval_steps_per_second": 7.038,
"step": 200
},
{
"epoch": 0.04523389232127096,
"grad_norm": 0.4969826340675354,
"learning_rate": 4.493392070484582e-06,
"loss": 0.2763,
"step": 205
},
{
"epoch": 0.04633715798764342,
"grad_norm": 0.471080482006073,
"learning_rate": 4.60352422907489e-06,
"loss": 0.2775,
"step": 210
},
{
"epoch": 0.047440423654015886,
"grad_norm": 0.4287184476852417,
"learning_rate": 4.7136563876651984e-06,
"loss": 0.2694,
"step": 215
},
{
"epoch": 0.04854368932038835,
"grad_norm": 0.5114341974258423,
"learning_rate": 4.823788546255507e-06,
"loss": 0.2803,
"step": 220
},
{
"epoch": 0.04964695498676081,
"grad_norm": 0.5093077421188354,
"learning_rate": 4.933920704845816e-06,
"loss": 0.2802,
"step": 225
},
{
"epoch": 0.050750220653133275,
"grad_norm": 0.5257371664047241,
"learning_rate": 5.044052863436124e-06,
"loss": 0.292,
"step": 230
},
{
"epoch": 0.05185348631950574,
"grad_norm": 0.3984781503677368,
"learning_rate": 5.154185022026432e-06,
"loss": 0.2659,
"step": 235
},
{
"epoch": 0.0529567519858782,
"grad_norm": 0.4864000082015991,
"learning_rate": 5.2643171806167406e-06,
"loss": 0.2722,
"step": 240
},
{
"epoch": 0.054060017652250665,
"grad_norm": 0.592187762260437,
"learning_rate": 5.374449339207049e-06,
"loss": 0.2793,
"step": 245
},
{
"epoch": 0.05516328331862312,
"grad_norm": 0.6081680655479431,
"learning_rate": 5.484581497797358e-06,
"loss": 0.2723,
"step": 250
},
{
"epoch": 0.056266548984995585,
"grad_norm": 0.4736359715461731,
"learning_rate": 5.594713656387666e-06,
"loss": 0.2539,
"step": 255
},
{
"epoch": 0.05736981465136805,
"grad_norm": 0.46948790550231934,
"learning_rate": 5.704845814977974e-06,
"loss": 0.2748,
"step": 260
},
{
"epoch": 0.05847308031774051,
"grad_norm": 0.5093392729759216,
"learning_rate": 5.814977973568282e-06,
"loss": 0.2614,
"step": 265
},
{
"epoch": 0.059576345984112974,
"grad_norm": 0.5078116655349731,
"learning_rate": 5.925110132158591e-06,
"loss": 0.2706,
"step": 270
},
{
"epoch": 0.06067961165048544,
"grad_norm": 0.5074042677879333,
"learning_rate": 6.035242290748899e-06,
"loss": 0.2644,
"step": 275
},
{
"epoch": 0.0617828773168579,
"grad_norm": 0.45398184657096863,
"learning_rate": 6.1453744493392075e-06,
"loss": 0.2612,
"step": 280
},
{
"epoch": 0.06288614298323036,
"grad_norm": 0.47826600074768066,
"learning_rate": 6.255506607929516e-06,
"loss": 0.275,
"step": 285
},
{
"epoch": 0.06398940864960283,
"grad_norm": 0.49216607213020325,
"learning_rate": 6.365638766519824e-06,
"loss": 0.2639,
"step": 290
},
{
"epoch": 0.06509267431597529,
"grad_norm": 0.5131933093070984,
"learning_rate": 6.475770925110133e-06,
"loss": 0.286,
"step": 295
},
{
"epoch": 0.06619593998234775,
"grad_norm": 0.4883180856704712,
"learning_rate": 6.585903083700441e-06,
"loss": 0.2813,
"step": 300
},
{
"epoch": 0.06619593998234775,
"eval_loss": 0.2764524519443512,
"eval_runtime": 269.5554,
"eval_samples_per_second": 56.623,
"eval_steps_per_second": 7.078,
"step": 300
},
{
"epoch": 0.06729920564872022,
"grad_norm": 0.47087526321411133,
"learning_rate": 6.69603524229075e-06,
"loss": 0.2724,
"step": 305
},
{
"epoch": 0.06840247131509268,
"grad_norm": 0.4638593792915344,
"learning_rate": 6.806167400881057e-06,
"loss": 0.2572,
"step": 310
},
{
"epoch": 0.06950573698146513,
"grad_norm": 0.5100425481796265,
"learning_rate": 6.916299559471367e-06,
"loss": 0.2814,
"step": 315
},
{
"epoch": 0.0706090026478376,
"grad_norm": 0.505667507648468,
"learning_rate": 7.026431718061674e-06,
"loss": 0.2672,
"step": 320
},
{
"epoch": 0.07171226831421006,
"grad_norm": 0.4753463864326477,
"learning_rate": 7.136563876651983e-06,
"loss": 0.2672,
"step": 325
},
{
"epoch": 0.07281553398058252,
"grad_norm": 0.5588636994361877,
"learning_rate": 7.246696035242291e-06,
"loss": 0.2732,
"step": 330
},
{
"epoch": 0.07391879964695498,
"grad_norm": 0.5111093521118164,
"learning_rate": 7.3568281938326e-06,
"loss": 0.2643,
"step": 335
},
{
"epoch": 0.07502206531332745,
"grad_norm": 0.49439841508865356,
"learning_rate": 7.466960352422908e-06,
"loss": 0.2677,
"step": 340
},
{
"epoch": 0.07612533097969991,
"grad_norm": 0.4980764389038086,
"learning_rate": 7.5770925110132166e-06,
"loss": 0.2577,
"step": 345
},
{
"epoch": 0.07722859664607237,
"grad_norm": 0.5545366406440735,
"learning_rate": 7.687224669603525e-06,
"loss": 0.2735,
"step": 350
},
{
"epoch": 0.07833186231244484,
"grad_norm": 0.45210951566696167,
"learning_rate": 7.797356828193832e-06,
"loss": 0.2544,
"step": 355
},
{
"epoch": 0.0794351279788173,
"grad_norm": 0.4653448462486267,
"learning_rate": 7.907488986784141e-06,
"loss": 0.2815,
"step": 360
},
{
"epoch": 0.08053839364518976,
"grad_norm": 0.4663446247577667,
"learning_rate": 8.01762114537445e-06,
"loss": 0.2647,
"step": 365
},
{
"epoch": 0.08164165931156223,
"grad_norm": 0.5129761695861816,
"learning_rate": 8.127753303964758e-06,
"loss": 0.2561,
"step": 370
},
{
"epoch": 0.08274492497793469,
"grad_norm": 0.5074746012687683,
"learning_rate": 8.237885462555067e-06,
"loss": 0.2642,
"step": 375
},
{
"epoch": 0.08384819064430715,
"grad_norm": 0.48852622509002686,
"learning_rate": 8.348017621145376e-06,
"loss": 0.2484,
"step": 380
},
{
"epoch": 0.08495145631067962,
"grad_norm": 0.46195775270462036,
"learning_rate": 8.458149779735683e-06,
"loss": 0.2432,
"step": 385
},
{
"epoch": 0.08605472197705208,
"grad_norm": 0.5792168974876404,
"learning_rate": 8.568281938325993e-06,
"loss": 0.2711,
"step": 390
},
{
"epoch": 0.08715798764342454,
"grad_norm": 0.57877516746521,
"learning_rate": 8.6784140969163e-06,
"loss": 0.2758,
"step": 395
},
{
"epoch": 0.088261253309797,
"grad_norm": 0.4454537332057953,
"learning_rate": 8.788546255506607e-06,
"loss": 0.269,
"step": 400
},
{
"epoch": 0.088261253309797,
"eval_loss": 0.27229130268096924,
"eval_runtime": 269.3494,
"eval_samples_per_second": 56.666,
"eval_steps_per_second": 7.084,
"step": 400
},
{
"epoch": 0.08936451897616945,
"grad_norm": 0.5199728608131409,
"learning_rate": 8.898678414096917e-06,
"loss": 0.2594,
"step": 405
},
{
"epoch": 0.09046778464254192,
"grad_norm": 0.4717728793621063,
"learning_rate": 9.008810572687226e-06,
"loss": 0.267,
"step": 410
},
{
"epoch": 0.09157105030891438,
"grad_norm": 0.4550599157810211,
"learning_rate": 9.118942731277533e-06,
"loss": 0.254,
"step": 415
},
{
"epoch": 0.09267431597528684,
"grad_norm": 0.5019270777702332,
"learning_rate": 9.229074889867842e-06,
"loss": 0.263,
"step": 420
},
{
"epoch": 0.09377758164165931,
"grad_norm": 0.5012001991271973,
"learning_rate": 9.339207048458151e-06,
"loss": 0.2588,
"step": 425
},
{
"epoch": 0.09488084730803177,
"grad_norm": 0.5489994883537292,
"learning_rate": 9.449339207048459e-06,
"loss": 0.2684,
"step": 430
},
{
"epoch": 0.09598411297440423,
"grad_norm": 0.5029094219207764,
"learning_rate": 9.559471365638768e-06,
"loss": 0.2635,
"step": 435
},
{
"epoch": 0.0970873786407767,
"grad_norm": 0.5363909006118774,
"learning_rate": 9.669603524229075e-06,
"loss": 0.2572,
"step": 440
},
{
"epoch": 0.09819064430714916,
"grad_norm": 0.6020154356956482,
"learning_rate": 9.779735682819384e-06,
"loss": 0.2554,
"step": 445
},
{
"epoch": 0.09929390997352162,
"grad_norm": 0.5789384841918945,
"learning_rate": 9.889867841409693e-06,
"loss": 0.2814,
"step": 450
},
{
"epoch": 0.10039717563989409,
"grad_norm": 0.5141489505767822,
"learning_rate": 1e-05,
"loss": 0.2498,
"step": 455
},
{
"epoch": 0.10150044130626655,
"grad_norm": 0.5430559515953064,
"learning_rate": 9.99999167904182e-06,
"loss": 0.276,
"step": 460
},
{
"epoch": 0.10260370697263901,
"grad_norm": 0.5350551009178162,
"learning_rate": 9.999966716194973e-06,
"loss": 0.2364,
"step": 465
},
{
"epoch": 0.10370697263901148,
"grad_norm": 0.5607656836509705,
"learning_rate": 9.999925111542544e-06,
"loss": 0.2599,
"step": 470
},
{
"epoch": 0.10481023830538394,
"grad_norm": 0.4968941807746887,
"learning_rate": 9.99986686522301e-06,
"loss": 0.262,
"step": 475
},
{
"epoch": 0.1059135039717564,
"grad_norm": 0.49971750378608704,
"learning_rate": 9.999791977430238e-06,
"loss": 0.2642,
"step": 480
},
{
"epoch": 0.10701676963812887,
"grad_norm": 0.48582854866981506,
"learning_rate": 9.999700448413483e-06,
"loss": 0.252,
"step": 485
},
{
"epoch": 0.10812003530450133,
"grad_norm": 0.5446631908416748,
"learning_rate": 9.999592278477389e-06,
"loss": 0.2652,
"step": 490
},
{
"epoch": 0.10922330097087378,
"grad_norm": 0.4594772160053253,
"learning_rate": 9.999467467981984e-06,
"loss": 0.253,
"step": 495
},
{
"epoch": 0.11032656663724624,
"grad_norm": 0.5003575682640076,
"learning_rate": 9.999326017342688e-06,
"loss": 0.2629,
"step": 500
},
{
"epoch": 0.11032656663724624,
"eval_loss": 0.26893937587738037,
"eval_runtime": 268.2711,
"eval_samples_per_second": 56.894,
"eval_steps_per_second": 7.112,
"step": 500
},
{
"epoch": 0.1114298323036187,
"grad_norm": 0.4908248484134674,
"learning_rate": 9.999167927030304e-06,
"loss": 0.2577,
"step": 505
},
{
"epoch": 0.11253309796999117,
"grad_norm": 0.6282269358634949,
"learning_rate": 9.998993197571014e-06,
"loss": 0.2714,
"step": 510
},
{
"epoch": 0.11363636363636363,
"grad_norm": 0.4572107195854187,
"learning_rate": 9.998801829546387e-06,
"loss": 0.2469,
"step": 515
},
{
"epoch": 0.1147396293027361,
"grad_norm": 0.5121334195137024,
"learning_rate": 9.99859382359337e-06,
"loss": 0.2657,
"step": 520
},
{
"epoch": 0.11584289496910856,
"grad_norm": 0.4956417679786682,
"learning_rate": 9.998369180404283e-06,
"loss": 0.2435,
"step": 525
},
{
"epoch": 0.11694616063548102,
"grad_norm": 0.6356124877929688,
"learning_rate": 9.998127900726825e-06,
"loss": 0.2694,
"step": 530
},
{
"epoch": 0.11804942630185349,
"grad_norm": 0.5022862553596497,
"learning_rate": 9.997869985364073e-06,
"loss": 0.2655,
"step": 535
},
{
"epoch": 0.11915269196822595,
"grad_norm": 0.47873038053512573,
"learning_rate": 9.997595435174461e-06,
"loss": 0.2704,
"step": 540
},
{
"epoch": 0.12025595763459841,
"grad_norm": 0.46941813826560974,
"learning_rate": 9.997304251071802e-06,
"loss": 0.2594,
"step": 545
},
{
"epoch": 0.12135922330097088,
"grad_norm": 0.47806182503700256,
"learning_rate": 9.996996434025264e-06,
"loss": 0.2597,
"step": 550
},
{
"epoch": 0.12246248896734334,
"grad_norm": 0.666239857673645,
"learning_rate": 9.996671985059384e-06,
"loss": 0.2722,
"step": 555
},
{
"epoch": 0.1235657546337158,
"grad_norm": 0.40519818663597107,
"learning_rate": 9.99633090525405e-06,
"loss": 0.2483,
"step": 560
},
{
"epoch": 0.12466902030008827,
"grad_norm": 0.5072190165519714,
"learning_rate": 9.99597319574451e-06,
"loss": 0.2528,
"step": 565
},
{
"epoch": 0.12577228596646073,
"grad_norm": 0.4706517159938812,
"learning_rate": 9.995598857721354e-06,
"loss": 0.2628,
"step": 570
},
{
"epoch": 0.12687555163283318,
"grad_norm": 0.46087875962257385,
"learning_rate": 9.995207892430525e-06,
"loss": 0.2537,
"step": 575
},
{
"epoch": 0.12797881729920565,
"grad_norm": 0.5278568863868713,
"learning_rate": 9.994800301173303e-06,
"loss": 0.2687,
"step": 580
},
{
"epoch": 0.1290820829655781,
"grad_norm": 0.5285748839378357,
"learning_rate": 9.994376085306309e-06,
"loss": 0.2647,
"step": 585
},
{
"epoch": 0.13018534863195058,
"grad_norm": 0.5125618577003479,
"learning_rate": 9.9939352462415e-06,
"loss": 0.251,
"step": 590
},
{
"epoch": 0.13128861429832303,
"grad_norm": 0.64532870054245,
"learning_rate": 9.993477785446151e-06,
"loss": 0.2574,
"step": 595
},
{
"epoch": 0.1323918799646955,
"grad_norm": 0.49944519996643066,
"learning_rate": 9.99300370444287e-06,
"loss": 0.2495,
"step": 600
},
{
"epoch": 0.1323918799646955,
"eval_loss": 0.2665667235851288,
"eval_runtime": 268.5635,
"eval_samples_per_second": 56.832,
"eval_steps_per_second": 7.104,
"step": 600
},
{
"epoch": 0.13349514563106796,
"grad_norm": 0.4549316465854645,
"learning_rate": 9.99251300480958e-06,
"loss": 0.2765,
"step": 605
},
{
"epoch": 0.13459841129744043,
"grad_norm": 0.5687413215637207,
"learning_rate": 9.992005688179518e-06,
"loss": 0.2729,
"step": 610
},
{
"epoch": 0.13570167696381288,
"grad_norm": 0.5213468074798584,
"learning_rate": 9.991481756241228e-06,
"loss": 0.2536,
"step": 615
},
{
"epoch": 0.13680494263018536,
"grad_norm": 0.5374130606651306,
"learning_rate": 9.990941210738553e-06,
"loss": 0.2629,
"step": 620
},
{
"epoch": 0.1379082082965578,
"grad_norm": 0.4314231276512146,
"learning_rate": 9.99038405347064e-06,
"loss": 0.2538,
"step": 625
},
{
"epoch": 0.13901147396293026,
"grad_norm": 0.4665580093860626,
"learning_rate": 9.989810286291923e-06,
"loss": 0.2538,
"step": 630
},
{
"epoch": 0.14011473962930274,
"grad_norm": 0.5242295861244202,
"learning_rate": 9.989219911112114e-06,
"loss": 0.2633,
"step": 635
},
{
"epoch": 0.1412180052956752,
"grad_norm": 0.49017295241355896,
"learning_rate": 9.988612929896211e-06,
"loss": 0.2678,
"step": 640
},
{
"epoch": 0.14232127096204766,
"grad_norm": 0.5350978970527649,
"learning_rate": 9.987989344664479e-06,
"loss": 0.2686,
"step": 645
},
{
"epoch": 0.1434245366284201,
"grad_norm": 0.5256524085998535,
"learning_rate": 9.98734915749245e-06,
"loss": 0.2623,
"step": 650
},
{
"epoch": 0.1445278022947926,
"grad_norm": 0.5268611907958984,
"learning_rate": 9.98669237051091e-06,
"loss": 0.2545,
"step": 655
},
{
"epoch": 0.14563106796116504,
"grad_norm": 0.4854044020175934,
"learning_rate": 9.986018985905901e-06,
"loss": 0.2624,
"step": 660
},
{
"epoch": 0.14673433362753752,
"grad_norm": 0.6195608973503113,
"learning_rate": 9.985329005918702e-06,
"loss": 0.26,
"step": 665
},
{
"epoch": 0.14783759929390997,
"grad_norm": 0.5011169910430908,
"learning_rate": 9.984622432845835e-06,
"loss": 0.2468,
"step": 670
},
{
"epoch": 0.14894086496028244,
"grad_norm": 0.5306973457336426,
"learning_rate": 9.98389926903904e-06,
"loss": 0.2478,
"step": 675
},
{
"epoch": 0.1500441306266549,
"grad_norm": 0.5306282043457031,
"learning_rate": 9.983159516905287e-06,
"loss": 0.2589,
"step": 680
},
{
"epoch": 0.15114739629302737,
"grad_norm": 0.47503378987312317,
"learning_rate": 9.982403178906755e-06,
"loss": 0.2467,
"step": 685
},
{
"epoch": 0.15225066195939982,
"grad_norm": 0.548812985420227,
"learning_rate": 9.981630257560825e-06,
"loss": 0.2649,
"step": 690
},
{
"epoch": 0.1533539276257723,
"grad_norm": 0.5385993719100952,
"learning_rate": 9.980840755440075e-06,
"loss": 0.2507,
"step": 695
},
{
"epoch": 0.15445719329214475,
"grad_norm": 0.5004397034645081,
"learning_rate": 9.980034675172274e-06,
"loss": 0.2648,
"step": 700
},
{
"epoch": 0.15445719329214475,
"eval_loss": 0.2640049159526825,
"eval_runtime": 273.2064,
"eval_samples_per_second": 55.866,
"eval_steps_per_second": 6.984,
"step": 700
},
{
"epoch": 0.15556045895851722,
"grad_norm": 0.5281751155853271,
"learning_rate": 9.979212019440364e-06,
"loss": 0.2598,
"step": 705
},
{
"epoch": 0.15666372462488967,
"grad_norm": 0.41343384981155396,
"learning_rate": 9.978372790982457e-06,
"loss": 0.2502,
"step": 710
},
{
"epoch": 0.15776699029126215,
"grad_norm": 0.4603078365325928,
"learning_rate": 9.977516992591832e-06,
"loss": 0.2716,
"step": 715
},
{
"epoch": 0.1588702559576346,
"grad_norm": 0.47830113768577576,
"learning_rate": 9.976644627116906e-06,
"loss": 0.2532,
"step": 720
},
{
"epoch": 0.15997352162400705,
"grad_norm": 0.4745546281337738,
"learning_rate": 9.975755697461254e-06,
"loss": 0.2602,
"step": 725
},
{
"epoch": 0.16107678729037953,
"grad_norm": 0.47886019945144653,
"learning_rate": 9.97485020658357e-06,
"loss": 0.273,
"step": 730
},
{
"epoch": 0.16218005295675197,
"grad_norm": 0.7469276189804077,
"learning_rate": 9.973928157497675e-06,
"loss": 0.2631,
"step": 735
},
{
"epoch": 0.16328331862312445,
"grad_norm": 0.520545482635498,
"learning_rate": 9.972989553272501e-06,
"loss": 0.2506,
"step": 740
},
{
"epoch": 0.1643865842894969,
"grad_norm": 0.428124338388443,
"learning_rate": 9.972034397032086e-06,
"loss": 0.2482,
"step": 745
},
{
"epoch": 0.16548984995586938,
"grad_norm": 0.5379740595817566,
"learning_rate": 9.971062691955553e-06,
"loss": 0.2557,
"step": 750
},
{
"epoch": 0.16659311562224183,
"grad_norm": 0.5184879899024963,
"learning_rate": 9.970074441277111e-06,
"loss": 0.2587,
"step": 755
},
{
"epoch": 0.1676963812886143,
"grad_norm": 0.4643745422363281,
"learning_rate": 9.969069648286034e-06,
"loss": 0.2538,
"step": 760
},
{
"epoch": 0.16879964695498675,
"grad_norm": 0.48147132992744446,
"learning_rate": 9.968048316326661e-06,
"loss": 0.2534,
"step": 765
},
{
"epoch": 0.16990291262135923,
"grad_norm": 0.5775375366210938,
"learning_rate": 9.967010448798376e-06,
"loss": 0.2659,
"step": 770
},
{
"epoch": 0.17100617828773168,
"grad_norm": 0.5497225522994995,
"learning_rate": 9.9659560491556e-06,
"loss": 0.2464,
"step": 775
},
{
"epoch": 0.17210944395410416,
"grad_norm": 0.4842096269130707,
"learning_rate": 9.964885120907777e-06,
"loss": 0.2341,
"step": 780
},
{
"epoch": 0.1732127096204766,
"grad_norm": 0.5398717522621155,
"learning_rate": 9.963797667619368e-06,
"loss": 0.2585,
"step": 785
},
{
"epoch": 0.17431597528684908,
"grad_norm": 0.5121772289276123,
"learning_rate": 9.962693692909834e-06,
"loss": 0.2677,
"step": 790
},
{
"epoch": 0.17541924095322153,
"grad_norm": 0.5982369184494019,
"learning_rate": 9.961573200453627e-06,
"loss": 0.2572,
"step": 795
},
{
"epoch": 0.176522506619594,
"grad_norm": 0.527195394039154,
"learning_rate": 9.960436193980175e-06,
"loss": 0.2503,
"step": 800
},
{
"epoch": 0.176522506619594,
"eval_loss": 0.2625011205673218,
"eval_runtime": 269.8781,
"eval_samples_per_second": 56.555,
"eval_steps_per_second": 7.07,
"step": 800
},
{
"epoch": 0.17762577228596646,
"grad_norm": 0.5573858022689819,
"learning_rate": 9.959282677273869e-06,
"loss": 0.266,
"step": 805
},
{
"epoch": 0.1787290379523389,
"grad_norm": 0.4883844554424286,
"learning_rate": 9.958112654174058e-06,
"loss": 0.2572,
"step": 810
},
{
"epoch": 0.1798323036187114,
"grad_norm": 0.46676844358444214,
"learning_rate": 9.956926128575026e-06,
"loss": 0.2459,
"step": 815
},
{
"epoch": 0.18093556928508384,
"grad_norm": 0.4161823093891144,
"learning_rate": 9.955723104425986e-06,
"loss": 0.2411,
"step": 820
},
{
"epoch": 0.1820388349514563,
"grad_norm": 0.45280569791793823,
"learning_rate": 9.954503585731061e-06,
"loss": 0.2586,
"step": 825
},
{
"epoch": 0.18314210061782876,
"grad_norm": 0.43848279118537903,
"learning_rate": 9.953267576549279e-06,
"loss": 0.2464,
"step": 830
},
{
"epoch": 0.18424536628420124,
"grad_norm": 0.48168620467185974,
"learning_rate": 9.95201508099455e-06,
"loss": 0.2512,
"step": 835
},
{
"epoch": 0.1853486319505737,
"grad_norm": 0.5931830406188965,
"learning_rate": 9.950746103235663e-06,
"loss": 0.2526,
"step": 840
},
{
"epoch": 0.18645189761694617,
"grad_norm": 0.4570105969905853,
"learning_rate": 9.949460647496258e-06,
"loss": 0.2457,
"step": 845
},
{
"epoch": 0.18755516328331862,
"grad_norm": 0.4142732322216034,
"learning_rate": 9.948158718054828e-06,
"loss": 0.2441,
"step": 850
},
{
"epoch": 0.1886584289496911,
"grad_norm": 0.48708921670913696,
"learning_rate": 9.94684031924469e-06,
"loss": 0.2577,
"step": 855
},
{
"epoch": 0.18976169461606354,
"grad_norm": 0.5016698241233826,
"learning_rate": 9.945505455453983e-06,
"loss": 0.2562,
"step": 860
},
{
"epoch": 0.19086496028243602,
"grad_norm": 0.5526902675628662,
"learning_rate": 9.944154131125643e-06,
"loss": 0.255,
"step": 865
},
{
"epoch": 0.19196822594880847,
"grad_norm": 0.526472806930542,
"learning_rate": 9.942786350757398e-06,
"loss": 0.2659,
"step": 870
},
{
"epoch": 0.19307149161518095,
"grad_norm": 0.5003820061683655,
"learning_rate": 9.941402118901743e-06,
"loss": 0.2565,
"step": 875
},
{
"epoch": 0.1941747572815534,
"grad_norm": 0.5030418038368225,
"learning_rate": 9.940001440165934e-06,
"loss": 0.2628,
"step": 880
},
{
"epoch": 0.19527802294792587,
"grad_norm": 0.47498998045921326,
"learning_rate": 9.938584319211965e-06,
"loss": 0.2744,
"step": 885
},
{
"epoch": 0.19638128861429832,
"grad_norm": 0.5270429253578186,
"learning_rate": 9.93715076075656e-06,
"loss": 0.2561,
"step": 890
},
{
"epoch": 0.1974845542806708,
"grad_norm": 0.5205044150352478,
"learning_rate": 9.935700769571148e-06,
"loss": 0.2449,
"step": 895
},
{
"epoch": 0.19858781994704325,
"grad_norm": 0.41483354568481445,
"learning_rate": 9.934234350481856e-06,
"loss": 0.2595,
"step": 900
},
{
"epoch": 0.19858781994704325,
"eval_loss": 0.26145488023757935,
"eval_runtime": 273.4791,
"eval_samples_per_second": 55.81,
"eval_steps_per_second": 6.977,
"step": 900
},
{
"epoch": 0.1996910856134157,
"grad_norm": 0.5100669860839844,
"learning_rate": 9.932751508369492e-06,
"loss": 0.2485,
"step": 905
},
{
"epoch": 0.20079435127978817,
"grad_norm": 0.4988202154636383,
"learning_rate": 9.931252248169518e-06,
"loss": 0.2555,
"step": 910
},
{
"epoch": 0.20189761694616062,
"grad_norm": 0.49361705780029297,
"learning_rate": 9.929736574872052e-06,
"loss": 0.2579,
"step": 915
},
{
"epoch": 0.2030008826125331,
"grad_norm": 0.4196428060531616,
"learning_rate": 9.92820449352183e-06,
"loss": 0.2456,
"step": 920
},
{
"epoch": 0.20410414827890555,
"grad_norm": 0.425731897354126,
"learning_rate": 9.926656009218208e-06,
"loss": 0.2457,
"step": 925
},
{
"epoch": 0.20520741394527803,
"grad_norm": 0.4996449649333954,
"learning_rate": 9.925091127115139e-06,
"loss": 0.2689,
"step": 930
},
{
"epoch": 0.20631067961165048,
"grad_norm": 0.4646408259868622,
"learning_rate": 9.923509852421144e-06,
"loss": 0.2429,
"step": 935
},
{
"epoch": 0.20741394527802295,
"grad_norm": 0.5681747794151306,
"learning_rate": 9.921912190399317e-06,
"loss": 0.2581,
"step": 940
},
{
"epoch": 0.2085172109443954,
"grad_norm": 0.45096200704574585,
"learning_rate": 9.920298146367287e-06,
"loss": 0.2465,
"step": 945
},
{
"epoch": 0.20962047661076788,
"grad_norm": 0.4459875226020813,
"learning_rate": 9.91866772569721e-06,
"loss": 0.2593,
"step": 950
},
{
"epoch": 0.21072374227714033,
"grad_norm": 0.4605613946914673,
"learning_rate": 9.917020933815753e-06,
"loss": 0.2646,
"step": 955
},
{
"epoch": 0.2118270079435128,
"grad_norm": 0.5199949741363525,
"learning_rate": 9.91535777620407e-06,
"loss": 0.2572,
"step": 960
},
{
"epoch": 0.21293027360988526,
"grad_norm": 0.42653989791870117,
"learning_rate": 9.913678258397785e-06,
"loss": 0.2547,
"step": 965
},
{
"epoch": 0.21403353927625773,
"grad_norm": 0.5204625725746155,
"learning_rate": 9.91198238598698e-06,
"loss": 0.2407,
"step": 970
},
{
"epoch": 0.21513680494263018,
"grad_norm": 0.4516238272190094,
"learning_rate": 9.910270164616168e-06,
"loss": 0.2531,
"step": 975
},
{
"epoch": 0.21624007060900266,
"grad_norm": 0.4373854696750641,
"learning_rate": 9.908541599984276e-06,
"loss": 0.2495,
"step": 980
},
{
"epoch": 0.2173433362753751,
"grad_norm": 0.5143552422523499,
"learning_rate": 9.90679669784463e-06,
"loss": 0.2496,
"step": 985
},
{
"epoch": 0.21844660194174756,
"grad_norm": 0.41742590069770813,
"learning_rate": 9.905035464004935e-06,
"loss": 0.2481,
"step": 990
},
{
"epoch": 0.21954986760812004,
"grad_norm": 0.46620362997055054,
"learning_rate": 9.90325790432725e-06,
"loss": 0.2625,
"step": 995
},
{
"epoch": 0.22065313327449249,
"grad_norm": 0.4866413176059723,
"learning_rate": 9.901464024727976e-06,
"loss": 0.247,
"step": 1000
},
{
"epoch": 0.22065313327449249,
"eval_loss": 0.25996777415275574,
"eval_runtime": 273.2634,
"eval_samples_per_second": 55.855,
"eval_steps_per_second": 6.982,
"step": 1000
},
{
"epoch": 0.22175639894086496,
"grad_norm": 0.4647798240184784,
"learning_rate": 9.899653831177831e-06,
"loss": 0.2528,
"step": 1005
},
{
"epoch": 0.2228596646072374,
"grad_norm": 0.4932115972042084,
"learning_rate": 9.897827329701834e-06,
"loss": 0.2544,
"step": 1010
},
{
"epoch": 0.2239629302736099,
"grad_norm": 0.4925852417945862,
"learning_rate": 9.895984526379282e-06,
"loss": 0.2621,
"step": 1015
},
{
"epoch": 0.22506619593998234,
"grad_norm": 0.5298591256141663,
"learning_rate": 9.89412542734373e-06,
"loss": 0.2542,
"step": 1020
},
{
"epoch": 0.22616946160635482,
"grad_norm": 0.5149207711219788,
"learning_rate": 9.892250038782972e-06,
"loss": 0.2579,
"step": 1025
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.45972946286201477,
"learning_rate": 9.890358366939021e-06,
"loss": 0.2534,
"step": 1030
},
{
"epoch": 0.22837599293909974,
"grad_norm": 0.4157005846500397,
"learning_rate": 9.888450418108085e-06,
"loss": 0.243,
"step": 1035
},
{
"epoch": 0.2294792586054722,
"grad_norm": 0.39558079838752747,
"learning_rate": 9.88652619864055e-06,
"loss": 0.2461,
"step": 1040
},
{
"epoch": 0.23058252427184467,
"grad_norm": 0.47637176513671875,
"learning_rate": 9.884585714940953e-06,
"loss": 0.2353,
"step": 1045
},
{
"epoch": 0.23168578993821712,
"grad_norm": 0.5233368277549744,
"learning_rate": 9.882628973467972e-06,
"loss": 0.2536,
"step": 1050
},
{
"epoch": 0.2327890556045896,
"grad_norm": 0.4879682660102844,
"learning_rate": 9.880655980734391e-06,
"loss": 0.2611,
"step": 1055
},
{
"epoch": 0.23389232127096204,
"grad_norm": 0.4481244385242462,
"learning_rate": 9.878666743307083e-06,
"loss": 0.2549,
"step": 1060
},
{
"epoch": 0.23499558693733452,
"grad_norm": 0.43410834670066833,
"learning_rate": 9.876661267806995e-06,
"loss": 0.2589,
"step": 1065
},
{
"epoch": 0.23609885260370697,
"grad_norm": 0.6263585686683655,
"learning_rate": 9.874639560909118e-06,
"loss": 0.2471,
"step": 1070
},
{
"epoch": 0.23720211827007945,
"grad_norm": 0.47275134921073914,
"learning_rate": 9.872601629342468e-06,
"loss": 0.2575,
"step": 1075
},
{
"epoch": 0.2383053839364519,
"grad_norm": 0.5037277340888977,
"learning_rate": 9.870547479890062e-06,
"loss": 0.2549,
"step": 1080
},
{
"epoch": 0.23940864960282435,
"grad_norm": 0.5256139039993286,
"learning_rate": 9.868477119388897e-06,
"loss": 0.2574,
"step": 1085
},
{
"epoch": 0.24051191526919682,
"grad_norm": 0.46220555901527405,
"learning_rate": 9.866390554729923e-06,
"loss": 0.257,
"step": 1090
},
{
"epoch": 0.24161518093556927,
"grad_norm": 0.48123809695243835,
"learning_rate": 9.864287792858032e-06,
"loss": 0.2437,
"step": 1095
},
{
"epoch": 0.24271844660194175,
"grad_norm": 0.5462665557861328,
"learning_rate": 9.862168840772018e-06,
"loss": 0.2454,
"step": 1100
},
{
"epoch": 0.24271844660194175,
"eval_loss": 0.25873637199401855,
"eval_runtime": 271.9106,
"eval_samples_per_second": 56.132,
"eval_steps_per_second": 7.017,
"step": 1100
},
{
"epoch": 0.2438217122683142,
"grad_norm": 0.5108821392059326,
"learning_rate": 9.860033705524566e-06,
"loss": 0.247,
"step": 1105
},
{
"epoch": 0.24492497793468668,
"grad_norm": 0.47107893228530884,
"learning_rate": 9.857882394222225e-06,
"loss": 0.2546,
"step": 1110
},
{
"epoch": 0.24602824360105913,
"grad_norm": 0.4935952425003052,
"learning_rate": 9.855714914025386e-06,
"loss": 0.247,
"step": 1115
},
{
"epoch": 0.2471315092674316,
"grad_norm": 0.5136795043945312,
"learning_rate": 9.853531272148248e-06,
"loss": 0.2615,
"step": 1120
},
{
"epoch": 0.24823477493380405,
"grad_norm": 0.5249958634376526,
"learning_rate": 9.851331475858813e-06,
"loss": 0.2619,
"step": 1125
},
{
"epoch": 0.24933804060017653,
"grad_norm": 0.4954059422016144,
"learning_rate": 9.849115532478848e-06,
"loss": 0.2473,
"step": 1130
},
{
"epoch": 0.250441306266549,
"grad_norm": 0.47944945096969604,
"learning_rate": 9.846883449383854e-06,
"loss": 0.2566,
"step": 1135
},
{
"epoch": 0.25154457193292146,
"grad_norm": 0.5183018445968628,
"learning_rate": 9.844635234003067e-06,
"loss": 0.2629,
"step": 1140
},
{
"epoch": 0.25264783759929393,
"grad_norm": 0.4572855830192566,
"learning_rate": 9.842370893819404e-06,
"loss": 0.2593,
"step": 1145
},
{
"epoch": 0.25375110326566636,
"grad_norm": 0.4775985777378082,
"learning_rate": 9.840090436369458e-06,
"loss": 0.2354,
"step": 1150
},
{
"epoch": 0.25485436893203883,
"grad_norm": 0.48503291606903076,
"learning_rate": 9.837793869243468e-06,
"loss": 0.2483,
"step": 1155
},
{
"epoch": 0.2559576345984113,
"grad_norm": 0.46030426025390625,
"learning_rate": 9.83548120008529e-06,
"loss": 0.2616,
"step": 1160
},
{
"epoch": 0.2570609002647838,
"grad_norm": 0.5037588477134705,
"learning_rate": 9.83315243659237e-06,
"loss": 0.2488,
"step": 1165
},
{
"epoch": 0.2581641659311562,
"grad_norm": 0.508270263671875,
"learning_rate": 9.830807586515726e-06,
"loss": 0.2579,
"step": 1170
},
{
"epoch": 0.2592674315975287,
"grad_norm": 0.4799206554889679,
"learning_rate": 9.828446657659919e-06,
"loss": 0.25,
"step": 1175
},
{
"epoch": 0.26037069726390116,
"grad_norm": 0.531873881816864,
"learning_rate": 9.826069657883027e-06,
"loss": 0.2467,
"step": 1180
},
{
"epoch": 0.2614739629302736,
"grad_norm": 0.5633664727210999,
"learning_rate": 9.823676595096612e-06,
"loss": 0.2595,
"step": 1185
},
{
"epoch": 0.26257722859664606,
"grad_norm": 0.5257665514945984,
"learning_rate": 9.821267477265705e-06,
"loss": 0.2662,
"step": 1190
},
{
"epoch": 0.26368049426301854,
"grad_norm": 0.5463647246360779,
"learning_rate": 9.818842312408776e-06,
"loss": 0.2478,
"step": 1195
},
{
"epoch": 0.264783759929391,
"grad_norm": 0.4790140986442566,
"learning_rate": 9.816401108597704e-06,
"loss": 0.2516,
"step": 1200
},
{
"epoch": 0.264783759929391,
"eval_loss": 0.25740158557891846,
"eval_runtime": 274.3768,
"eval_samples_per_second": 55.628,
"eval_steps_per_second": 6.954,
"step": 1200
},
{
"epoch": 0.26588702559576344,
"grad_norm": 0.44939637184143066,
"learning_rate": 9.813943873957748e-06,
"loss": 0.2568,
"step": 1205
},
{
"epoch": 0.2669902912621359,
"grad_norm": 0.44032007455825806,
"learning_rate": 9.811470616667525e-06,
"loss": 0.2598,
"step": 1210
},
{
"epoch": 0.2680935569285084,
"grad_norm": 0.4683074951171875,
"learning_rate": 9.808981344958988e-06,
"loss": 0.2468,
"step": 1215
},
{
"epoch": 0.26919682259488087,
"grad_norm": 0.46099165081977844,
"learning_rate": 9.806476067117384e-06,
"loss": 0.2597,
"step": 1220
},
{
"epoch": 0.2703000882612533,
"grad_norm": 0.47137463092803955,
"learning_rate": 9.803954791481239e-06,
"loss": 0.2564,
"step": 1225
},
{
"epoch": 0.27140335392762577,
"grad_norm": 0.41110455989837646,
"learning_rate": 9.801417526442326e-06,
"loss": 0.256,
"step": 1230
},
{
"epoch": 0.27250661959399824,
"grad_norm": 0.4750699996948242,
"learning_rate": 9.798864280445633e-06,
"loss": 0.2461,
"step": 1235
},
{
"epoch": 0.2736098852603707,
"grad_norm": 0.4262714684009552,
"learning_rate": 9.79629506198934e-06,
"loss": 0.2611,
"step": 1240
},
{
"epoch": 0.27471315092674314,
"grad_norm": 0.4699675738811493,
"learning_rate": 9.793709879624797e-06,
"loss": 0.2454,
"step": 1245
},
{
"epoch": 0.2758164165931156,
"grad_norm": 0.4742600619792938,
"learning_rate": 9.791108741956476e-06,
"loss": 0.2583,
"step": 1250
},
{
"epoch": 0.2769196822594881,
"grad_norm": 0.4389561414718628,
"learning_rate": 9.78849165764196e-06,
"loss": 0.24,
"step": 1255
},
{
"epoch": 0.2780229479258605,
"grad_norm": 0.4927821457386017,
"learning_rate": 9.785858635391913e-06,
"loss": 0.2527,
"step": 1260
},
{
"epoch": 0.279126213592233,
"grad_norm": 0.38495415449142456,
"learning_rate": 9.78320968397004e-06,
"loss": 0.2411,
"step": 1265
},
{
"epoch": 0.2802294792586055,
"grad_norm": 0.4532706141471863,
"learning_rate": 9.780544812193065e-06,
"loss": 0.234,
"step": 1270
},
{
"epoch": 0.28133274492497795,
"grad_norm": 0.48407411575317383,
"learning_rate": 9.777864028930705e-06,
"loss": 0.2599,
"step": 1275
},
{
"epoch": 0.2824360105913504,
"grad_norm": 0.47091105580329895,
"learning_rate": 9.77516734310563e-06,
"loss": 0.2445,
"step": 1280
},
{
"epoch": 0.28353927625772285,
"grad_norm": 0.5425460934638977,
"learning_rate": 9.772454763693453e-06,
"loss": 0.2499,
"step": 1285
},
{
"epoch": 0.2846425419240953,
"grad_norm": 0.43206480145454407,
"learning_rate": 9.769726299722668e-06,
"loss": 0.2539,
"step": 1290
},
{
"epoch": 0.2857458075904678,
"grad_norm": 0.49715983867645264,
"learning_rate": 9.766981960274653e-06,
"loss": 0.2526,
"step": 1295
},
{
"epoch": 0.2868490732568402,
"grad_norm": 0.4886232018470764,
"learning_rate": 9.764221754483623e-06,
"loss": 0.2496,
"step": 1300
},
{
"epoch": 0.2868490732568402,
"eval_loss": 0.2564772367477417,
"eval_runtime": 269.851,
"eval_samples_per_second": 56.561,
"eval_steps_per_second": 7.071,
"step": 1300
},
{
"epoch": 0.2879523389232127,
"grad_norm": 0.4968324601650238,
"learning_rate": 9.761445691536598e-06,
"loss": 0.2526,
"step": 1305
},
{
"epoch": 0.2890556045895852,
"grad_norm": 0.48738226294517517,
"learning_rate": 9.758653780673381e-06,
"loss": 0.243,
"step": 1310
},
{
"epoch": 0.29015887025595766,
"grad_norm": 0.45023027062416077,
"learning_rate": 9.755846031186521e-06,
"loss": 0.2463,
"step": 1315
},
{
"epoch": 0.2912621359223301,
"grad_norm": 0.5096351504325867,
"learning_rate": 9.753022452421286e-06,
"loss": 0.2522,
"step": 1320
},
{
"epoch": 0.29236540158870256,
"grad_norm": 0.4321053922176361,
"learning_rate": 9.750183053775625e-06,
"loss": 0.2482,
"step": 1325
},
{
"epoch": 0.29346866725507503,
"grad_norm": 0.48243576288223267,
"learning_rate": 9.747327844700147e-06,
"loss": 0.2583,
"step": 1330
},
{
"epoch": 0.2945719329214475,
"grad_norm": 0.5312182903289795,
"learning_rate": 9.744456834698083e-06,
"loss": 0.2437,
"step": 1335
},
{
"epoch": 0.29567519858781993,
"grad_norm": 0.46811169385910034,
"learning_rate": 9.741570033325254e-06,
"loss": 0.2387,
"step": 1340
},
{
"epoch": 0.2967784642541924,
"grad_norm": 0.4737708568572998,
"learning_rate": 9.738667450190041e-06,
"loss": 0.2715,
"step": 1345
},
{
"epoch": 0.2978817299205649,
"grad_norm": 0.4285770058631897,
"learning_rate": 9.73574909495335e-06,
"loss": 0.2318,
"step": 1350
},
{
"epoch": 0.2989849955869373,
"grad_norm": 0.42456915974617004,
"learning_rate": 9.732814977328593e-06,
"loss": 0.2534,
"step": 1355
},
{
"epoch": 0.3000882612533098,
"grad_norm": 0.4388004243373871,
"learning_rate": 9.729865107081631e-06,
"loss": 0.2494,
"step": 1360
},
{
"epoch": 0.30119152691968226,
"grad_norm": 0.48463258147239685,
"learning_rate": 9.726899494030768e-06,
"loss": 0.2542,
"step": 1365
},
{
"epoch": 0.30229479258605474,
"grad_norm": 0.4798240661621094,
"learning_rate": 9.723918148046696e-06,
"loss": 0.2485,
"step": 1370
},
{
"epoch": 0.30339805825242716,
"grad_norm": 0.5145127177238464,
"learning_rate": 9.720921079052483e-06,
"loss": 0.2463,
"step": 1375
},
{
"epoch": 0.30450132391879964,
"grad_norm": 0.4174281358718872,
"learning_rate": 9.717908297023517e-06,
"loss": 0.2394,
"step": 1380
},
{
"epoch": 0.3056045895851721,
"grad_norm": 0.4736640155315399,
"learning_rate": 9.714879811987496e-06,
"loss": 0.2474,
"step": 1385
},
{
"epoch": 0.3067078552515446,
"grad_norm": 0.46315228939056396,
"learning_rate": 9.711835634024378e-06,
"loss": 0.2482,
"step": 1390
},
{
"epoch": 0.307811120917917,
"grad_norm": 0.541100800037384,
"learning_rate": 9.708775773266353e-06,
"loss": 0.25,
"step": 1395
},
{
"epoch": 0.3089143865842895,
"grad_norm": 0.4666937589645386,
"learning_rate": 9.705700239897809e-06,
"loss": 0.239,
"step": 1400
},
{
"epoch": 0.3089143865842895,
"eval_loss": 0.25553634762763977,
"eval_runtime": 271.0024,
"eval_samples_per_second": 56.321,
"eval_steps_per_second": 7.041,
"step": 1400
},
{
"epoch": 0.31001765225066197,
"grad_norm": 0.49646076560020447,
"learning_rate": 9.702609044155303e-06,
"loss": 0.2436,
"step": 1405
},
{
"epoch": 0.31112091791703445,
"grad_norm": 0.48308032751083374,
"learning_rate": 9.699502196327515e-06,
"loss": 0.2517,
"step": 1410
},
{
"epoch": 0.31222418358340687,
"grad_norm": 0.6409610509872437,
"learning_rate": 9.69637970675523e-06,
"loss": 0.2509,
"step": 1415
},
{
"epoch": 0.31332744924977934,
"grad_norm": 0.5959620475769043,
"learning_rate": 9.69324158583129e-06,
"loss": 0.256,
"step": 1420
},
{
"epoch": 0.3144307149161518,
"grad_norm": 0.5620144009590149,
"learning_rate": 9.69008784400056e-06,
"loss": 0.2569,
"step": 1425
},
{
"epoch": 0.3155339805825243,
"grad_norm": 0.5051830410957336,
"learning_rate": 9.686918491759904e-06,
"loss": 0.2471,
"step": 1430
},
{
"epoch": 0.3166372462488967,
"grad_norm": 0.49281784892082214,
"learning_rate": 9.68373353965814e-06,
"loss": 0.2301,
"step": 1435
},
{
"epoch": 0.3177405119152692,
"grad_norm": 0.4283227324485779,
"learning_rate": 9.68053299829601e-06,
"loss": 0.2344,
"step": 1440
},
{
"epoch": 0.3188437775816417,
"grad_norm": 0.4529547095298767,
"learning_rate": 9.677316878326144e-06,
"loss": 0.2557,
"step": 1445
},
{
"epoch": 0.3199470432480141,
"grad_norm": 0.40247344970703125,
"learning_rate": 9.67408519045302e-06,
"loss": 0.2486,
"step": 1450
},
{
"epoch": 0.3210503089143866,
"grad_norm": 0.43372419476509094,
"learning_rate": 9.670837945432934e-06,
"loss": 0.2453,
"step": 1455
},
{
"epoch": 0.32215357458075905,
"grad_norm": 0.4570685625076294,
"learning_rate": 9.667575154073962e-06,
"loss": 0.2617,
"step": 1460
},
{
"epoch": 0.3232568402471315,
"grad_norm": 0.5153756141662598,
"learning_rate": 9.664296827235924e-06,
"loss": 0.2564,
"step": 1465
},
{
"epoch": 0.32436010591350395,
"grad_norm": 0.47910332679748535,
"learning_rate": 9.66100297583035e-06,
"loss": 0.2503,
"step": 1470
},
{
"epoch": 0.3254633715798764,
"grad_norm": 0.4647476077079773,
"learning_rate": 9.657693610820437e-06,
"loss": 0.2367,
"step": 1475
},
{
"epoch": 0.3265666372462489,
"grad_norm": 0.5447574257850647,
"learning_rate": 9.654368743221022e-06,
"loss": 0.2547,
"step": 1480
},
{
"epoch": 0.3276699029126214,
"grad_norm": 0.493915319442749,
"learning_rate": 9.651028384098538e-06,
"loss": 0.2386,
"step": 1485
},
{
"epoch": 0.3287731685789938,
"grad_norm": 0.4700816869735718,
"learning_rate": 9.647672544570981e-06,
"loss": 0.2537,
"step": 1490
},
{
"epoch": 0.3298764342453663,
"grad_norm": 0.38883256912231445,
"learning_rate": 9.644301235807872e-06,
"loss": 0.233,
"step": 1495
},
{
"epoch": 0.33097969991173876,
"grad_norm": 0.4903203547000885,
"learning_rate": 9.640914469030216e-06,
"loss": 0.2415,
"step": 1500
},
{
"epoch": 0.33097969991173876,
"eval_loss": 0.2547125220298767,
"eval_runtime": 274.3354,
"eval_samples_per_second": 55.636,
"eval_steps_per_second": 6.955,
"step": 1500
},
{
"epoch": 0.33208296557811123,
"grad_norm": 0.4478644132614136,
"learning_rate": 9.637512255510475e-06,
"loss": 0.236,
"step": 1505
},
{
"epoch": 0.33318623124448365,
"grad_norm": 0.44624054431915283,
"learning_rate": 9.634094606572515e-06,
"loss": 0.2526,
"step": 1510
},
{
"epoch": 0.33428949691085613,
"grad_norm": 0.4568576514720917,
"learning_rate": 9.630661533591584e-06,
"loss": 0.2353,
"step": 1515
},
{
"epoch": 0.3353927625772286,
"grad_norm": 0.427226722240448,
"learning_rate": 9.627213047994265e-06,
"loss": 0.2532,
"step": 1520
},
{
"epoch": 0.3364960282436011,
"grad_norm": 0.4701986610889435,
"learning_rate": 9.623749161258437e-06,
"loss": 0.2349,
"step": 1525
},
{
"epoch": 0.3375992939099735,
"grad_norm": 0.5643903017044067,
"learning_rate": 9.620269884913247e-06,
"loss": 0.259,
"step": 1530
},
{
"epoch": 0.338702559576346,
"grad_norm": 0.49091801047325134,
"learning_rate": 9.616775230539057e-06,
"loss": 0.2512,
"step": 1535
},
{
"epoch": 0.33980582524271846,
"grad_norm": 0.5190874338150024,
"learning_rate": 9.613265209767417e-06,
"loss": 0.245,
"step": 1540
},
{
"epoch": 0.3409090909090909,
"grad_norm": 0.6141373515129089,
"learning_rate": 9.609739834281023e-06,
"loss": 0.2742,
"step": 1545
},
{
"epoch": 0.34201235657546336,
"grad_norm": 0.5128368139266968,
"learning_rate": 9.606199115813672e-06,
"loss": 0.2559,
"step": 1550
},
{
"epoch": 0.34311562224183584,
"grad_norm": 0.441245436668396,
"learning_rate": 9.602643066150235e-06,
"loss": 0.2515,
"step": 1555
},
{
"epoch": 0.3442188879082083,
"grad_norm": 0.4743674397468567,
"learning_rate": 9.599071697126608e-06,
"loss": 0.2541,
"step": 1560
},
{
"epoch": 0.34532215357458074,
"grad_norm": 0.5153236389160156,
"learning_rate": 9.595485020629676e-06,
"loss": 0.2578,
"step": 1565
},
{
"epoch": 0.3464254192409532,
"grad_norm": 0.4311087131500244,
"learning_rate": 9.591883048597273e-06,
"loss": 0.2548,
"step": 1570
},
{
"epoch": 0.3475286849073257,
"grad_norm": 0.494365930557251,
"learning_rate": 9.588265793018141e-06,
"loss": 0.256,
"step": 1575
},
{
"epoch": 0.34863195057369817,
"grad_norm": 0.426740825176239,
"learning_rate": 9.584633265931894e-06,
"loss": 0.2547,
"step": 1580
},
{
"epoch": 0.3497352162400706,
"grad_norm": 0.4335707426071167,
"learning_rate": 9.580985479428975e-06,
"loss": 0.241,
"step": 1585
},
{
"epoch": 0.35083848190644307,
"grad_norm": 0.47340667247772217,
"learning_rate": 9.577322445650616e-06,
"loss": 0.2437,
"step": 1590
},
{
"epoch": 0.35194174757281554,
"grad_norm": 0.48211535811424255,
"learning_rate": 9.573644176788795e-06,
"loss": 0.238,
"step": 1595
},
{
"epoch": 0.353045013239188,
"grad_norm": 0.515032172203064,
"learning_rate": 9.569950685086202e-06,
"loss": 0.2646,
"step": 1600
},
{
"epoch": 0.353045013239188,
"eval_loss": 0.2541050612926483,
"eval_runtime": 272.7736,
"eval_samples_per_second": 55.955,
"eval_steps_per_second": 6.995,
"step": 1600
},
{
"epoch": 0.35414827890556044,
"grad_norm": 0.4244433641433716,
"learning_rate": 9.566241982836193e-06,
"loss": 0.2487,
"step": 1605
},
{
"epoch": 0.3552515445719329,
"grad_norm": 0.4410102367401123,
"learning_rate": 9.562518082382751e-06,
"loss": 0.2385,
"step": 1610
},
{
"epoch": 0.3563548102383054,
"grad_norm": 0.5115966200828552,
"learning_rate": 9.558778996120443e-06,
"loss": 0.2484,
"step": 1615
},
{
"epoch": 0.3574580759046778,
"grad_norm": 0.4943847954273224,
"learning_rate": 9.555024736494382e-06,
"loss": 0.2575,
"step": 1620
},
{
"epoch": 0.3585613415710503,
"grad_norm": 0.4769156277179718,
"learning_rate": 9.551255316000183e-06,
"loss": 0.2432,
"step": 1625
},
{
"epoch": 0.3596646072374228,
"grad_norm": 0.43486344814300537,
"learning_rate": 9.54747074718392e-06,
"loss": 0.2594,
"step": 1630
},
{
"epoch": 0.36076787290379525,
"grad_norm": 0.45673149824142456,
"learning_rate": 9.54367104264209e-06,
"loss": 0.2513,
"step": 1635
},
{
"epoch": 0.36187113857016767,
"grad_norm": 0.48159259557724,
"learning_rate": 9.539856215021568e-06,
"loss": 0.2467,
"step": 1640
},
{
"epoch": 0.36297440423654015,
"grad_norm": 0.4502279460430145,
"learning_rate": 9.536026277019562e-06,
"loss": 0.2485,
"step": 1645
},
{
"epoch": 0.3640776699029126,
"grad_norm": 0.5324723124504089,
"learning_rate": 9.53218124138357e-06,
"loss": 0.2417,
"step": 1650
},
{
"epoch": 0.3651809355692851,
"grad_norm": 0.48323342204093933,
"learning_rate": 9.528321120911345e-06,
"loss": 0.253,
"step": 1655
},
{
"epoch": 0.3662842012356575,
"grad_norm": 0.5192784667015076,
"learning_rate": 9.524445928450851e-06,
"loss": 0.2301,
"step": 1660
},
{
"epoch": 0.36738746690203,
"grad_norm": 0.5197545886039734,
"learning_rate": 9.520555676900214e-06,
"loss": 0.2443,
"step": 1665
},
{
"epoch": 0.3684907325684025,
"grad_norm": 0.45566871762275696,
"learning_rate": 9.516650379207677e-06,
"loss": 0.2447,
"step": 1670
},
{
"epoch": 0.36959399823477496,
"grad_norm": 0.5340574383735657,
"learning_rate": 9.51273004837157e-06,
"loss": 0.2477,
"step": 1675
},
{
"epoch": 0.3706972639011474,
"grad_norm": 0.4383482336997986,
"learning_rate": 9.508794697440257e-06,
"loss": 0.2335,
"step": 1680
},
{
"epoch": 0.37180052956751986,
"grad_norm": 0.5311030745506287,
"learning_rate": 9.504844339512096e-06,
"loss": 0.2474,
"step": 1685
},
{
"epoch": 0.37290379523389233,
"grad_norm": 0.5349487662315369,
"learning_rate": 9.50087898773539e-06,
"loss": 0.2625,
"step": 1690
},
{
"epoch": 0.3740070609002648,
"grad_norm": 0.42293423414230347,
"learning_rate": 9.49689865530835e-06,
"loss": 0.2428,
"step": 1695
},
{
"epoch": 0.37511032656663723,
"grad_norm": 0.4599260985851288,
"learning_rate": 9.492903355479047e-06,
"loss": 0.2497,
"step": 1700
},
{
"epoch": 0.37511032656663723,
"eval_loss": 0.25350308418273926,
"eval_runtime": 270.259,
"eval_samples_per_second": 56.475,
"eval_steps_per_second": 7.06,
"step": 1700
},
{
"epoch": 0.3762135922330097,
"grad_norm": 0.46413329243659973,
"learning_rate": 9.488893101545372e-06,
"loss": 0.2409,
"step": 1705
},
{
"epoch": 0.3773168578993822,
"grad_norm": 0.45214733481407166,
"learning_rate": 9.484867906854986e-06,
"loss": 0.2427,
"step": 1710
},
{
"epoch": 0.3784201235657546,
"grad_norm": 0.49880728125572205,
"learning_rate": 9.480827784805278e-06,
"loss": 0.2404,
"step": 1715
},
{
"epoch": 0.3795233892321271,
"grad_norm": 0.516257107257843,
"learning_rate": 9.476772748843327e-06,
"loss": 0.2531,
"step": 1720
},
{
"epoch": 0.38062665489849956,
"grad_norm": 0.4441586434841156,
"learning_rate": 9.472702812465843e-06,
"loss": 0.2339,
"step": 1725
},
{
"epoch": 0.38172992056487204,
"grad_norm": 0.4590930938720703,
"learning_rate": 9.468617989219136e-06,
"loss": 0.2465,
"step": 1730
},
{
"epoch": 0.38283318623124446,
"grad_norm": 0.43926405906677246,
"learning_rate": 9.46451829269906e-06,
"loss": 0.2475,
"step": 1735
},
{
"epoch": 0.38393645189761694,
"grad_norm": 0.4270091950893402,
"learning_rate": 9.460403736550982e-06,
"loss": 0.2404,
"step": 1740
},
{
"epoch": 0.3850397175639894,
"grad_norm": 0.4161515235900879,
"learning_rate": 9.45627433446972e-06,
"loss": 0.2428,
"step": 1745
},
{
"epoch": 0.3861429832303619,
"grad_norm": 0.4878949820995331,
"learning_rate": 9.452130100199504e-06,
"loss": 0.2636,
"step": 1750
},
{
"epoch": 0.3872462488967343,
"grad_norm": 0.4900050759315491,
"learning_rate": 9.447971047533936e-06,
"loss": 0.2415,
"step": 1755
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.43371209502220154,
"learning_rate": 9.443797190315938e-06,
"loss": 0.2469,
"step": 1760
},
{
"epoch": 0.38945278022947927,
"grad_norm": 0.43596795201301575,
"learning_rate": 9.439608542437704e-06,
"loss": 0.2394,
"step": 1765
},
{
"epoch": 0.39055604589585174,
"grad_norm": 0.4555245637893677,
"learning_rate": 9.435405117840662e-06,
"loss": 0.2435,
"step": 1770
},
{
"epoch": 0.39165931156222417,
"grad_norm": 0.46150678396224976,
"learning_rate": 9.431186930515419e-06,
"loss": 0.2585,
"step": 1775
},
{
"epoch": 0.39276257722859664,
"grad_norm": 0.42505866289138794,
"learning_rate": 9.42695399450172e-06,
"loss": 0.2386,
"step": 1780
},
{
"epoch": 0.3938658428949691,
"grad_norm": 0.49516651034355164,
"learning_rate": 9.422706323888398e-06,
"loss": 0.235,
"step": 1785
},
{
"epoch": 0.3949691085613416,
"grad_norm": 0.48143908381462097,
"learning_rate": 9.418443932813328e-06,
"loss": 0.2495,
"step": 1790
},
{
"epoch": 0.396072374227714,
"grad_norm": 0.5001795887947083,
"learning_rate": 9.414166835463383e-06,
"loss": 0.247,
"step": 1795
},
{
"epoch": 0.3971756398940865,
"grad_norm": 0.47970953583717346,
"learning_rate": 9.409875046074379e-06,
"loss": 0.2486,
"step": 1800
},
{
"epoch": 0.3971756398940865,
"eval_loss": 0.2526043653488159,
"eval_runtime": 269.6648,
"eval_samples_per_second": 56.6,
"eval_steps_per_second": 7.075,
"step": 1800
},
{
"epoch": 0.398278905560459,
"grad_norm": 0.5398975610733032,
"learning_rate": 9.405568578931042e-06,
"loss": 0.257,
"step": 1805
},
{
"epoch": 0.3993821712268314,
"grad_norm": 0.4145001769065857,
"learning_rate": 9.401247448366937e-06,
"loss": 0.2305,
"step": 1810
},
{
"epoch": 0.40048543689320387,
"grad_norm": 0.49223729968070984,
"learning_rate": 9.39691166876445e-06,
"loss": 0.2385,
"step": 1815
},
{
"epoch": 0.40158870255957635,
"grad_norm": 0.5020371675491333,
"learning_rate": 9.392561254554712e-06,
"loss": 0.2507,
"step": 1820
},
{
"epoch": 0.4026919682259488,
"grad_norm": 0.4438912868499756,
"learning_rate": 9.388196220217574e-06,
"loss": 0.2442,
"step": 1825
},
{
"epoch": 0.40379523389232125,
"grad_norm": 0.5784342288970947,
"learning_rate": 9.383816580281539e-06,
"loss": 0.2434,
"step": 1830
},
{
"epoch": 0.4048984995586937,
"grad_norm": 0.4573621451854706,
"learning_rate": 9.379422349323728e-06,
"loss": 0.2348,
"step": 1835
},
{
"epoch": 0.4060017652250662,
"grad_norm": 0.5133495926856995,
"learning_rate": 9.375013541969828e-06,
"loss": 0.2474,
"step": 1840
},
{
"epoch": 0.4071050308914387,
"grad_norm": 0.5082767605781555,
"learning_rate": 9.370590172894037e-06,
"loss": 0.2424,
"step": 1845
},
{
"epoch": 0.4082082965578111,
"grad_norm": 0.41318631172180176,
"learning_rate": 9.366152256819025e-06,
"loss": 0.2459,
"step": 1850
},
{
"epoch": 0.4093115622241836,
"grad_norm": 0.48783794045448303,
"learning_rate": 9.361699808515877e-06,
"loss": 0.2332,
"step": 1855
},
{
"epoch": 0.41041482789055606,
"grad_norm": 0.46912387013435364,
"learning_rate": 9.357232842804045e-06,
"loss": 0.2362,
"step": 1860
},
{
"epoch": 0.41151809355692853,
"grad_norm": 0.5062457323074341,
"learning_rate": 9.352751374551305e-06,
"loss": 0.2479,
"step": 1865
},
{
"epoch": 0.41262135922330095,
"grad_norm": 0.45189908146858215,
"learning_rate": 9.348255418673702e-06,
"loss": 0.2597,
"step": 1870
},
{
"epoch": 0.41372462488967343,
"grad_norm": 0.43714070320129395,
"learning_rate": 9.3437449901355e-06,
"loss": 0.2447,
"step": 1875
},
{
"epoch": 0.4148278905560459,
"grad_norm": 0.44575101137161255,
"learning_rate": 9.339220103949132e-06,
"loss": 0.2572,
"step": 1880
},
{
"epoch": 0.4159311562224184,
"grad_norm": 0.4869813024997711,
"learning_rate": 9.334680775175154e-06,
"loss": 0.2469,
"step": 1885
},
{
"epoch": 0.4170344218887908,
"grad_norm": 0.4805983901023865,
"learning_rate": 9.330127018922195e-06,
"loss": 0.2427,
"step": 1890
},
{
"epoch": 0.4181376875551633,
"grad_norm": 0.47126686573028564,
"learning_rate": 9.325558850346897e-06,
"loss": 0.2448,
"step": 1895
},
{
"epoch": 0.41924095322153576,
"grad_norm": 0.5163640975952148,
"learning_rate": 9.320976284653877e-06,
"loss": 0.2289,
"step": 1900
},
{
"epoch": 0.41924095322153576,
"eval_loss": 0.25212275981903076,
"eval_runtime": 270.6499,
"eval_samples_per_second": 56.394,
"eval_steps_per_second": 7.05,
"step": 1900
},
{
"epoch": 0.4203442188879082,
"grad_norm": 0.46562832593917847,
"learning_rate": 9.316379337095671e-06,
"loss": 0.255,
"step": 1905
},
{
"epoch": 0.42144748455428066,
"grad_norm": 0.3981192708015442,
"learning_rate": 9.311768022972682e-06,
"loss": 0.2455,
"step": 1910
},
{
"epoch": 0.42255075022065314,
"grad_norm": 0.4480000138282776,
"learning_rate": 9.307142357633132e-06,
"loss": 0.2437,
"step": 1915
},
{
"epoch": 0.4236540158870256,
"grad_norm": 0.4353036880493164,
"learning_rate": 9.302502356473006e-06,
"loss": 0.2435,
"step": 1920
},
{
"epoch": 0.42475728155339804,
"grad_norm": 0.42388132214546204,
"learning_rate": 9.297848034936007e-06,
"loss": 0.2458,
"step": 1925
},
{
"epoch": 0.4258605472197705,
"grad_norm": 0.5140712261199951,
"learning_rate": 9.293179408513501e-06,
"loss": 0.2469,
"step": 1930
},
{
"epoch": 0.426963812886143,
"grad_norm": 0.5060368180274963,
"learning_rate": 9.288496492744466e-06,
"loss": 0.2499,
"step": 1935
},
{
"epoch": 0.42806707855251547,
"grad_norm": 0.43134334683418274,
"learning_rate": 9.283799303215442e-06,
"loss": 0.233,
"step": 1940
},
{
"epoch": 0.4291703442188879,
"grad_norm": 0.48315203189849854,
"learning_rate": 9.279087855560474e-06,
"loss": 0.2457,
"step": 1945
},
{
"epoch": 0.43027360988526037,
"grad_norm": 0.4424877166748047,
"learning_rate": 9.274362165461064e-06,
"loss": 0.2402,
"step": 1950
},
{
"epoch": 0.43137687555163284,
"grad_norm": 0.47104790806770325,
"learning_rate": 9.269622248646124e-06,
"loss": 0.2419,
"step": 1955
},
{
"epoch": 0.4324801412180053,
"grad_norm": 0.4866120517253876,
"learning_rate": 9.264868120891913e-06,
"loss": 0.2428,
"step": 1960
},
{
"epoch": 0.43358340688437774,
"grad_norm": 0.47548824548721313,
"learning_rate": 9.260099798021988e-06,
"loss": 0.2355,
"step": 1965
},
{
"epoch": 0.4346866725507502,
"grad_norm": 0.4570111930370331,
"learning_rate": 9.255317295907158e-06,
"loss": 0.2509,
"step": 1970
},
{
"epoch": 0.4357899382171227,
"grad_norm": 0.5114912986755371,
"learning_rate": 9.250520630465419e-06,
"loss": 0.2409,
"step": 1975
},
{
"epoch": 0.4368932038834951,
"grad_norm": 0.38849082589149475,
"learning_rate": 9.245709817661917e-06,
"loss": 0.2413,
"step": 1980
},
{
"epoch": 0.4379964695498676,
"grad_norm": 0.5250911712646484,
"learning_rate": 9.240884873508876e-06,
"loss": 0.2416,
"step": 1985
},
{
"epoch": 0.4390997352162401,
"grad_norm": 0.43927446007728577,
"learning_rate": 9.236045814065563e-06,
"loss": 0.2399,
"step": 1990
},
{
"epoch": 0.44020300088261255,
"grad_norm": 0.5229560136795044,
"learning_rate": 9.231192655438222e-06,
"loss": 0.2536,
"step": 1995
},
{
"epoch": 0.44130626654898497,
"grad_norm": 0.5083780884742737,
"learning_rate": 9.226325413780021e-06,
"loss": 0.2324,
"step": 2000
},
{
"epoch": 0.44130626654898497,
"eval_loss": 0.25146690011024475,
"eval_runtime": 270.7254,
"eval_samples_per_second": 56.378,
"eval_steps_per_second": 7.048,
"step": 2000
},
{
"epoch": 0.44240953221535745,
"grad_norm": 0.4707069396972656,
"learning_rate": 9.221444105291013e-06,
"loss": 0.2594,
"step": 2005
},
{
"epoch": 0.4435127978817299,
"grad_norm": 0.4856661260128021,
"learning_rate": 9.216548746218056e-06,
"loss": 0.2493,
"step": 2010
},
{
"epoch": 0.4446160635481024,
"grad_norm": 0.4883829951286316,
"learning_rate": 9.211639352854786e-06,
"loss": 0.2468,
"step": 2015
},
{
"epoch": 0.4457193292144748,
"grad_norm": 0.4665009379386902,
"learning_rate": 9.206715941541547e-06,
"loss": 0.2519,
"step": 2020
},
{
"epoch": 0.4468225948808473,
"grad_norm": 0.45250222086906433,
"learning_rate": 9.201778528665333e-06,
"loss": 0.2436,
"step": 2025
},
{
"epoch": 0.4479258605472198,
"grad_norm": 0.457640677690506,
"learning_rate": 9.196827130659752e-06,
"loss": 0.2575,
"step": 2030
},
{
"epoch": 0.44902912621359226,
"grad_norm": 0.3947480320930481,
"learning_rate": 9.19186176400495e-06,
"loss": 0.2521,
"step": 2035
},
{
"epoch": 0.4501323918799647,
"grad_norm": 0.5039179921150208,
"learning_rate": 9.186882445227572e-06,
"loss": 0.2464,
"step": 2040
},
{
"epoch": 0.45123565754633715,
"grad_norm": 0.46842432022094727,
"learning_rate": 9.181889190900702e-06,
"loss": 0.2603,
"step": 2045
},
{
"epoch": 0.45233892321270963,
"grad_norm": 0.40255504846572876,
"learning_rate": 9.1768820176438e-06,
"loss": 0.242,
"step": 2050
},
{
"epoch": 0.4534421888790821,
"grad_norm": 0.4471176564693451,
"learning_rate": 9.17186094212266e-06,
"loss": 0.2349,
"step": 2055
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.4603224992752075,
"learning_rate": 9.166825981049345e-06,
"loss": 0.2449,
"step": 2060
},
{
"epoch": 0.455648720211827,
"grad_norm": 0.4398849308490753,
"learning_rate": 9.161777151182137e-06,
"loss": 0.2346,
"step": 2065
},
{
"epoch": 0.4567519858781995,
"grad_norm": 0.54154372215271,
"learning_rate": 9.156714469325474e-06,
"loss": 0.2469,
"step": 2070
},
{
"epoch": 0.4578552515445719,
"grad_norm": 0.5123459696769714,
"learning_rate": 9.151637952329903e-06,
"loss": 0.253,
"step": 2075
},
{
"epoch": 0.4589585172109444,
"grad_norm": 0.40907183289527893,
"learning_rate": 9.14654761709202e-06,
"loss": 0.2467,
"step": 2080
},
{
"epoch": 0.46006178287731686,
"grad_norm": 0.47456130385398865,
"learning_rate": 9.141443480554408e-06,
"loss": 0.2492,
"step": 2085
},
{
"epoch": 0.46116504854368934,
"grad_norm": 0.4863825738430023,
"learning_rate": 9.136325559705593e-06,
"loss": 0.2416,
"step": 2090
},
{
"epoch": 0.46226831421006176,
"grad_norm": 0.41943055391311646,
"learning_rate": 9.131193871579975e-06,
"loss": 0.2352,
"step": 2095
},
{
"epoch": 0.46337157987643424,
"grad_norm": 0.49511662125587463,
"learning_rate": 9.12604843325778e-06,
"loss": 0.2425,
"step": 2100
},
{
"epoch": 0.46337157987643424,
"eval_loss": 0.25082722306251526,
"eval_runtime": 268.229,
"eval_samples_per_second": 56.903,
"eval_steps_per_second": 7.113,
"step": 2100
},
{
"epoch": 0.4644748455428067,
"grad_norm": 0.4573848247528076,
"learning_rate": 9.120889261864999e-06,
"loss": 0.2586,
"step": 2105
},
{
"epoch": 0.4655781112091792,
"grad_norm": 0.48330816626548767,
"learning_rate": 9.11571637457333e-06,
"loss": 0.2471,
"step": 2110
},
{
"epoch": 0.4666813768755516,
"grad_norm": 0.5316815376281738,
"learning_rate": 9.110529788600127e-06,
"loss": 0.2398,
"step": 2115
},
{
"epoch": 0.4677846425419241,
"grad_norm": 0.45236679911613464,
"learning_rate": 9.105329521208334e-06,
"loss": 0.2471,
"step": 2120
},
{
"epoch": 0.46888790820829657,
"grad_norm": 0.4722297787666321,
"learning_rate": 9.100115589706436e-06,
"loss": 0.2428,
"step": 2125
},
{
"epoch": 0.46999117387466904,
"grad_norm": 0.4543675482273102,
"learning_rate": 9.094888011448391e-06,
"loss": 0.2516,
"step": 2130
},
{
"epoch": 0.47109443954104147,
"grad_norm": 0.4152880907058716,
"learning_rate": 9.089646803833589e-06,
"loss": 0.225,
"step": 2135
},
{
"epoch": 0.47219770520741394,
"grad_norm": 0.44709253311157227,
"learning_rate": 9.084391984306775e-06,
"loss": 0.2456,
"step": 2140
},
{
"epoch": 0.4733009708737864,
"grad_norm": 0.5279027819633484,
"learning_rate": 9.079123570358e-06,
"loss": 0.2415,
"step": 2145
},
{
"epoch": 0.4744042365401589,
"grad_norm": 0.4792356491088867,
"learning_rate": 9.073841579522571e-06,
"loss": 0.2543,
"step": 2150
},
{
"epoch": 0.4755075022065313,
"grad_norm": 0.45700347423553467,
"learning_rate": 9.068546029380971e-06,
"loss": 0.2593,
"step": 2155
},
{
"epoch": 0.4766107678729038,
"grad_norm": 0.5032837986946106,
"learning_rate": 9.063236937558826e-06,
"loss": 0.2528,
"step": 2160
},
{
"epoch": 0.4777140335392763,
"grad_norm": 0.48134273290634155,
"learning_rate": 9.057914321726824e-06,
"loss": 0.2553,
"step": 2165
},
{
"epoch": 0.4788172992056487,
"grad_norm": 0.45645344257354736,
"learning_rate": 9.052578199600675e-06,
"loss": 0.2387,
"step": 2170
},
{
"epoch": 0.47992056487202117,
"grad_norm": 0.4026988744735718,
"learning_rate": 9.047228588941034e-06,
"loss": 0.228,
"step": 2175
},
{
"epoch": 0.48102383053839365,
"grad_norm": 0.4304678440093994,
"learning_rate": 9.041865507553458e-06,
"loss": 0.2513,
"step": 2180
},
{
"epoch": 0.4821270962047661,
"grad_norm": 0.4108814299106598,
"learning_rate": 9.036488973288339e-06,
"loss": 0.238,
"step": 2185
},
{
"epoch": 0.48323036187113855,
"grad_norm": 0.4733142852783203,
"learning_rate": 9.031099004040841e-06,
"loss": 0.2506,
"step": 2190
},
{
"epoch": 0.484333627537511,
"grad_norm": 0.4324648380279541,
"learning_rate": 9.025695617750848e-06,
"loss": 0.2243,
"step": 2195
},
{
"epoch": 0.4854368932038835,
"grad_norm": 0.4637969136238098,
"learning_rate": 9.020278832402902e-06,
"loss": 0.2545,
"step": 2200
},
{
"epoch": 0.4854368932038835,
"eval_loss": 0.25019994378089905,
"eval_runtime": 271.9518,
"eval_samples_per_second": 56.124,
"eval_steps_per_second": 7.016,
"step": 2200
},
{
"epoch": 0.486540158870256,
"grad_norm": 0.4881959855556488,
"learning_rate": 9.014848666026138e-06,
"loss": 0.2467,
"step": 2205
},
{
"epoch": 0.4876434245366284,
"grad_norm": 0.44622328877449036,
"learning_rate": 9.009405136694234e-06,
"loss": 0.2512,
"step": 2210
},
{
"epoch": 0.4887466902030009,
"grad_norm": 0.4718511700630188,
"learning_rate": 9.003948262525341e-06,
"loss": 0.247,
"step": 2215
},
{
"epoch": 0.48984995586937335,
"grad_norm": 0.4678577780723572,
"learning_rate": 8.998478061682025e-06,
"loss": 0.2301,
"step": 2220
},
{
"epoch": 0.49095322153574583,
"grad_norm": 0.44579043984413147,
"learning_rate": 8.992994552371217e-06,
"loss": 0.2513,
"step": 2225
},
{
"epoch": 0.49205648720211825,
"grad_norm": 0.4663316309452057,
"learning_rate": 8.987497752844132e-06,
"loss": 0.2568,
"step": 2230
},
{
"epoch": 0.49315975286849073,
"grad_norm": 0.5495996475219727,
"learning_rate": 8.981987681396226e-06,
"loss": 0.2626,
"step": 2235
},
{
"epoch": 0.4942630185348632,
"grad_norm": 0.4634556174278259,
"learning_rate": 8.976464356367133e-06,
"loss": 0.2523,
"step": 2240
},
{
"epoch": 0.4953662842012357,
"grad_norm": 0.4804018437862396,
"learning_rate": 8.970927796140592e-06,
"loss": 0.2323,
"step": 2245
},
{
"epoch": 0.4964695498676081,
"grad_norm": 0.4309288561344147,
"learning_rate": 8.965378019144397e-06,
"loss": 0.2432,
"step": 2250
},
{
"epoch": 0.4975728155339806,
"grad_norm": 0.4046125113964081,
"learning_rate": 8.959815043850336e-06,
"loss": 0.228,
"step": 2255
},
{
"epoch": 0.49867608120035306,
"grad_norm": 0.4361225366592407,
"learning_rate": 8.95423888877412e-06,
"loss": 0.2398,
"step": 2260
},
{
"epoch": 0.4997793468667255,
"grad_norm": 0.4338124096393585,
"learning_rate": 8.948649572475332e-06,
"loss": 0.2471,
"step": 2265
},
{
"epoch": 0.500882612533098,
"grad_norm": 0.4460661709308624,
"learning_rate": 8.943047113557358e-06,
"loss": 0.2525,
"step": 2270
},
{
"epoch": 0.5019858781994704,
"grad_norm": 0.43851226568222046,
"learning_rate": 8.937431530667329e-06,
"loss": 0.2412,
"step": 2275
},
{
"epoch": 0.5030891438658429,
"grad_norm": 0.4404292702674866,
"learning_rate": 8.931802842496056e-06,
"loss": 0.2467,
"step": 2280
},
{
"epoch": 0.5041924095322153,
"grad_norm": 0.5048953890800476,
"learning_rate": 8.926161067777973e-06,
"loss": 0.2503,
"step": 2285
},
{
"epoch": 0.5052956751985879,
"grad_norm": 0.48402130603790283,
"learning_rate": 8.920506225291067e-06,
"loss": 0.2441,
"step": 2290
},
{
"epoch": 0.5063989408649603,
"grad_norm": 0.409432977437973,
"learning_rate": 8.914838333856822e-06,
"loss": 0.2388,
"step": 2295
},
{
"epoch": 0.5075022065313327,
"grad_norm": 0.41323322057724,
"learning_rate": 8.90915741234015e-06,
"loss": 0.2372,
"step": 2300
},
{
"epoch": 0.5075022065313327,
"eval_loss": 0.24960678815841675,
"eval_runtime": 272.2362,
"eval_samples_per_second": 56.065,
"eval_steps_per_second": 7.009,
"step": 2300
},
{
"epoch": 0.5086054721977052,
"grad_norm": 0.5443533062934875,
"learning_rate": 8.90346347964934e-06,
"loss": 0.2311,
"step": 2305
},
{
"epoch": 0.5097087378640777,
"grad_norm": 0.42860737442970276,
"learning_rate": 8.897756554735976e-06,
"loss": 0.2537,
"step": 2310
},
{
"epoch": 0.5108120035304501,
"grad_norm": 0.4304381012916565,
"learning_rate": 8.892036656594898e-06,
"loss": 0.2366,
"step": 2315
},
{
"epoch": 0.5119152691968226,
"grad_norm": 0.5057708024978638,
"learning_rate": 8.886303804264117e-06,
"loss": 0.2362,
"step": 2320
},
{
"epoch": 0.513018534863195,
"grad_norm": 0.48017001152038574,
"learning_rate": 8.88055801682476e-06,
"loss": 0.2493,
"step": 2325
},
{
"epoch": 0.5141218005295676,
"grad_norm": 0.441488653421402,
"learning_rate": 8.874799313401014e-06,
"loss": 0.2413,
"step": 2330
},
{
"epoch": 0.51522506619594,
"grad_norm": 0.5098276138305664,
"learning_rate": 8.86902771316005e-06,
"loss": 0.2496,
"step": 2335
},
{
"epoch": 0.5163283318623124,
"grad_norm": 0.43526649475097656,
"learning_rate": 8.863243235311964e-06,
"loss": 0.2452,
"step": 2340
},
{
"epoch": 0.517431597528685,
"grad_norm": 0.48061615228652954,
"learning_rate": 8.857445899109716e-06,
"loss": 0.2521,
"step": 2345
},
{
"epoch": 0.5185348631950574,
"grad_norm": 0.425627201795578,
"learning_rate": 8.851635723849062e-06,
"loss": 0.251,
"step": 2350
},
{
"epoch": 0.5196381288614298,
"grad_norm": 0.39612120389938354,
"learning_rate": 8.845812728868496e-06,
"loss": 0.2366,
"step": 2355
},
{
"epoch": 0.5207413945278023,
"grad_norm": 0.43580201268196106,
"learning_rate": 8.839976933549173e-06,
"loss": 0.2501,
"step": 2360
},
{
"epoch": 0.5218446601941747,
"grad_norm": 0.3925994038581848,
"learning_rate": 8.834128357314856e-06,
"loss": 0.2356,
"step": 2365
},
{
"epoch": 0.5229479258605472,
"grad_norm": 0.4675627648830414,
"learning_rate": 8.828267019631852e-06,
"loss": 0.2439,
"step": 2370
},
{
"epoch": 0.5240511915269197,
"grad_norm": 0.5115921497344971,
"learning_rate": 8.822392940008937e-06,
"loss": 0.2434,
"step": 2375
},
{
"epoch": 0.5251544571932921,
"grad_norm": 0.5380107760429382,
"learning_rate": 8.8165061379973e-06,
"loss": 0.2476,
"step": 2380
},
{
"epoch": 0.5262577228596647,
"grad_norm": 0.541187047958374,
"learning_rate": 8.810606633190475e-06,
"loss": 0.2397,
"step": 2385
},
{
"epoch": 0.5273609885260371,
"grad_norm": 0.49486243724823,
"learning_rate": 8.804694445224274e-06,
"loss": 0.2548,
"step": 2390
},
{
"epoch": 0.5284642541924095,
"grad_norm": 0.5872311592102051,
"learning_rate": 8.798769593776723e-06,
"loss": 0.239,
"step": 2395
},
{
"epoch": 0.529567519858782,
"grad_norm": 0.48262667655944824,
"learning_rate": 8.792832098568002e-06,
"loss": 0.2328,
"step": 2400
},
{
"epoch": 0.529567519858782,
"eval_loss": 0.24928364157676697,
"eval_runtime": 271.3099,
"eval_samples_per_second": 56.257,
"eval_steps_per_second": 7.033,
"step": 2400
},
{
"epoch": 0.5306707855251545,
"grad_norm": 0.40170180797576904,
"learning_rate": 8.786881979360368e-06,
"loss": 0.2564,
"step": 2405
},
{
"epoch": 0.5317740511915269,
"grad_norm": 0.44170036911964417,
"learning_rate": 8.7809192559581e-06,
"loss": 0.2413,
"step": 2410
},
{
"epoch": 0.5328773168578994,
"grad_norm": 0.4831240177154541,
"learning_rate": 8.774943948207427e-06,
"loss": 0.2391,
"step": 2415
},
{
"epoch": 0.5339805825242718,
"grad_norm": 0.39944949746131897,
"learning_rate": 8.76895607599646e-06,
"loss": 0.2361,
"step": 2420
},
{
"epoch": 0.5350838481906444,
"grad_norm": 0.4743267595767975,
"learning_rate": 8.762955659255137e-06,
"loss": 0.2516,
"step": 2425
},
{
"epoch": 0.5361871138570168,
"grad_norm": 0.4756656289100647,
"learning_rate": 8.756942717955142e-06,
"loss": 0.2565,
"step": 2430
},
{
"epoch": 0.5372903795233892,
"grad_norm": 0.45802560448646545,
"learning_rate": 8.750917272109849e-06,
"loss": 0.2386,
"step": 2435
},
{
"epoch": 0.5383936451897617,
"grad_norm": 0.45499125123023987,
"learning_rate": 8.744879341774251e-06,
"loss": 0.2397,
"step": 2440
},
{
"epoch": 0.5394969108561342,
"grad_norm": 0.3336021900177002,
"learning_rate": 8.738828947044895e-06,
"loss": 0.236,
"step": 2445
},
{
"epoch": 0.5406001765225066,
"grad_norm": 0.4355071485042572,
"learning_rate": 8.732766108059814e-06,
"loss": 0.2363,
"step": 2450
},
{
"epoch": 0.5417034421888791,
"grad_norm": 0.4942583441734314,
"learning_rate": 8.726690844998457e-06,
"loss": 0.2301,
"step": 2455
},
{
"epoch": 0.5428067078552515,
"grad_norm": 0.4470270574092865,
"learning_rate": 8.720603178081632e-06,
"loss": 0.2357,
"step": 2460
},
{
"epoch": 0.543909973521624,
"grad_norm": 0.4445681571960449,
"learning_rate": 8.714503127571425e-06,
"loss": 0.2558,
"step": 2465
},
{
"epoch": 0.5450132391879965,
"grad_norm": 0.5611670613288879,
"learning_rate": 8.708390713771145e-06,
"loss": 0.2444,
"step": 2470
},
{
"epoch": 0.5461165048543689,
"grad_norm": 0.37664106488227844,
"learning_rate": 8.702265957025241e-06,
"loss": 0.2511,
"step": 2475
},
{
"epoch": 0.5472197705207414,
"grad_norm": 0.5034042596817017,
"learning_rate": 8.696128877719258e-06,
"loss": 0.2483,
"step": 2480
},
{
"epoch": 0.5483230361871139,
"grad_norm": 0.4550327956676483,
"learning_rate": 8.689979496279747e-06,
"loss": 0.2404,
"step": 2485
},
{
"epoch": 0.5494263018534863,
"grad_norm": 0.4192756712436676,
"learning_rate": 8.683817833174204e-06,
"loss": 0.2272,
"step": 2490
},
{
"epoch": 0.5505295675198588,
"grad_norm": 0.49941694736480713,
"learning_rate": 8.677643908911007e-06,
"loss": 0.2461,
"step": 2495
},
{
"epoch": 0.5516328331862312,
"grad_norm": 0.48723432421684265,
"learning_rate": 8.67145774403934e-06,
"loss": 0.2359,
"step": 2500
},
{
"epoch": 0.5516328331862312,
"eval_loss": 0.24847546219825745,
"eval_runtime": 268.7693,
"eval_samples_per_second": 56.788,
"eval_steps_per_second": 7.099,
"step": 2500
},
{
"epoch": 0.5527360988526037,
"grad_norm": 0.4447115957736969,
"learning_rate": 8.665259359149132e-06,
"loss": 0.244,
"step": 2505
},
{
"epoch": 0.5538393645189762,
"grad_norm": 0.46144431829452515,
"learning_rate": 8.659048774870986e-06,
"loss": 0.2509,
"step": 2510
},
{
"epoch": 0.5549426301853486,
"grad_norm": 0.41772812604904175,
"learning_rate": 8.652826011876104e-06,
"loss": 0.2422,
"step": 2515
},
{
"epoch": 0.556045895851721,
"grad_norm": 0.45326176285743713,
"learning_rate": 8.646591090876225e-06,
"loss": 0.241,
"step": 2520
},
{
"epoch": 0.5571491615180936,
"grad_norm": 0.4441646337509155,
"learning_rate": 8.64034403262356e-06,
"loss": 0.2445,
"step": 2525
},
{
"epoch": 0.558252427184466,
"grad_norm": 0.5038093328475952,
"learning_rate": 8.634084857910709e-06,
"loss": 0.2478,
"step": 2530
},
{
"epoch": 0.5593556928508385,
"grad_norm": 0.4078108072280884,
"learning_rate": 8.627813587570609e-06,
"loss": 0.255,
"step": 2535
},
{
"epoch": 0.560458958517211,
"grad_norm": 0.4333765506744385,
"learning_rate": 8.621530242476446e-06,
"loss": 0.2438,
"step": 2540
},
{
"epoch": 0.5615622241835834,
"grad_norm": 0.5008811354637146,
"learning_rate": 8.615234843541606e-06,
"loss": 0.2388,
"step": 2545
},
{
"epoch": 0.5626654898499559,
"grad_norm": 0.3749838173389435,
"learning_rate": 8.608927411719585e-06,
"loss": 0.2422,
"step": 2550
},
{
"epoch": 0.5637687555163283,
"grad_norm": 0.5079739093780518,
"learning_rate": 8.602607968003935e-06,
"loss": 0.2367,
"step": 2555
},
{
"epoch": 0.5648720211827007,
"grad_norm": 0.40866804122924805,
"learning_rate": 8.59627653342819e-06,
"loss": 0.2598,
"step": 2560
},
{
"epoch": 0.5659752868490733,
"grad_norm": 0.3951939642429352,
"learning_rate": 8.589933129065786e-06,
"loss": 0.2316,
"step": 2565
},
{
"epoch": 0.5670785525154457,
"grad_norm": 0.41789600253105164,
"learning_rate": 8.583577776030005e-06,
"loss": 0.2412,
"step": 2570
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.5892974138259888,
"learning_rate": 8.5772104954739e-06,
"loss": 0.2441,
"step": 2575
},
{
"epoch": 0.5692850838481907,
"grad_norm": 0.46684080362319946,
"learning_rate": 8.570831308590219e-06,
"loss": 0.2437,
"step": 2580
},
{
"epoch": 0.5703883495145631,
"grad_norm": 0.5170934796333313,
"learning_rate": 8.564440236611344e-06,
"loss": 0.2436,
"step": 2585
},
{
"epoch": 0.5714916151809356,
"grad_norm": 0.5239847302436829,
"learning_rate": 8.558037300809209e-06,
"loss": 0.2458,
"step": 2590
},
{
"epoch": 0.572594880847308,
"grad_norm": 0.4109562933444977,
"learning_rate": 8.551622522495238e-06,
"loss": 0.2492,
"step": 2595
},
{
"epoch": 0.5736981465136805,
"grad_norm": 0.40857604146003723,
"learning_rate": 8.545195923020273e-06,
"loss": 0.24,
"step": 2600
},
{
"epoch": 0.5736981465136805,
"eval_loss": 0.2479788064956665,
"eval_runtime": 270.4896,
"eval_samples_per_second": 56.427,
"eval_steps_per_second": 7.054,
"step": 2600
},
{
"epoch": 0.574801412180053,
"grad_norm": 0.47509685158729553,
"learning_rate": 8.538757523774503e-06,
"loss": 0.2835,
"step": 2605
},
{
"epoch": 0.5759046778464254,
"grad_norm": 0.47140100598335266,
"learning_rate": 8.532307346187384e-06,
"loss": 0.2372,
"step": 2610
},
{
"epoch": 0.5770079435127978,
"grad_norm": 0.4311586618423462,
"learning_rate": 8.525845411727581e-06,
"loss": 0.2446,
"step": 2615
},
{
"epoch": 0.5781112091791704,
"grad_norm": 0.44329634308815,
"learning_rate": 8.519371741902888e-06,
"loss": 0.2419,
"step": 2620
},
{
"epoch": 0.5792144748455428,
"grad_norm": 0.4547870457172394,
"learning_rate": 8.512886358260162e-06,
"loss": 0.2398,
"step": 2625
},
{
"epoch": 0.5803177405119153,
"grad_norm": 0.47059762477874756,
"learning_rate": 8.506389282385242e-06,
"loss": 0.2512,
"step": 2630
},
{
"epoch": 0.5814210061782877,
"grad_norm": 0.49984219670295715,
"learning_rate": 8.499880535902885e-06,
"loss": 0.2575,
"step": 2635
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.4017459452152252,
"learning_rate": 8.493360140476699e-06,
"loss": 0.2352,
"step": 2640
},
{
"epoch": 0.5836275375110327,
"grad_norm": 0.3862455487251282,
"learning_rate": 8.486828117809057e-06,
"loss": 0.2317,
"step": 2645
},
{
"epoch": 0.5847308031774051,
"grad_norm": 0.4500201642513275,
"learning_rate": 8.480284489641034e-06,
"loss": 0.2385,
"step": 2650
},
{
"epoch": 0.5858340688437775,
"grad_norm": 0.5192285776138306,
"learning_rate": 8.473729277752331e-06,
"loss": 0.2426,
"step": 2655
},
{
"epoch": 0.5869373345101501,
"grad_norm": 0.41023924946784973,
"learning_rate": 8.467162503961209e-06,
"loss": 0.2346,
"step": 2660
},
{
"epoch": 0.5880406001765225,
"grad_norm": 0.44286391139030457,
"learning_rate": 8.460584190124405e-06,
"loss": 0.246,
"step": 2665
},
{
"epoch": 0.589143865842895,
"grad_norm": 0.45593389868736267,
"learning_rate": 8.45399435813707e-06,
"loss": 0.2371,
"step": 2670
},
{
"epoch": 0.5902471315092674,
"grad_norm": 0.4817209839820862,
"learning_rate": 8.447393029932692e-06,
"loss": 0.2376,
"step": 2675
},
{
"epoch": 0.5913503971756399,
"grad_norm": 0.4856320917606354,
"learning_rate": 8.440780227483016e-06,
"loss": 0.2451,
"step": 2680
},
{
"epoch": 0.5924536628420124,
"grad_norm": 0.42935821413993835,
"learning_rate": 8.43415597279799e-06,
"loss": 0.2422,
"step": 2685
},
{
"epoch": 0.5935569285083848,
"grad_norm": 0.45911160111427307,
"learning_rate": 8.427520287925669e-06,
"loss": 0.2397,
"step": 2690
},
{
"epoch": 0.5946601941747572,
"grad_norm": 0.4541209638118744,
"learning_rate": 8.420873194952153e-06,
"loss": 0.2392,
"step": 2695
},
{
"epoch": 0.5957634598411298,
"grad_norm": 0.4879801273345947,
"learning_rate": 8.414214716001519e-06,
"loss": 0.2479,
"step": 2700
},
{
"epoch": 0.5957634598411298,
"eval_loss": 0.24747207760810852,
"eval_runtime": 270.41,
"eval_samples_per_second": 56.444,
"eval_steps_per_second": 7.056,
"step": 2700
},
{
"epoch": 0.5968667255075022,
"grad_norm": 0.4193342328071594,
"learning_rate": 8.407544873235736e-06,
"loss": 0.2482,
"step": 2705
},
{
"epoch": 0.5979699911738746,
"grad_norm": 0.45270466804504395,
"learning_rate": 8.400863688854598e-06,
"loss": 0.2469,
"step": 2710
},
{
"epoch": 0.5990732568402471,
"grad_norm": 0.4806990623474121,
"learning_rate": 8.394171185095646e-06,
"loss": 0.2442,
"step": 2715
},
{
"epoch": 0.6001765225066196,
"grad_norm": 0.4336373209953308,
"learning_rate": 8.387467384234096e-06,
"loss": 0.2335,
"step": 2720
},
{
"epoch": 0.6012797881729921,
"grad_norm": 0.4600653648376465,
"learning_rate": 8.38075230858277e-06,
"loss": 0.2365,
"step": 2725
},
{
"epoch": 0.6023830538393645,
"grad_norm": 0.5015227198600769,
"learning_rate": 8.37402598049201e-06,
"loss": 0.2502,
"step": 2730
},
{
"epoch": 0.603486319505737,
"grad_norm": 0.552075207233429,
"learning_rate": 8.367288422349617e-06,
"loss": 0.2403,
"step": 2735
},
{
"epoch": 0.6045895851721095,
"grad_norm": 0.4628050923347473,
"learning_rate": 8.360539656580768e-06,
"loss": 0.2294,
"step": 2740
},
{
"epoch": 0.6056928508384819,
"grad_norm": 0.48730120062828064,
"learning_rate": 8.353779705647936e-06,
"loss": 0.2397,
"step": 2745
},
{
"epoch": 0.6067961165048543,
"grad_norm": 0.40887823700904846,
"learning_rate": 8.347008592050834e-06,
"loss": 0.2491,
"step": 2750
},
{
"epoch": 0.6078993821712269,
"grad_norm": 0.4346201419830322,
"learning_rate": 8.340226338326321e-06,
"loss": 0.2436,
"step": 2755
},
{
"epoch": 0.6090026478375993,
"grad_norm": 0.38802462816238403,
"learning_rate": 8.333432967048339e-06,
"loss": 0.2379,
"step": 2760
},
{
"epoch": 0.6101059135039718,
"grad_norm": 0.3992108404636383,
"learning_rate": 8.326628500827826e-06,
"loss": 0.2338,
"step": 2765
},
{
"epoch": 0.6112091791703442,
"grad_norm": 0.44411781430244446,
"learning_rate": 8.319812962312662e-06,
"loss": 0.2301,
"step": 2770
},
{
"epoch": 0.6123124448367167,
"grad_norm": 0.42220309376716614,
"learning_rate": 8.312986374187563e-06,
"loss": 0.238,
"step": 2775
},
{
"epoch": 0.6134157105030892,
"grad_norm": 0.47081899642944336,
"learning_rate": 8.306148759174036e-06,
"loss": 0.2536,
"step": 2780
},
{
"epoch": 0.6145189761694616,
"grad_norm": 0.4740568995475769,
"learning_rate": 8.299300140030283e-06,
"loss": 0.2494,
"step": 2785
},
{
"epoch": 0.615622241835834,
"grad_norm": 0.41311606764793396,
"learning_rate": 8.292440539551132e-06,
"loss": 0.2443,
"step": 2790
},
{
"epoch": 0.6167255075022066,
"grad_norm": 0.43672001361846924,
"learning_rate": 8.285569980567965e-06,
"loss": 0.2386,
"step": 2795
},
{
"epoch": 0.617828773168579,
"grad_norm": 0.45961281657218933,
"learning_rate": 8.278688485948634e-06,
"loss": 0.2471,
"step": 2800
},
{
"epoch": 0.617828773168579,
"eval_loss": 0.24707245826721191,
"eval_runtime": 271.8779,
"eval_samples_per_second": 56.139,
"eval_steps_per_second": 7.018,
"step": 2800
},
{
"epoch": 0.6189320388349514,
"grad_norm": 0.41084882616996765,
"learning_rate": 8.27179607859739e-06,
"loss": 0.2353,
"step": 2805
},
{
"epoch": 0.6200353045013239,
"grad_norm": 0.40242356061935425,
"learning_rate": 8.264892781454807e-06,
"loss": 0.2259,
"step": 2810
},
{
"epoch": 0.6211385701676964,
"grad_norm": 0.43410569429397583,
"learning_rate": 8.257978617497706e-06,
"loss": 0.2375,
"step": 2815
},
{
"epoch": 0.6222418358340689,
"grad_norm": 0.49452638626098633,
"learning_rate": 8.25105360973907e-06,
"loss": 0.2472,
"step": 2820
},
{
"epoch": 0.6233451015004413,
"grad_norm": 0.5670908689498901,
"learning_rate": 8.244117781227982e-06,
"loss": 0.2434,
"step": 2825
},
{
"epoch": 0.6244483671668137,
"grad_norm": 0.4820459485054016,
"learning_rate": 8.237171155049539e-06,
"loss": 0.2393,
"step": 2830
},
{
"epoch": 0.6255516328331863,
"grad_norm": 0.455879271030426,
"learning_rate": 8.230213754324773e-06,
"loss": 0.2269,
"step": 2835
},
{
"epoch": 0.6266548984995587,
"grad_norm": 0.41106078028678894,
"learning_rate": 8.22324560221058e-06,
"loss": 0.2291,
"step": 2840
},
{
"epoch": 0.6277581641659311,
"grad_norm": 0.39285704493522644,
"learning_rate": 8.216266721899642e-06,
"loss": 0.2357,
"step": 2845
},
{
"epoch": 0.6288614298323036,
"grad_norm": 0.3971237242221832,
"learning_rate": 8.209277136620348e-06,
"loss": 0.2444,
"step": 2850
},
{
"epoch": 0.6299646954986761,
"grad_norm": 0.47871458530426025,
"learning_rate": 8.202276869636713e-06,
"loss": 0.2357,
"step": 2855
},
{
"epoch": 0.6310679611650486,
"grad_norm": 0.4452441930770874,
"learning_rate": 8.195265944248315e-06,
"loss": 0.237,
"step": 2860
},
{
"epoch": 0.632171226831421,
"grad_norm": 0.5275673866271973,
"learning_rate": 8.188244383790196e-06,
"loss": 0.2536,
"step": 2865
},
{
"epoch": 0.6332744924977934,
"grad_norm": 0.46344441175460815,
"learning_rate": 8.1812122116328e-06,
"loss": 0.2437,
"step": 2870
},
{
"epoch": 0.634377758164166,
"grad_norm": 0.4381890892982483,
"learning_rate": 8.174169451181893e-06,
"loss": 0.2488,
"step": 2875
},
{
"epoch": 0.6354810238305384,
"grad_norm": 0.41612380743026733,
"learning_rate": 8.167116125878483e-06,
"loss": 0.239,
"step": 2880
},
{
"epoch": 0.6365842894969108,
"grad_norm": 0.4481007754802704,
"learning_rate": 8.160052259198737e-06,
"loss": 0.2395,
"step": 2885
},
{
"epoch": 0.6376875551632833,
"grad_norm": 0.42456308007240295,
"learning_rate": 8.152977874653909e-06,
"loss": 0.2303,
"step": 2890
},
{
"epoch": 0.6387908208296558,
"grad_norm": 0.4462544322013855,
"learning_rate": 8.145892995790269e-06,
"loss": 0.2476,
"step": 2895
},
{
"epoch": 0.6398940864960282,
"grad_norm": 0.37666749954223633,
"learning_rate": 8.138797646189e-06,
"loss": 0.2326,
"step": 2900
},
{
"epoch": 0.6398940864960282,
"eval_loss": 0.24663805961608887,
"eval_runtime": 271.9733,
"eval_samples_per_second": 56.119,
"eval_steps_per_second": 7.015,
"step": 2900
},
{
"epoch": 0.6409973521624007,
"grad_norm": 0.4413769841194153,
"learning_rate": 8.131691849466154e-06,
"loss": 0.2347,
"step": 2905
},
{
"epoch": 0.6421006178287731,
"grad_norm": 0.5462684035301208,
"learning_rate": 8.12457562927254e-06,
"loss": 0.2491,
"step": 2910
},
{
"epoch": 0.6432038834951457,
"grad_norm": 0.47332948446273804,
"learning_rate": 8.117449009293668e-06,
"loss": 0.2387,
"step": 2915
},
{
"epoch": 0.6443071491615181,
"grad_norm": 0.4280896484851837,
"learning_rate": 8.11031201324966e-06,
"loss": 0.2347,
"step": 2920
},
{
"epoch": 0.6454104148278905,
"grad_norm": 0.46633732318878174,
"learning_rate": 8.103164664895179e-06,
"loss": 0.2528,
"step": 2925
},
{
"epoch": 0.646513680494263,
"grad_norm": 0.45853593945503235,
"learning_rate": 8.096006988019331e-06,
"loss": 0.2329,
"step": 2930
},
{
"epoch": 0.6476169461606355,
"grad_norm": 0.4461853802204132,
"learning_rate": 8.088839006445615e-06,
"loss": 0.2436,
"step": 2935
},
{
"epoch": 0.6487202118270079,
"grad_norm": 0.4614443778991699,
"learning_rate": 8.081660744031818e-06,
"loss": 0.2442,
"step": 2940
},
{
"epoch": 0.6498234774933804,
"grad_norm": 0.4097602367401123,
"learning_rate": 8.074472224669952e-06,
"loss": 0.2398,
"step": 2945
},
{
"epoch": 0.6509267431597529,
"grad_norm": 0.5019506216049194,
"learning_rate": 8.067273472286158e-06,
"loss": 0.2488,
"step": 2950
},
{
"epoch": 0.6520300088261254,
"grad_norm": 0.4480745196342468,
"learning_rate": 8.060064510840648e-06,
"loss": 0.2268,
"step": 2955
},
{
"epoch": 0.6531332744924978,
"grad_norm": 0.44799327850341797,
"learning_rate": 8.052845364327609e-06,
"loss": 0.2407,
"step": 2960
},
{
"epoch": 0.6542365401588702,
"grad_norm": 0.4316900670528412,
"learning_rate": 8.045616056775124e-06,
"loss": 0.2449,
"step": 2965
},
{
"epoch": 0.6553398058252428,
"grad_norm": 0.42375341057777405,
"learning_rate": 8.038376612245104e-06,
"loss": 0.2363,
"step": 2970
},
{
"epoch": 0.6564430714916152,
"grad_norm": 0.48923903703689575,
"learning_rate": 8.031127054833192e-06,
"loss": 0.2409,
"step": 2975
},
{
"epoch": 0.6575463371579876,
"grad_norm": 0.41415655612945557,
"learning_rate": 8.023867408668692e-06,
"loss": 0.2335,
"step": 2980
},
{
"epoch": 0.6586496028243601,
"grad_norm": 0.47558680176734924,
"learning_rate": 8.016597697914492e-06,
"loss": 0.2485,
"step": 2985
},
{
"epoch": 0.6597528684907326,
"grad_norm": 0.4345654845237732,
"learning_rate": 8.009317946766975e-06,
"loss": 0.2445,
"step": 2990
},
{
"epoch": 0.660856134157105,
"grad_norm": 0.4340679347515106,
"learning_rate": 8.002028179455941e-06,
"loss": 0.2403,
"step": 2995
},
{
"epoch": 0.6619593998234775,
"grad_norm": 0.45394837856292725,
"learning_rate": 7.994728420244533e-06,
"loss": 0.2516,
"step": 3000
},
{
"epoch": 0.6619593998234775,
"eval_loss": 0.24612218141555786,
"eval_runtime": 271.5712,
"eval_samples_per_second": 56.203,
"eval_steps_per_second": 7.026,
"step": 3000
},
{
"epoch": 0.6630626654898499,
"grad_norm": 0.4266812205314636,
"learning_rate": 7.987418693429145e-06,
"loss": 0.2421,
"step": 3005
},
{
"epoch": 0.6641659311562225,
"grad_norm": 0.44489166140556335,
"learning_rate": 7.98009902333935e-06,
"loss": 0.2249,
"step": 3010
},
{
"epoch": 0.6652691968225949,
"grad_norm": 0.3864096701145172,
"learning_rate": 7.972769434337815e-06,
"loss": 0.238,
"step": 3015
},
{
"epoch": 0.6663724624889673,
"grad_norm": 0.5136005878448486,
"learning_rate": 7.965429950820222e-06,
"loss": 0.233,
"step": 3020
},
{
"epoch": 0.6674757281553398,
"grad_norm": 0.4214404225349426,
"learning_rate": 7.958080597215187e-06,
"loss": 0.2382,
"step": 3025
},
{
"epoch": 0.6685789938217123,
"grad_norm": 0.3963940441608429,
"learning_rate": 7.95072139798417e-06,
"loss": 0.2413,
"step": 3030
},
{
"epoch": 0.6696822594880847,
"grad_norm": 0.46489056944847107,
"learning_rate": 7.943352377621414e-06,
"loss": 0.2405,
"step": 3035
},
{
"epoch": 0.6707855251544572,
"grad_norm": 0.47178593277931213,
"learning_rate": 7.935973560653838e-06,
"loss": 0.2347,
"step": 3040
},
{
"epoch": 0.6718887908208296,
"grad_norm": 0.43894341588020325,
"learning_rate": 7.928584971640974e-06,
"loss": 0.2443,
"step": 3045
},
{
"epoch": 0.6729920564872022,
"grad_norm": 0.39241307973861694,
"learning_rate": 7.92118663517488e-06,
"loss": 0.249,
"step": 3050
},
{
"epoch": 0.6740953221535746,
"grad_norm": 0.4197879731655121,
"learning_rate": 7.913778575880054e-06,
"loss": 0.2229,
"step": 3055
},
{
"epoch": 0.675198587819947,
"grad_norm": 0.4493717551231384,
"learning_rate": 7.906360818413354e-06,
"loss": 0.2385,
"step": 3060
},
{
"epoch": 0.6763018534863195,
"grad_norm": 0.4284787178039551,
"learning_rate": 7.898933387463924e-06,
"loss": 0.2398,
"step": 3065
},
{
"epoch": 0.677405119152692,
"grad_norm": 0.4555470943450928,
"learning_rate": 7.891496307753099e-06,
"loss": 0.2395,
"step": 3070
},
{
"epoch": 0.6785083848190644,
"grad_norm": 0.4759092628955841,
"learning_rate": 7.884049604034331e-06,
"loss": 0.2441,
"step": 3075
},
{
"epoch": 0.6796116504854369,
"grad_norm": 0.421332448720932,
"learning_rate": 7.876593301093104e-06,
"loss": 0.2416,
"step": 3080
},
{
"epoch": 0.6807149161518093,
"grad_norm": 0.5813837051391602,
"learning_rate": 7.869127423746852e-06,
"loss": 0.2387,
"step": 3085
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.4229418933391571,
"learning_rate": 7.861651996844877e-06,
"loss": 0.2359,
"step": 3090
},
{
"epoch": 0.6829214474845543,
"grad_norm": 0.4013502597808838,
"learning_rate": 7.854167045268265e-06,
"loss": 0.2408,
"step": 3095
},
{
"epoch": 0.6840247131509267,
"grad_norm": 0.46940287947654724,
"learning_rate": 7.8466725939298e-06,
"loss": 0.2254,
"step": 3100
},
{
"epoch": 0.6840247131509267,
"eval_loss": 0.24580919742584229,
"eval_runtime": 268.9469,
"eval_samples_per_second": 56.751,
"eval_steps_per_second": 7.094,
"step": 3100
},
{
"epoch": 0.6851279788172993,
"grad_norm": 0.43689653277397156,
"learning_rate": 7.839168667773891e-06,
"loss": 0.248,
"step": 3105
},
{
"epoch": 0.6862312444836717,
"grad_norm": 0.4291225075721741,
"learning_rate": 7.831655291776484e-06,
"loss": 0.2554,
"step": 3110
},
{
"epoch": 0.6873345101500441,
"grad_norm": 0.3945559859275818,
"learning_rate": 7.824132490944968e-06,
"loss": 0.229,
"step": 3115
},
{
"epoch": 0.6884377758164166,
"grad_norm": 0.4158150553703308,
"learning_rate": 7.81660029031811e-06,
"loss": 0.2422,
"step": 3120
},
{
"epoch": 0.689541041482789,
"grad_norm": 0.4768913984298706,
"learning_rate": 7.809058714965962e-06,
"loss": 0.2384,
"step": 3125
},
{
"epoch": 0.6906443071491615,
"grad_norm": 0.4020480811595917,
"learning_rate": 7.801507789989775e-06,
"loss": 0.2327,
"step": 3130
},
{
"epoch": 0.691747572815534,
"grad_norm": 0.4478599429130554,
"learning_rate": 7.793947540521922e-06,
"loss": 0.2507,
"step": 3135
},
{
"epoch": 0.6928508384819064,
"grad_norm": 0.4232751727104187,
"learning_rate": 7.786377991725813e-06,
"loss": 0.2451,
"step": 3140
},
{
"epoch": 0.693954104148279,
"grad_norm": 0.4200434982776642,
"learning_rate": 7.778799168795804e-06,
"loss": 0.2394,
"step": 3145
},
{
"epoch": 0.6950573698146514,
"grad_norm": 0.5268550515174866,
"learning_rate": 7.771211096957125e-06,
"loss": 0.2328,
"step": 3150
},
{
"epoch": 0.6961606354810238,
"grad_norm": 0.48702338337898254,
"learning_rate": 7.763613801465785e-06,
"loss": 0.2417,
"step": 3155
},
{
"epoch": 0.6972639011473963,
"grad_norm": 0.4694213569164276,
"learning_rate": 7.756007307608498e-06,
"loss": 0.2505,
"step": 3160
},
{
"epoch": 0.6983671668137688,
"grad_norm": 0.40955081582069397,
"learning_rate": 7.748391640702588e-06,
"loss": 0.2401,
"step": 3165
},
{
"epoch": 0.6994704324801412,
"grad_norm": 0.45171940326690674,
"learning_rate": 7.740766826095918e-06,
"loss": 0.23,
"step": 3170
},
{
"epoch": 0.7005736981465137,
"grad_norm": 0.4558034837245941,
"learning_rate": 7.733132889166788e-06,
"loss": 0.2417,
"step": 3175
},
{
"epoch": 0.7016769638128861,
"grad_norm": 0.4197918474674225,
"learning_rate": 7.725489855323869e-06,
"loss": 0.2432,
"step": 3180
},
{
"epoch": 0.7027802294792586,
"grad_norm": 0.4640055000782013,
"learning_rate": 7.717837750006106e-06,
"loss": 0.2387,
"step": 3185
},
{
"epoch": 0.7038834951456311,
"grad_norm": 0.41201338171958923,
"learning_rate": 7.710176598682639e-06,
"loss": 0.2253,
"step": 3190
},
{
"epoch": 0.7049867608120035,
"grad_norm": 0.4629385769367218,
"learning_rate": 7.702506426852715e-06,
"loss": 0.2473,
"step": 3195
},
{
"epoch": 0.706090026478376,
"grad_norm": 0.41098853945732117,
"learning_rate": 7.694827260045608e-06,
"loss": 0.2454,
"step": 3200
},
{
"epoch": 0.706090026478376,
"eval_loss": 0.24525980651378632,
"eval_runtime": 270.0878,
"eval_samples_per_second": 56.511,
"eval_steps_per_second": 7.064,
"step": 3200
},
{
"epoch": 0.7071932921447485,
"grad_norm": 0.43668264150619507,
"learning_rate": 7.687139123820526e-06,
"loss": 0.2469,
"step": 3205
},
{
"epoch": 0.7082965578111209,
"grad_norm": 0.3744730055332184,
"learning_rate": 7.679442043766534e-06,
"loss": 0.2336,
"step": 3210
},
{
"epoch": 0.7093998234774934,
"grad_norm": 0.41323620080947876,
"learning_rate": 7.671736045502462e-06,
"loss": 0.2459,
"step": 3215
},
{
"epoch": 0.7105030891438658,
"grad_norm": 0.4456137716770172,
"learning_rate": 7.664021154676828e-06,
"loss": 0.2491,
"step": 3220
},
{
"epoch": 0.7116063548102383,
"grad_norm": 0.4377335011959076,
"learning_rate": 7.656297396967747e-06,
"loss": 0.2395,
"step": 3225
},
{
"epoch": 0.7127096204766108,
"grad_norm": 0.4323022961616516,
"learning_rate": 7.648564798082842e-06,
"loss": 0.2403,
"step": 3230
},
{
"epoch": 0.7138128861429832,
"grad_norm": 0.4660840630531311,
"learning_rate": 7.640823383759169e-06,
"loss": 0.2532,
"step": 3235
},
{
"epoch": 0.7149161518093556,
"grad_norm": 0.4233240485191345,
"learning_rate": 7.63307317976312e-06,
"loss": 0.24,
"step": 3240
},
{
"epoch": 0.7160194174757282,
"grad_norm": 0.48266658186912537,
"learning_rate": 7.625314211890342e-06,
"loss": 0.2426,
"step": 3245
},
{
"epoch": 0.7171226831421006,
"grad_norm": 0.3970603346824646,
"learning_rate": 7.617546505965658e-06,
"loss": 0.2278,
"step": 3250
},
{
"epoch": 0.7182259488084731,
"grad_norm": 0.4572686553001404,
"learning_rate": 7.609770087842969e-06,
"loss": 0.242,
"step": 3255
},
{
"epoch": 0.7193292144748455,
"grad_norm": 0.40155845880508423,
"learning_rate": 7.601984983405173e-06,
"loss": 0.2285,
"step": 3260
},
{
"epoch": 0.720432480141218,
"grad_norm": 0.4204133152961731,
"learning_rate": 7.594191218564084e-06,
"loss": 0.2353,
"step": 3265
},
{
"epoch": 0.7215357458075905,
"grad_norm": 0.4386295676231384,
"learning_rate": 7.586388819260338e-06,
"loss": 0.2228,
"step": 3270
},
{
"epoch": 0.7226390114739629,
"grad_norm": 0.43028250336647034,
"learning_rate": 7.57857781146331e-06,
"loss": 0.2328,
"step": 3275
},
{
"epoch": 0.7237422771403353,
"grad_norm": 0.4137963056564331,
"learning_rate": 7.5707582211710265e-06,
"loss": 0.2406,
"step": 3280
},
{
"epoch": 0.7248455428067079,
"grad_norm": 0.3798144459724426,
"learning_rate": 7.562930074410084e-06,
"loss": 0.2425,
"step": 3285
},
{
"epoch": 0.7259488084730803,
"grad_norm": 0.3895861804485321,
"learning_rate": 7.555093397235553e-06,
"loss": 0.2312,
"step": 3290
},
{
"epoch": 0.7270520741394528,
"grad_norm": 0.44680020213127136,
"learning_rate": 7.5472482157308975e-06,
"loss": 0.23,
"step": 3295
},
{
"epoch": 0.7281553398058253,
"grad_norm": 0.4767349064350128,
"learning_rate": 7.539394556007892e-06,
"loss": 0.2482,
"step": 3300
},
{
"epoch": 0.7281553398058253,
"eval_loss": 0.24486024677753448,
"eval_runtime": 269.7229,
"eval_samples_per_second": 56.588,
"eval_steps_per_second": 7.074,
"step": 3300
},
{
"epoch": 0.7292586054721977,
"grad_norm": 0.41484782099723816,
"learning_rate": 7.531532444206524e-06,
"loss": 0.2333,
"step": 3305
},
{
"epoch": 0.7303618711385702,
"grad_norm": 0.4434278607368469,
"learning_rate": 7.523661906494913e-06,
"loss": 0.2393,
"step": 3310
},
{
"epoch": 0.7314651368049426,
"grad_norm": 0.44898828864097595,
"learning_rate": 7.515782969069229e-06,
"loss": 0.2342,
"step": 3315
},
{
"epoch": 0.732568402471315,
"grad_norm": 0.4551832973957062,
"learning_rate": 7.507895658153594e-06,
"loss": 0.2459,
"step": 3320
},
{
"epoch": 0.7336716681376876,
"grad_norm": 0.464346706867218,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2178,
"step": 3325
},
{
"epoch": 0.73477493380406,
"grad_norm": 0.43785154819488525,
"learning_rate": 7.492096020888227e-06,
"loss": 0.2378,
"step": 3330
},
{
"epoch": 0.7358781994704324,
"grad_norm": 0.4068206548690796,
"learning_rate": 7.484183747125743e-06,
"loss": 0.2302,
"step": 3335
},
{
"epoch": 0.736981465136805,
"grad_norm": 0.40867307782173157,
"learning_rate": 7.476263205047629e-06,
"loss": 0.2403,
"step": 3340
},
{
"epoch": 0.7380847308031774,
"grad_norm": 0.4066210687160492,
"learning_rate": 7.468334421016486e-06,
"loss": 0.2334,
"step": 3345
},
{
"epoch": 0.7391879964695499,
"grad_norm": 0.43838587403297424,
"learning_rate": 7.460397421422346e-06,
"loss": 0.231,
"step": 3350
},
{
"epoch": 0.7402912621359223,
"grad_norm": 0.4866897761821747,
"learning_rate": 7.452452232682585e-06,
"loss": 0.2513,
"step": 3355
},
{
"epoch": 0.7413945278022948,
"grad_norm": 0.5071978569030762,
"learning_rate": 7.444498881241835e-06,
"loss": 0.2495,
"step": 3360
},
{
"epoch": 0.7424977934686673,
"grad_norm": 0.5105838775634766,
"learning_rate": 7.4365373935719e-06,
"loss": 0.242,
"step": 3365
},
{
"epoch": 0.7436010591350397,
"grad_norm": 0.513029932975769,
"learning_rate": 7.428567796171662e-06,
"loss": 0.2468,
"step": 3370
},
{
"epoch": 0.7447043248014121,
"grad_norm": 0.396241158246994,
"learning_rate": 7.420590115566995e-06,
"loss": 0.2283,
"step": 3375
},
{
"epoch": 0.7458075904677847,
"grad_norm": 0.43795159459114075,
"learning_rate": 7.412604378310677e-06,
"loss": 0.2304,
"step": 3380
},
{
"epoch": 0.7469108561341571,
"grad_norm": 0.42680585384368896,
"learning_rate": 7.4046106109823045e-06,
"loss": 0.2294,
"step": 3385
},
{
"epoch": 0.7480141218005296,
"grad_norm": 0.5321316123008728,
"learning_rate": 7.3966088401881975e-06,
"loss": 0.2378,
"step": 3390
},
{
"epoch": 0.749117387466902,
"grad_norm": 0.4479999244213104,
"learning_rate": 7.388599092561315e-06,
"loss": 0.2333,
"step": 3395
},
{
"epoch": 0.7502206531332745,
"grad_norm": 0.5410358309745789,
"learning_rate": 7.380581394761169e-06,
"loss": 0.2429,
"step": 3400
},
{
"epoch": 0.7502206531332745,
"eval_loss": 0.24444225430488586,
"eval_runtime": 273.7667,
"eval_samples_per_second": 55.752,
"eval_steps_per_second": 6.969,
"step": 3400
}
],
"logging_steps": 5,
"max_steps": 9064,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.662905189123752e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}