Files
GPRM-4B/trainer_state.json

3523 lines
85 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4971,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006035913686434284,
"grad_norm": 13.465597639885102,
"learning_rate": 1.8072289156626505e-07,
"loss": 0.8887,
"step": 10
},
{
"epoch": 0.012071827372868568,
"grad_norm": 12.414937705631793,
"learning_rate": 3.8152610441767073e-07,
"loss": 0.8871,
"step": 20
},
{
"epoch": 0.01810774105930285,
"grad_norm": 5.604563580675332,
"learning_rate": 5.823293172690764e-07,
"loss": 0.7943,
"step": 30
},
{
"epoch": 0.024143654745737136,
"grad_norm": 3.082127771205323,
"learning_rate": 7.83132530120482e-07,
"loss": 0.6989,
"step": 40
},
{
"epoch": 0.03017956843217142,
"grad_norm": 1.9253363533227204,
"learning_rate": 9.839357429718876e-07,
"loss": 0.6283,
"step": 50
},
{
"epoch": 0.0362154821186057,
"grad_norm": 1.2352792533570607,
"learning_rate": 1.1847389558232934e-06,
"loss": 0.5916,
"step": 60
},
{
"epoch": 0.04225139580503999,
"grad_norm": 0.8094703225757798,
"learning_rate": 1.385542168674699e-06,
"loss": 0.5623,
"step": 70
},
{
"epoch": 0.04828730949147427,
"grad_norm": 0.7924082712954621,
"learning_rate": 1.5863453815261046e-06,
"loss": 0.536,
"step": 80
},
{
"epoch": 0.05432322317790855,
"grad_norm": 0.7765422489934142,
"learning_rate": 1.7871485943775102e-06,
"loss": 0.5246,
"step": 90
},
{
"epoch": 0.06035913686434284,
"grad_norm": 1.2024335532490196,
"learning_rate": 1.987951807228916e-06,
"loss": 0.5112,
"step": 100
},
{
"epoch": 0.06639505055077713,
"grad_norm": 0.7361271470838762,
"learning_rate": 2.1887550200803216e-06,
"loss": 0.4973,
"step": 110
},
{
"epoch": 0.0724309642372114,
"grad_norm": 0.9636947285799289,
"learning_rate": 2.389558232931727e-06,
"loss": 0.4926,
"step": 120
},
{
"epoch": 0.07846687792364569,
"grad_norm": 0.834221678860187,
"learning_rate": 2.590361445783133e-06,
"loss": 0.4868,
"step": 130
},
{
"epoch": 0.08450279161007998,
"grad_norm": 0.721459089158014,
"learning_rate": 2.791164658634538e-06,
"loss": 0.4836,
"step": 140
},
{
"epoch": 0.09053870529651425,
"grad_norm": 0.7680662169711512,
"learning_rate": 2.991967871485944e-06,
"loss": 0.4759,
"step": 150
},
{
"epoch": 0.09657461898294854,
"grad_norm": 0.8074470103289187,
"learning_rate": 3.1927710843373494e-06,
"loss": 0.4753,
"step": 160
},
{
"epoch": 0.10261053266938283,
"grad_norm": 0.7821569953929599,
"learning_rate": 3.393574297188755e-06,
"loss": 0.4689,
"step": 170
},
{
"epoch": 0.1086464463558171,
"grad_norm": 0.8046561770221946,
"learning_rate": 3.5943775100401606e-06,
"loss": 0.4678,
"step": 180
},
{
"epoch": 0.1146823600422514,
"grad_norm": 0.8787311236716008,
"learning_rate": 3.7951807228915664e-06,
"loss": 0.463,
"step": 190
},
{
"epoch": 0.12071827372868568,
"grad_norm": 0.8810490177348705,
"learning_rate": 3.995983935742972e-06,
"loss": 0.4601,
"step": 200
},
{
"epoch": 0.12675418741511996,
"grad_norm": 0.8889957032229883,
"learning_rate": 4.196787148594378e-06,
"loss": 0.4589,
"step": 210
},
{
"epoch": 0.13279010110155426,
"grad_norm": 1.0047774511651133,
"learning_rate": 4.397590361445783e-06,
"loss": 0.4533,
"step": 220
},
{
"epoch": 0.13882601478798853,
"grad_norm": 0.8382898002966074,
"learning_rate": 4.598393574297189e-06,
"loss": 0.4541,
"step": 230
},
{
"epoch": 0.1448619284744228,
"grad_norm": 0.8867952107395614,
"learning_rate": 4.799196787148594e-06,
"loss": 0.4488,
"step": 240
},
{
"epoch": 0.1508978421608571,
"grad_norm": 0.8306941422038123,
"learning_rate": 5e-06,
"loss": 0.4521,
"step": 250
},
{
"epoch": 0.15693375584729138,
"grad_norm": 0.8874942497893604,
"learning_rate": 5.200803212851407e-06,
"loss": 0.4505,
"step": 260
},
{
"epoch": 0.16296966953372566,
"grad_norm": 0.8911658824764517,
"learning_rate": 5.401606425702812e-06,
"loss": 0.446,
"step": 270
},
{
"epoch": 0.16900558322015996,
"grad_norm": 0.8852855388263275,
"learning_rate": 5.602409638554217e-06,
"loss": 0.4452,
"step": 280
},
{
"epoch": 0.17504149690659423,
"grad_norm": 0.9910318655855725,
"learning_rate": 5.803212851405623e-06,
"loss": 0.4413,
"step": 290
},
{
"epoch": 0.1810774105930285,
"grad_norm": 1.0110708733608424,
"learning_rate": 6.004016064257029e-06,
"loss": 0.4397,
"step": 300
},
{
"epoch": 0.1871133242794628,
"grad_norm": 0.9003864963841174,
"learning_rate": 6.2048192771084344e-06,
"loss": 0.4414,
"step": 310
},
{
"epoch": 0.19314923796589709,
"grad_norm": 0.906399226331659,
"learning_rate": 6.40562248995984e-06,
"loss": 0.4372,
"step": 320
},
{
"epoch": 0.19918515165233136,
"grad_norm": 1.071695971731785,
"learning_rate": 6.606425702811245e-06,
"loss": 0.4381,
"step": 330
},
{
"epoch": 0.20522106533876566,
"grad_norm": 0.9761062070856111,
"learning_rate": 6.8072289156626514e-06,
"loss": 0.4343,
"step": 340
},
{
"epoch": 0.21125697902519994,
"grad_norm": 0.9721838815300707,
"learning_rate": 7.008032128514058e-06,
"loss": 0.4373,
"step": 350
},
{
"epoch": 0.2172928927116342,
"grad_norm": 0.8909414468062403,
"learning_rate": 7.208835341365462e-06,
"loss": 0.4336,
"step": 360
},
{
"epoch": 0.2233288063980685,
"grad_norm": 0.964718436271309,
"learning_rate": 7.4096385542168684e-06,
"loss": 0.4348,
"step": 370
},
{
"epoch": 0.2293647200845028,
"grad_norm": 1.025409695885071,
"learning_rate": 7.610441767068274e-06,
"loss": 0.4326,
"step": 380
},
{
"epoch": 0.23540063377093706,
"grad_norm": 0.9270275542948012,
"learning_rate": 7.81124497991968e-06,
"loss": 0.4324,
"step": 390
},
{
"epoch": 0.24143654745737136,
"grad_norm": 0.9390965093376761,
"learning_rate": 8.012048192771085e-06,
"loss": 0.4302,
"step": 400
},
{
"epoch": 0.24747246114380564,
"grad_norm": 0.7707812298350031,
"learning_rate": 8.21285140562249e-06,
"loss": 0.4276,
"step": 410
},
{
"epoch": 0.2535083748302399,
"grad_norm": 0.8215921682895242,
"learning_rate": 8.413654618473896e-06,
"loss": 0.4274,
"step": 420
},
{
"epoch": 0.2595442885166742,
"grad_norm": 1.0290878620245738,
"learning_rate": 8.614457831325302e-06,
"loss": 0.427,
"step": 430
},
{
"epoch": 0.2655802022031085,
"grad_norm": 0.9009095092288704,
"learning_rate": 8.815261044176707e-06,
"loss": 0.4232,
"step": 440
},
{
"epoch": 0.27161611588954276,
"grad_norm": 0.9646916353387767,
"learning_rate": 9.016064257028112e-06,
"loss": 0.4235,
"step": 450
},
{
"epoch": 0.27765202957597707,
"grad_norm": 0.8009669905789347,
"learning_rate": 9.21686746987952e-06,
"loss": 0.4248,
"step": 460
},
{
"epoch": 0.28368794326241137,
"grad_norm": 2.57971922495045,
"learning_rate": 9.417670682730925e-06,
"loss": 0.4246,
"step": 470
},
{
"epoch": 0.2897238569488456,
"grad_norm": 0.9225235875464007,
"learning_rate": 9.61847389558233e-06,
"loss": 0.4256,
"step": 480
},
{
"epoch": 0.2957597706352799,
"grad_norm": 0.8937790567235143,
"learning_rate": 9.819277108433736e-06,
"loss": 0.4232,
"step": 490
},
{
"epoch": 0.3017956843217142,
"grad_norm": 0.992661961364272,
"learning_rate": 9.99999876677608e-06,
"loss": 0.4236,
"step": 500
},
{
"epoch": 0.30783159800814847,
"grad_norm": 1.1203639087859305,
"learning_rate": 9.999850780641762e-06,
"loss": 0.423,
"step": 510
},
{
"epoch": 0.31386751169458277,
"grad_norm": 0.9504675074156581,
"learning_rate": 9.999456158087994e-06,
"loss": 0.4255,
"step": 520
},
{
"epoch": 0.31990342538101707,
"grad_norm": 1.1526705071263037,
"learning_rate": 9.998814918581017e-06,
"loss": 0.4236,
"step": 530
},
{
"epoch": 0.3259393390674513,
"grad_norm": 0.9400926329756719,
"learning_rate": 9.99792709375238e-06,
"loss": 0.4193,
"step": 540
},
{
"epoch": 0.3319752527538856,
"grad_norm": 0.761979605644821,
"learning_rate": 9.996792727397374e-06,
"loss": 0.4178,
"step": 550
},
{
"epoch": 0.3380111664403199,
"grad_norm": 0.7761858463434534,
"learning_rate": 9.995411875472882e-06,
"loss": 0.4172,
"step": 560
},
{
"epoch": 0.34404708012675417,
"grad_norm": 0.8353265789234773,
"learning_rate": 9.993784606094612e-06,
"loss": 0.417,
"step": 570
},
{
"epoch": 0.35008299381318847,
"grad_norm": 0.7921534241896437,
"learning_rate": 9.991910999533739e-06,
"loss": 0.4164,
"step": 580
},
{
"epoch": 0.35611890749962277,
"grad_norm": 0.8368518529458858,
"learning_rate": 9.98979114821294e-06,
"loss": 0.4212,
"step": 590
},
{
"epoch": 0.362154821186057,
"grad_norm": 0.8526689259731893,
"learning_rate": 9.98742515670185e-06,
"loss": 0.413,
"step": 600
},
{
"epoch": 0.3681907348724913,
"grad_norm": 0.8691355689423315,
"learning_rate": 9.98481314171188e-06,
"loss": 0.4147,
"step": 610
},
{
"epoch": 0.3742266485589256,
"grad_norm": 0.7413766525933784,
"learning_rate": 9.981955232090484e-06,
"loss": 0.4202,
"step": 620
},
{
"epoch": 0.38026256224535987,
"grad_norm": 0.862826800304683,
"learning_rate": 9.978851568814789e-06,
"loss": 0.4144,
"step": 630
},
{
"epoch": 0.38629847593179417,
"grad_norm": 0.852995884285724,
"learning_rate": 9.975502304984643e-06,
"loss": 0.4159,
"step": 640
},
{
"epoch": 0.3923343896182285,
"grad_norm": 0.8190268708459463,
"learning_rate": 9.971907605815065e-06,
"loss": 0.4133,
"step": 650
},
{
"epoch": 0.3983703033046627,
"grad_norm": 0.7826738241592833,
"learning_rate": 9.968067648628092e-06,
"loss": 0.417,
"step": 660
},
{
"epoch": 0.404406216991097,
"grad_norm": 0.8234056482304477,
"learning_rate": 9.963982622844037e-06,
"loss": 0.4151,
"step": 670
},
{
"epoch": 0.4104421306775313,
"grad_norm": 0.8389822495874198,
"learning_rate": 9.959652729972138e-06,
"loss": 0.4142,
"step": 680
},
{
"epoch": 0.41647804436396557,
"grad_norm": 0.7530220222404655,
"learning_rate": 9.955078183600626e-06,
"loss": 0.4135,
"step": 690
},
{
"epoch": 0.4225139580503999,
"grad_norm": 0.8094044727188283,
"learning_rate": 9.950259209386182e-06,
"loss": 0.4076,
"step": 700
},
{
"epoch": 0.4285498717368342,
"grad_norm": 0.7704390882655109,
"learning_rate": 9.945196045042812e-06,
"loss": 0.41,
"step": 710
},
{
"epoch": 0.4345857854232684,
"grad_norm": 0.9003987196323937,
"learning_rate": 9.93988894033011e-06,
"loss": 0.4114,
"step": 720
},
{
"epoch": 0.4406216991097027,
"grad_norm": 0.8729571471009108,
"learning_rate": 9.934338157040953e-06,
"loss": 0.4128,
"step": 730
},
{
"epoch": 0.446657612796137,
"grad_norm": 0.7801434856688376,
"learning_rate": 9.928543968988576e-06,
"loss": 0.4103,
"step": 740
},
{
"epoch": 0.4526935264825713,
"grad_norm": 0.9417689284475159,
"learning_rate": 9.922506661993067e-06,
"loss": 0.4086,
"step": 750
},
{
"epoch": 0.4587294401690056,
"grad_norm": 0.8877368018323296,
"learning_rate": 9.91622653386727e-06,
"loss": 0.4139,
"step": 760
},
{
"epoch": 0.4647653538554399,
"grad_norm": 0.7960343939884429,
"learning_rate": 9.909703894402093e-06,
"loss": 0.4072,
"step": 770
},
{
"epoch": 0.4708012675418741,
"grad_norm": 0.7142525800658928,
"learning_rate": 9.90293906535123e-06,
"loss": 0.4069,
"step": 780
},
{
"epoch": 0.4768371812283084,
"grad_norm": 0.8168998091378754,
"learning_rate": 9.895932380415277e-06,
"loss": 0.4053,
"step": 790
},
{
"epoch": 0.48287309491474273,
"grad_norm": 0.7851582099155968,
"learning_rate": 9.888684185225291e-06,
"loss": 0.4096,
"step": 800
},
{
"epoch": 0.48890900860117703,
"grad_norm": 0.7313895363802666,
"learning_rate": 9.881194837325722e-06,
"loss": 0.4035,
"step": 810
},
{
"epoch": 0.4949449222876113,
"grad_norm": 0.801599057157289,
"learning_rate": 9.873464706156785e-06,
"loss": 0.4082,
"step": 820
},
{
"epoch": 0.5009808359740455,
"grad_norm": 0.7959824627607599,
"learning_rate": 9.865494173036238e-06,
"loss": 0.4086,
"step": 830
},
{
"epoch": 0.5070167496604798,
"grad_norm": 0.7643194639900054,
"learning_rate": 9.857283631140563e-06,
"loss": 0.4097,
"step": 840
},
{
"epoch": 0.5130526633469141,
"grad_norm": 0.8141162481887632,
"learning_rate": 9.848833485485577e-06,
"loss": 0.4068,
"step": 850
},
{
"epoch": 0.5190885770333484,
"grad_norm": 0.7263606575446551,
"learning_rate": 9.840144152906455e-06,
"loss": 0.4052,
"step": 860
},
{
"epoch": 0.5251244907197827,
"grad_norm": 0.7326820835121685,
"learning_rate": 9.831216062037163e-06,
"loss": 0.403,
"step": 870
},
{
"epoch": 0.531160404406217,
"grad_norm": 0.7722145618849807,
"learning_rate": 9.822049653289318e-06,
"loss": 0.4041,
"step": 880
},
{
"epoch": 0.5371963180926512,
"grad_norm": 0.7035970302521439,
"learning_rate": 9.81264537883046e-06,
"loss": 0.401,
"step": 890
},
{
"epoch": 0.5432322317790855,
"grad_norm": 0.6580207236042055,
"learning_rate": 9.803003702561753e-06,
"loss": 0.4057,
"step": 900
},
{
"epoch": 0.5492681454655198,
"grad_norm": 0.6960070468306416,
"learning_rate": 9.79312510009509e-06,
"loss": 0.4103,
"step": 910
},
{
"epoch": 0.5553040591519541,
"grad_norm": 0.7088936549744779,
"learning_rate": 9.783010058729644e-06,
"loss": 0.4024,
"step": 920
},
{
"epoch": 0.5613399728383884,
"grad_norm": 0.8173990374915286,
"learning_rate": 9.772659077427824e-06,
"loss": 0.3983,
"step": 930
},
{
"epoch": 0.5673758865248227,
"grad_norm": 0.7248588219467303,
"learning_rate": 9.762072666790658e-06,
"loss": 0.4042,
"step": 940
},
{
"epoch": 0.5734118002112569,
"grad_norm": 0.6953286894486166,
"learning_rate": 9.751251349032615e-06,
"loss": 0.4052,
"step": 950
},
{
"epoch": 0.5794477138976912,
"grad_norm": 0.6805775618542874,
"learning_rate": 9.74019565795584e-06,
"loss": 0.4028,
"step": 960
},
{
"epoch": 0.5854836275841255,
"grad_norm": 0.7073250522342893,
"learning_rate": 9.728906138923823e-06,
"loss": 0.4031,
"step": 970
},
{
"epoch": 0.5915195412705598,
"grad_norm": 0.8161486510568995,
"learning_rate": 9.71738334883449e-06,
"loss": 0.4012,
"step": 980
},
{
"epoch": 0.5975554549569941,
"grad_norm": 0.7478470587664012,
"learning_rate": 9.705627856092743e-06,
"loss": 0.4035,
"step": 990
},
{
"epoch": 0.6035913686434284,
"grad_norm": 1.2181648223419725,
"learning_rate": 9.69364024058242e-06,
"loss": 0.3994,
"step": 1000
},
{
"epoch": 0.6096272823298626,
"grad_norm": 0.724496170506016,
"learning_rate": 9.681421093637677e-06,
"loss": 0.4003,
"step": 1010
},
{
"epoch": 0.6156631960162969,
"grad_norm": 0.7245373569956688,
"learning_rate": 9.668971018013835e-06,
"loss": 0.3993,
"step": 1020
},
{
"epoch": 0.6216991097027312,
"grad_norm": 1.3707555561464966,
"learning_rate": 9.656290627857638e-06,
"loss": 0.4031,
"step": 1030
},
{
"epoch": 0.6277350233891655,
"grad_norm": 0.8617205371794142,
"learning_rate": 9.643380548676957e-06,
"loss": 0.3989,
"step": 1040
},
{
"epoch": 0.6337709370755998,
"grad_norm": 0.7218421707442351,
"learning_rate": 9.63024141730994e-06,
"loss": 0.4009,
"step": 1050
},
{
"epoch": 0.6398068507620341,
"grad_norm": 0.7919863849580143,
"learning_rate": 9.616873881893593e-06,
"loss": 0.402,
"step": 1060
},
{
"epoch": 0.6458427644484683,
"grad_norm": 0.7643496416415103,
"learning_rate": 9.603278601831806e-06,
"loss": 0.3966,
"step": 1070
},
{
"epoch": 0.6518786781349026,
"grad_norm": 0.8387350986976135,
"learning_rate": 9.58945624776284e-06,
"loss": 0.3974,
"step": 1080
},
{
"epoch": 0.6579145918213369,
"grad_norm": 0.7195707742464319,
"learning_rate": 9.575407501526218e-06,
"loss": 0.4033,
"step": 1090
},
{
"epoch": 0.6639505055077712,
"grad_norm": 0.8948583587192116,
"learning_rate": 9.561133056129122e-06,
"loss": 0.4005,
"step": 1100
},
{
"epoch": 0.6699864191942055,
"grad_norm": 0.7784558611785358,
"learning_rate": 9.546633615712184e-06,
"loss": 0.3969,
"step": 1110
},
{
"epoch": 0.6760223328806398,
"grad_norm": 0.7279188084081983,
"learning_rate": 9.531909895514766e-06,
"loss": 0.3968,
"step": 1120
},
{
"epoch": 0.6820582465670741,
"grad_norm": 0.7707824454002812,
"learning_rate": 9.516962621839667e-06,
"loss": 0.3941,
"step": 1130
},
{
"epoch": 0.6880941602535083,
"grad_norm": 0.7559246242676043,
"learning_rate": 9.501792532017304e-06,
"loss": 0.3935,
"step": 1140
},
{
"epoch": 0.6941300739399426,
"grad_norm": 0.7670492895949397,
"learning_rate": 9.48640037436934e-06,
"loss": 0.3962,
"step": 1150
},
{
"epoch": 0.7001659876263769,
"grad_norm": 0.7574175499302432,
"learning_rate": 9.470786908171761e-06,
"loss": 0.396,
"step": 1160
},
{
"epoch": 0.7062019013128112,
"grad_norm": 1.1364368407573255,
"learning_rate": 9.454952903617434e-06,
"loss": 0.3987,
"step": 1170
},
{
"epoch": 0.7122378149992455,
"grad_norm": 0.6929517509246322,
"learning_rate": 9.438899141778105e-06,
"loss": 0.3959,
"step": 1180
},
{
"epoch": 0.7182737286856798,
"grad_norm": 0.7239918001848392,
"learning_rate": 9.42262641456588e-06,
"loss": 0.3961,
"step": 1190
},
{
"epoch": 0.724309642372114,
"grad_norm": 0.7351627240649914,
"learning_rate": 9.406135524694146e-06,
"loss": 0.3946,
"step": 1200
},
{
"epoch": 0.7303455560585483,
"grad_norm": 0.7178193311197739,
"learning_rate": 9.389427285637986e-06,
"loss": 0.3934,
"step": 1210
},
{
"epoch": 0.7363814697449826,
"grad_norm": 0.7197436378060236,
"learning_rate": 9.372502521594052e-06,
"loss": 0.3951,
"step": 1220
},
{
"epoch": 0.7424173834314169,
"grad_norm": 0.7020942866993558,
"learning_rate": 9.355362067439899e-06,
"loss": 0.3953,
"step": 1230
},
{
"epoch": 0.7484532971178512,
"grad_norm": 0.6493652144119091,
"learning_rate": 9.338006768692807e-06,
"loss": 0.3976,
"step": 1240
},
{
"epoch": 0.7544892108042855,
"grad_norm": 0.7452091082245685,
"learning_rate": 9.320437481468077e-06,
"loss": 0.3947,
"step": 1250
},
{
"epoch": 0.7605251244907197,
"grad_norm": 0.7211982596336295,
"learning_rate": 9.302655072436789e-06,
"loss": 0.3978,
"step": 1260
},
{
"epoch": 0.766561038177154,
"grad_norm": 0.8069527677411222,
"learning_rate": 9.284660418783064e-06,
"loss": 0.3961,
"step": 1270
},
{
"epoch": 0.7725969518635883,
"grad_norm": 0.6964974366663241,
"learning_rate": 9.266454408160779e-06,
"loss": 0.395,
"step": 1280
},
{
"epoch": 0.7786328655500226,
"grad_norm": 0.6951835215600591,
"learning_rate": 9.248037938649792e-06,
"loss": 0.3918,
"step": 1290
},
{
"epoch": 0.784668779236457,
"grad_norm": 0.7011033108204148,
"learning_rate": 9.229411918711637e-06,
"loss": 0.3911,
"step": 1300
},
{
"epoch": 0.7907046929228913,
"grad_norm": 0.6699999752789259,
"learning_rate": 9.210577267144703e-06,
"loss": 0.3917,
"step": 1310
},
{
"epoch": 0.7967406066093254,
"grad_norm": 0.7952469588442095,
"learning_rate": 9.191534913038926e-06,
"loss": 0.393,
"step": 1320
},
{
"epoch": 0.8027765202957597,
"grad_norm": 0.7362949625214187,
"learning_rate": 9.172285795729945e-06,
"loss": 0.3916,
"step": 1330
},
{
"epoch": 0.808812433982194,
"grad_norm": 0.777349182077021,
"learning_rate": 9.152830864752773e-06,
"loss": 0.396,
"step": 1340
},
{
"epoch": 0.8148483476686283,
"grad_norm": 0.6858011231159463,
"learning_rate": 9.133171079794952e-06,
"loss": 0.3949,
"step": 1350
},
{
"epoch": 0.8208842613550626,
"grad_norm": 0.8252893789848457,
"learning_rate": 9.113307410649222e-06,
"loss": 0.3951,
"step": 1360
},
{
"epoch": 0.826920175041497,
"grad_norm": 0.742614174317752,
"learning_rate": 9.093240837165668e-06,
"loss": 0.3912,
"step": 1370
},
{
"epoch": 0.8329560887279311,
"grad_norm": 0.6712408370389595,
"learning_rate": 9.072972349203401e-06,
"loss": 0.3938,
"step": 1380
},
{
"epoch": 0.8389920024143654,
"grad_norm": 0.7390425813359819,
"learning_rate": 9.052502946581718e-06,
"loss": 0.3902,
"step": 1390
},
{
"epoch": 0.8450279161007997,
"grad_norm": 0.9031901060003036,
"learning_rate": 9.031833639030789e-06,
"loss": 0.39,
"step": 1400
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.8073830235615219,
"learning_rate": 9.010965446141842e-06,
"loss": 0.3907,
"step": 1410
},
{
"epoch": 0.8570997434736684,
"grad_norm": 0.7197468777451328,
"learning_rate": 8.989899397316875e-06,
"loss": 0.3933,
"step": 1420
},
{
"epoch": 0.8631356571601027,
"grad_norm": 0.7874409375571629,
"learning_rate": 8.96863653171787e-06,
"loss": 0.3941,
"step": 1430
},
{
"epoch": 0.8691715708465368,
"grad_norm": 0.7047790860975574,
"learning_rate": 8.947177898215538e-06,
"loss": 0.3918,
"step": 1440
},
{
"epoch": 0.8752074845329711,
"grad_norm": 0.6732410856766448,
"learning_rate": 8.925524555337575e-06,
"loss": 0.3948,
"step": 1450
},
{
"epoch": 0.8812433982194054,
"grad_norm": 0.6379130166882847,
"learning_rate": 8.90367757121645e-06,
"loss": 0.392,
"step": 1460
},
{
"epoch": 0.8872793119058398,
"grad_norm": 0.6453169279070088,
"learning_rate": 8.881638023536715e-06,
"loss": 0.3902,
"step": 1470
},
{
"epoch": 0.893315225592274,
"grad_norm": 0.8925532684482897,
"learning_rate": 8.859406999481839e-06,
"loss": 0.3897,
"step": 1480
},
{
"epoch": 0.8993511392787084,
"grad_norm": 0.7321151042406583,
"learning_rate": 8.836985595680585e-06,
"loss": 0.3903,
"step": 1490
},
{
"epoch": 0.9053870529651425,
"grad_norm": 0.717542202485072,
"learning_rate": 8.81437491815291e-06,
"loss": 0.3907,
"step": 1500
},
{
"epoch": 0.9114229666515768,
"grad_norm": 0.6899069830042462,
"learning_rate": 8.791576082255414e-06,
"loss": 0.3914,
"step": 1510
},
{
"epoch": 0.9174588803380112,
"grad_norm": 0.7416902913208727,
"learning_rate": 8.768590212626305e-06,
"loss": 0.3914,
"step": 1520
},
{
"epoch": 0.9234947940244455,
"grad_norm": 0.648187852127454,
"learning_rate": 8.745418443129944e-06,
"loss": 0.3878,
"step": 1530
},
{
"epoch": 0.9295307077108798,
"grad_norm": 0.6971446829374528,
"learning_rate": 8.722061916800892e-06,
"loss": 0.3889,
"step": 1540
},
{
"epoch": 0.9355666213973141,
"grad_norm": 0.6897656341763103,
"learning_rate": 8.698521785787543e-06,
"loss": 0.3916,
"step": 1550
},
{
"epoch": 0.9416025350837482,
"grad_norm": 0.6707821534631215,
"learning_rate": 8.674799211295272e-06,
"loss": 0.3872,
"step": 1560
},
{
"epoch": 0.9476384487701826,
"grad_norm": 0.7047440310341709,
"learning_rate": 8.650895363529172e-06,
"loss": 0.3893,
"step": 1570
},
{
"epoch": 0.9536743624566169,
"grad_norm": 0.7111300925227007,
"learning_rate": 8.626811421636318e-06,
"loss": 0.3899,
"step": 1580
},
{
"epoch": 0.9597102761430512,
"grad_norm": 0.742242466940292,
"learning_rate": 8.602548573647603e-06,
"loss": 0.3933,
"step": 1590
},
{
"epoch": 0.9657461898294855,
"grad_norm": 0.6405514647772552,
"learning_rate": 8.578108016419138e-06,
"loss": 0.3886,
"step": 1600
},
{
"epoch": 0.9717821035159198,
"grad_norm": 0.6969067995610034,
"learning_rate": 8.553490955573207e-06,
"loss": 0.3875,
"step": 1610
},
{
"epoch": 0.9778180172023541,
"grad_norm": 0.6404080311189763,
"learning_rate": 8.528698605438801e-06,
"loss": 0.3915,
"step": 1620
},
{
"epoch": 0.9838539308887883,
"grad_norm": 0.689314089106684,
"learning_rate": 8.50373218899171e-06,
"loss": 0.3897,
"step": 1630
},
{
"epoch": 0.9898898445752226,
"grad_norm": 0.6238451440610306,
"learning_rate": 8.478592937794202e-06,
"loss": 0.3865,
"step": 1640
},
{
"epoch": 0.9959257582616569,
"grad_norm": 0.6246538104726604,
"learning_rate": 8.453282091934262e-06,
"loss": 0.3891,
"step": 1650
},
{
"epoch": 1.0018107741059303,
"grad_norm": 0.6650133535244673,
"learning_rate": 8.427800899964438e-06,
"loss": 0.3775,
"step": 1660
},
{
"epoch": 1.0078466877923646,
"grad_norm": 0.7340465665361768,
"learning_rate": 8.402150618840229e-06,
"loss": 0.3658,
"step": 1670
},
{
"epoch": 1.013882601478799,
"grad_norm": 0.8803678131362109,
"learning_rate": 8.376332513858091e-06,
"loss": 0.3643,
"step": 1680
},
{
"epoch": 1.0199185151652332,
"grad_norm": 0.6784266807756097,
"learning_rate": 8.350347858593035e-06,
"loss": 0.3632,
"step": 1690
},
{
"epoch": 1.0259544288516673,
"grad_norm": 0.6757297253946429,
"learning_rate": 8.324197934835775e-06,
"loss": 0.3611,
"step": 1700
},
{
"epoch": 1.0319903425381016,
"grad_norm": 0.6937615226816463,
"learning_rate": 8.297884032529525e-06,
"loss": 0.3641,
"step": 1710
},
{
"epoch": 1.038026256224536,
"grad_norm": 0.6656265896882699,
"learning_rate": 8.271407449706347e-06,
"loss": 0.3634,
"step": 1720
},
{
"epoch": 1.0440621699109702,
"grad_norm": 0.6758693000716391,
"learning_rate": 8.244769492423144e-06,
"loss": 0.3651,
"step": 1730
},
{
"epoch": 1.0500980835974045,
"grad_norm": 0.7271602756269683,
"learning_rate": 8.217971474697205e-06,
"loss": 0.3655,
"step": 1740
},
{
"epoch": 1.0561339972838388,
"grad_norm": 0.7262048623607191,
"learning_rate": 8.191014718441413e-06,
"loss": 0.3646,
"step": 1750
},
{
"epoch": 1.0621699109702731,
"grad_norm": 0.7594858496478063,
"learning_rate": 8.163900553399022e-06,
"loss": 0.3683,
"step": 1760
},
{
"epoch": 1.0682058246567074,
"grad_norm": 0.6834326812737692,
"learning_rate": 8.13663031707806e-06,
"loss": 0.3657,
"step": 1770
},
{
"epoch": 1.0742417383431417,
"grad_norm": 0.829231127715137,
"learning_rate": 8.109205354685367e-06,
"loss": 0.3657,
"step": 1780
},
{
"epoch": 1.080277652029576,
"grad_norm": 0.7172584884654448,
"learning_rate": 8.081627019060223e-06,
"loss": 0.3612,
"step": 1790
},
{
"epoch": 1.0863135657160103,
"grad_norm": 0.700123283944604,
"learning_rate": 8.053896670607616e-06,
"loss": 0.3669,
"step": 1800
},
{
"epoch": 1.0923494794024446,
"grad_norm": 0.6802763184360072,
"learning_rate": 8.026015677231137e-06,
"loss": 0.36,
"step": 1810
},
{
"epoch": 1.0983853930888787,
"grad_norm": 0.6976972839342949,
"learning_rate": 7.997985414265513e-06,
"loss": 0.3645,
"step": 1820
},
{
"epoch": 1.104421306775313,
"grad_norm": 0.6892045690564895,
"learning_rate": 7.969807264408745e-06,
"loss": 0.3664,
"step": 1830
},
{
"epoch": 1.1104572204617473,
"grad_norm": 0.6606374628961976,
"learning_rate": 7.94148261765391e-06,
"loss": 0.3611,
"step": 1840
},
{
"epoch": 1.1164931341481816,
"grad_norm": 0.7063672325182395,
"learning_rate": 7.913012871220605e-06,
"loss": 0.3652,
"step": 1850
},
{
"epoch": 1.122529047834616,
"grad_norm": 0.6353061774622171,
"learning_rate": 7.884399429486e-06,
"loss": 0.3619,
"step": 1860
},
{
"epoch": 1.1285649615210502,
"grad_norm": 0.6646621743965846,
"learning_rate": 7.855643703915585e-06,
"loss": 0.3638,
"step": 1870
},
{
"epoch": 1.1346008752074845,
"grad_norm": 0.6379034557335701,
"learning_rate": 7.826747112993532e-06,
"loss": 0.3595,
"step": 1880
},
{
"epoch": 1.1406367888939188,
"grad_norm": 0.6995974469144366,
"learning_rate": 7.797711082152726e-06,
"loss": 0.3628,
"step": 1890
},
{
"epoch": 1.1466727025803531,
"grad_norm": 0.6564170955860726,
"learning_rate": 7.768537043704447e-06,
"loss": 0.3637,
"step": 1900
},
{
"epoch": 1.1527086162667874,
"grad_norm": 0.7572552114374352,
"learning_rate": 7.739226436767721e-06,
"loss": 0.362,
"step": 1910
},
{
"epoch": 1.1587445299532217,
"grad_norm": 0.7571612085211564,
"learning_rate": 7.709780707198328e-06,
"loss": 0.3638,
"step": 1920
},
{
"epoch": 1.164780443639656,
"grad_norm": 0.6792493024466744,
"learning_rate": 7.680201307517479e-06,
"loss": 0.3625,
"step": 1930
},
{
"epoch": 1.1708163573260904,
"grad_norm": 0.664259682779261,
"learning_rate": 7.650489696840164e-06,
"loss": 0.3646,
"step": 1940
},
{
"epoch": 1.1768522710125244,
"grad_norm": 0.6270149603322056,
"learning_rate": 7.6206473408031775e-06,
"loss": 0.3624,
"step": 1950
},
{
"epoch": 1.1828881846989587,
"grad_norm": 0.6383894085325998,
"learning_rate": 7.590675711492823e-06,
"loss": 0.3643,
"step": 1960
},
{
"epoch": 1.188924098385393,
"grad_norm": 0.6816453891866903,
"learning_rate": 7.56057628737229e-06,
"loss": 0.3637,
"step": 1970
},
{
"epoch": 1.1949600120718273,
"grad_norm": 0.7133078108250313,
"learning_rate": 7.530350553208726e-06,
"loss": 0.3585,
"step": 1980
},
{
"epoch": 1.2009959257582616,
"grad_norm": 0.6322767475179056,
"learning_rate": 7.500000000000001e-06,
"loss": 0.361,
"step": 1990
},
{
"epoch": 1.207031839444696,
"grad_norm": 0.6987380190815154,
"learning_rate": 7.469526124901149e-06,
"loss": 0.3623,
"step": 2000
},
{
"epoch": 1.2130677531311302,
"grad_norm": 0.6219916214226197,
"learning_rate": 7.4389304311505195e-06,
"loss": 0.3637,
"step": 2010
},
{
"epoch": 1.2191036668175645,
"grad_norm": 0.6591583924033313,
"learning_rate": 7.408214427995628e-06,
"loss": 0.3644,
"step": 2020
},
{
"epoch": 1.2251395805039988,
"grad_norm": 0.7005471225701302,
"learning_rate": 7.3773796306187e-06,
"loss": 0.3595,
"step": 2030
},
{
"epoch": 1.2311754941904332,
"grad_norm": 0.6332845796820719,
"learning_rate": 7.346427560061931e-06,
"loss": 0.3652,
"step": 2040
},
{
"epoch": 1.2372114078768675,
"grad_norm": 0.6778224076333697,
"learning_rate": 7.315359743152464e-06,
"loss": 0.3606,
"step": 2050
},
{
"epoch": 1.2432473215633015,
"grad_norm": 0.6582665893949518,
"learning_rate": 7.284177712427056e-06,
"loss": 0.3599,
"step": 2060
},
{
"epoch": 1.2492832352497358,
"grad_norm": 0.6584059931101761,
"learning_rate": 7.252883006056495e-06,
"loss": 0.3622,
"step": 2070
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.6857700496450303,
"learning_rate": 7.221477167769716e-06,
"loss": 0.3633,
"step": 2080
},
{
"epoch": 1.2613550626226044,
"grad_norm": 0.6856644672766703,
"learning_rate": 7.189961746777657e-06,
"loss": 0.363,
"step": 2090
},
{
"epoch": 1.2673909763090387,
"grad_norm": 0.6857005736783666,
"learning_rate": 7.1583382976968295e-06,
"loss": 0.3618,
"step": 2100
},
{
"epoch": 1.273426889995473,
"grad_norm": 0.6166440607694041,
"learning_rate": 7.126608380472642e-06,
"loss": 0.3593,
"step": 2110
},
{
"epoch": 1.2794628036819073,
"grad_norm": 0.6673854300030073,
"learning_rate": 7.094773560302438e-06,
"loss": 0.3616,
"step": 2120
},
{
"epoch": 1.2854987173683416,
"grad_norm": 0.6261609808400934,
"learning_rate": 7.062835407558295e-06,
"loss": 0.3623,
"step": 2130
},
{
"epoch": 1.291534631054776,
"grad_norm": 0.6573770008704372,
"learning_rate": 7.030795497709559e-06,
"loss": 0.3616,
"step": 2140
},
{
"epoch": 1.2975705447412103,
"grad_norm": 0.63175357402283,
"learning_rate": 6.99865541124513e-06,
"loss": 0.363,
"step": 2150
},
{
"epoch": 1.3036064584276446,
"grad_norm": 0.7095581591416922,
"learning_rate": 6.9664167335954866e-06,
"loss": 0.3604,
"step": 2160
},
{
"epoch": 1.3096423721140789,
"grad_norm": 0.6211244267814455,
"learning_rate": 6.9340810550545004e-06,
"loss": 0.3584,
"step": 2170
},
{
"epoch": 1.3156782858005132,
"grad_norm": 0.6411383893721285,
"learning_rate": 6.901649970700966e-06,
"loss": 0.3616,
"step": 2180
},
{
"epoch": 1.3217141994869475,
"grad_norm": 0.6508872294411808,
"learning_rate": 6.869125080319934e-06,
"loss": 0.3626,
"step": 2190
},
{
"epoch": 1.3277501131733815,
"grad_norm": 0.6456129899609592,
"learning_rate": 6.836507988323785e-06,
"loss": 0.3612,
"step": 2200
},
{
"epoch": 1.3337860268598158,
"grad_norm": 0.6885055595324049,
"learning_rate": 6.803800303673096e-06,
"loss": 0.3588,
"step": 2210
},
{
"epoch": 1.3398219405462501,
"grad_norm": 0.6841559054058574,
"learning_rate": 6.77100363979726e-06,
"loss": 0.3608,
"step": 2220
},
{
"epoch": 1.3458578542326844,
"grad_norm": 0.7229876827512576,
"learning_rate": 6.738119614514913e-06,
"loss": 0.3655,
"step": 2230
},
{
"epoch": 1.3518937679191187,
"grad_norm": 0.6235312062043321,
"learning_rate": 6.705149849954116e-06,
"loss": 0.3607,
"step": 2240
},
{
"epoch": 1.357929681605553,
"grad_norm": 0.6372979896414575,
"learning_rate": 6.672095972472339e-06,
"loss": 0.3613,
"step": 2250
},
{
"epoch": 1.3639655952919874,
"grad_norm": 0.5943237749223176,
"learning_rate": 6.638959612576243e-06,
"loss": 0.3578,
"step": 2260
},
{
"epoch": 1.3700015089784217,
"grad_norm": 0.6331473442190148,
"learning_rate": 6.605742404841241e-06,
"loss": 0.3606,
"step": 2270
},
{
"epoch": 1.376037422664856,
"grad_norm": 0.6352200712052698,
"learning_rate": 6.572445987830869e-06,
"loss": 0.3602,
"step": 2280
},
{
"epoch": 1.38207333635129,
"grad_norm": 0.6315011206585134,
"learning_rate": 6.539072004015962e-06,
"loss": 0.3585,
"step": 2290
},
{
"epoch": 1.3881092500377243,
"grad_norm": 0.672467399271792,
"learning_rate": 6.505622099693624e-06,
"loss": 0.359,
"step": 2300
},
{
"epoch": 1.3941451637241586,
"grad_norm": 0.6540330679200106,
"learning_rate": 6.4720979249060245e-06,
"loss": 0.357,
"step": 2310
},
{
"epoch": 1.400181077410593,
"grad_norm": 0.6296334356002367,
"learning_rate": 6.438501133359006e-06,
"loss": 0.363,
"step": 2320
},
{
"epoch": 1.4062169910970272,
"grad_norm": 0.5755292937597596,
"learning_rate": 6.404833382340498e-06,
"loss": 0.3579,
"step": 2330
},
{
"epoch": 1.4122529047834615,
"grad_norm": 0.6273216809842853,
"learning_rate": 6.3710963326387845e-06,
"loss": 0.361,
"step": 2340
},
{
"epoch": 1.4182888184698959,
"grad_norm": 0.659504858020357,
"learning_rate": 6.337291648460554e-06,
"loss": 0.3648,
"step": 2350
},
{
"epoch": 1.4243247321563302,
"grad_norm": 0.646430703430766,
"learning_rate": 6.303420997348828e-06,
"loss": 0.3609,
"step": 2360
},
{
"epoch": 1.4303606458427645,
"grad_norm": 0.70677217944382,
"learning_rate": 6.269486050100692e-06,
"loss": 0.3583,
"step": 2370
},
{
"epoch": 1.4363965595291988,
"grad_norm": 0.6982928562021034,
"learning_rate": 6.2354884806848825e-06,
"loss": 0.3587,
"step": 2380
},
{
"epoch": 1.442432473215633,
"grad_norm": 0.635748838083391,
"learning_rate": 6.201429966159203e-06,
"loss": 0.3603,
"step": 2390
},
{
"epoch": 1.4484683869020674,
"grad_norm": 0.6591941857655591,
"learning_rate": 6.167312186587813e-06,
"loss": 0.3587,
"step": 2400
},
{
"epoch": 1.4545043005885017,
"grad_norm": 0.6513018732706167,
"learning_rate": 6.133136824958334e-06,
"loss": 0.3583,
"step": 2410
},
{
"epoch": 1.460540214274936,
"grad_norm": 0.6895727383237782,
"learning_rate": 6.098905567098846e-06,
"loss": 0.3638,
"step": 2420
},
{
"epoch": 1.4665761279613703,
"grad_norm": 0.6281650394691185,
"learning_rate": 6.064620101594715e-06,
"loss": 0.3629,
"step": 2430
},
{
"epoch": 1.4726120416478046,
"grad_norm": 0.7324490252015554,
"learning_rate": 6.030282119705306e-06,
"loss": 0.3621,
"step": 2440
},
{
"epoch": 1.4786479553342387,
"grad_norm": 0.6803933740478001,
"learning_rate": 5.99589331528055e-06,
"loss": 0.3613,
"step": 2450
},
{
"epoch": 1.484683869020673,
"grad_norm": 0.6535344969186776,
"learning_rate": 5.961455384677393e-06,
"loss": 0.3588,
"step": 2460
},
{
"epoch": 1.4907197827071073,
"grad_norm": 0.6220530519094237,
"learning_rate": 5.92697002667611e-06,
"loss": 0.3614,
"step": 2470
},
{
"epoch": 1.4967556963935416,
"grad_norm": 0.5997735782443615,
"learning_rate": 5.892438942396515e-06,
"loss": 0.3562,
"step": 2480
},
{
"epoch": 1.5027916100799759,
"grad_norm": 0.5881600037112182,
"learning_rate": 5.857863835214041e-06,
"loss": 0.36,
"step": 2490
},
{
"epoch": 1.5088275237664102,
"grad_norm": 0.6301732957095514,
"learning_rate": 5.823246410675714e-06,
"loss": 0.3602,
"step": 2500
},
{
"epoch": 1.5148634374528445,
"grad_norm": 0.6369138058336548,
"learning_rate": 5.788588376416026e-06,
"loss": 0.3575,
"step": 2510
},
{
"epoch": 1.5208993511392785,
"grad_norm": 1.8916358390305654,
"learning_rate": 5.753891442072693e-06,
"loss": 0.3584,
"step": 2520
},
{
"epoch": 1.5269352648257128,
"grad_norm": 0.6400402583906231,
"learning_rate": 5.719157319202325e-06,
"loss": 0.3539,
"step": 2530
},
{
"epoch": 1.5329711785121471,
"grad_norm": 0.6223661041265537,
"learning_rate": 5.684387721195997e-06,
"loss": 0.3595,
"step": 2540
},
{
"epoch": 1.5390070921985815,
"grad_norm": 0.6649761362975228,
"learning_rate": 5.649584363194725e-06,
"loss": 0.36,
"step": 2550
},
{
"epoch": 1.5450430058850158,
"grad_norm": 0.5989851062495032,
"learning_rate": 5.6147489620048655e-06,
"loss": 0.3582,
"step": 2560
},
{
"epoch": 1.55107891957145,
"grad_norm": 0.6435791376898407,
"learning_rate": 5.579883236013429e-06,
"loss": 0.3559,
"step": 2570
},
{
"epoch": 1.5571148332578844,
"grad_norm": 0.5973586913854247,
"learning_rate": 5.544988905103304e-06,
"loss": 0.3581,
"step": 2580
},
{
"epoch": 1.5631507469443187,
"grad_norm": 0.6331916860819433,
"learning_rate": 5.510067690568429e-06,
"loss": 0.3573,
"step": 2590
},
{
"epoch": 1.569186660630753,
"grad_norm": 0.6000249694556851,
"learning_rate": 5.475121315028876e-06,
"loss": 0.3574,
"step": 2600
},
{
"epoch": 1.5752225743171873,
"grad_norm": 0.5919987411148389,
"learning_rate": 5.4401515023458805e-06,
"loss": 0.3622,
"step": 2610
},
{
"epoch": 1.5812584880036216,
"grad_norm": 0.6130160505042299,
"learning_rate": 5.4051599775368e-06,
"loss": 0.3585,
"step": 2620
},
{
"epoch": 1.5872944016900559,
"grad_norm": 0.6196465067482942,
"learning_rate": 5.370148466690026e-06,
"loss": 0.3524,
"step": 2630
},
{
"epoch": 1.5933303153764902,
"grad_norm": 0.6396523422153624,
"learning_rate": 5.335118696879836e-06,
"loss": 0.3584,
"step": 2640
},
{
"epoch": 1.5993662290629245,
"grad_norm": 0.6247037129381725,
"learning_rate": 5.3000723960812e-06,
"loss": 0.358,
"step": 2650
},
{
"epoch": 1.6054021427493588,
"grad_norm": 0.6296280096461855,
"learning_rate": 5.265011293084539e-06,
"loss": 0.3557,
"step": 2660
},
{
"epoch": 1.611438056435793,
"grad_norm": 0.6270649643037325,
"learning_rate": 5.2299371174104505e-06,
"loss": 0.3586,
"step": 2670
},
{
"epoch": 1.6174739701222274,
"grad_norm": 0.6724245016825049,
"learning_rate": 5.194851599224392e-06,
"loss": 0.3563,
"step": 2680
},
{
"epoch": 1.6235098838086617,
"grad_norm": 0.6246722692854128,
"learning_rate": 5.159756469251327e-06,
"loss": 0.3587,
"step": 2690
},
{
"epoch": 1.629545797495096,
"grad_norm": 0.5856892553580461,
"learning_rate": 5.1246534586903655e-06,
"loss": 0.3538,
"step": 2700
},
{
"epoch": 1.63558171118153,
"grad_norm": 0.6199649535926036,
"learning_rate": 5.089544299129349e-06,
"loss": 0.3552,
"step": 2710
},
{
"epoch": 1.6416176248679644,
"grad_norm": 0.6395106688159933,
"learning_rate": 5.054430722459442e-06,
"loss": 0.3575,
"step": 2720
},
{
"epoch": 1.6476535385543987,
"grad_norm": 0.6217763272730691,
"learning_rate": 5.019314460789708e-06,
"loss": 0.3568,
"step": 2730
},
{
"epoch": 1.653689452240833,
"grad_norm": 0.6159996290026578,
"learning_rate": 4.984197246361649e-06,
"loss": 0.3565,
"step": 2740
},
{
"epoch": 1.6597253659272673,
"grad_norm": 0.6021051813495957,
"learning_rate": 4.949080811463767e-06,
"loss": 0.3577,
"step": 2750
},
{
"epoch": 1.6657612796137016,
"grad_norm": 0.6102206368388114,
"learning_rate": 4.913966888346118e-06,
"loss": 0.3556,
"step": 2760
},
{
"epoch": 1.6717971933001357,
"grad_norm": 0.5968837038838994,
"learning_rate": 4.8788572091348435e-06,
"loss": 0.3581,
"step": 2770
},
{
"epoch": 1.67783310698657,
"grad_norm": 0.5981355700097328,
"learning_rate": 4.843753505746748e-06,
"loss": 0.358,
"step": 2780
},
{
"epoch": 1.6838690206730043,
"grad_norm": 0.6567740858768865,
"learning_rate": 4.8086575098038505e-06,
"loss": 0.3573,
"step": 2790
},
{
"epoch": 1.6899049343594386,
"grad_norm": 0.6773288375423023,
"learning_rate": 4.773570952547975e-06,
"loss": 0.3552,
"step": 2800
},
{
"epoch": 1.6959408480458729,
"grad_norm": 0.6202686068367487,
"learning_rate": 4.738495564755345e-06,
"loss": 0.3547,
"step": 2810
},
{
"epoch": 1.7019767617323072,
"grad_norm": 0.5595337919079114,
"learning_rate": 4.703433076651205e-06,
"loss": 0.353,
"step": 2820
},
{
"epoch": 1.7080126754187415,
"grad_norm": 0.6583890978208258,
"learning_rate": 4.668385217824482e-06,
"loss": 0.3583,
"step": 2830
},
{
"epoch": 1.7140485891051758,
"grad_norm": 0.5898922057879373,
"learning_rate": 4.633353717142448e-06,
"loss": 0.3524,
"step": 2840
},
{
"epoch": 1.72008450279161,
"grad_norm": 0.5938698503556435,
"learning_rate": 4.5983403026654625e-06,
"loss": 0.3554,
"step": 2850
},
{
"epoch": 1.7261204164780444,
"grad_norm": 0.632653867195755,
"learning_rate": 4.563346701561699e-06,
"loss": 0.3535,
"step": 2860
},
{
"epoch": 1.7321563301644787,
"grad_norm": 0.634481958151908,
"learning_rate": 4.528374640021975e-06,
"loss": 0.3548,
"step": 2870
},
{
"epoch": 1.738192243850913,
"grad_norm": 0.6554591212571549,
"learning_rate": 4.493425843174581e-06,
"loss": 0.3523,
"step": 2880
},
{
"epoch": 1.7442281575373473,
"grad_norm": 0.639030241328894,
"learning_rate": 4.4585020350001885e-06,
"loss": 0.3571,
"step": 2890
},
{
"epoch": 1.7502640712237816,
"grad_norm": 0.579081823243162,
"learning_rate": 4.423604938246815e-06,
"loss": 0.358,
"step": 2900
},
{
"epoch": 1.7562999849102159,
"grad_norm": 0.5786332593667859,
"learning_rate": 4.38873627434483e-06,
"loss": 0.3546,
"step": 2910
},
{
"epoch": 1.7623358985966502,
"grad_norm": 0.5844630643462843,
"learning_rate": 4.353897763322053e-06,
"loss": 0.3557,
"step": 2920
},
{
"epoch": 1.7683718122830845,
"grad_norm": 0.6362540824300466,
"learning_rate": 4.319091123718891e-06,
"loss": 0.3577,
"step": 2930
},
{
"epoch": 1.7744077259695188,
"grad_norm": 0.6152238906869951,
"learning_rate": 4.284318072503581e-06,
"loss": 0.3558,
"step": 2940
},
{
"epoch": 1.7804436396559529,
"grad_norm": 0.5871415463947245,
"learning_rate": 4.249580324987482e-06,
"loss": 0.3565,
"step": 2950
},
{
"epoch": 1.7864795533423872,
"grad_norm": 0.5894304003956816,
"learning_rate": 4.2148795947404664e-06,
"loss": 0.3548,
"step": 2960
},
{
"epoch": 1.7925154670288215,
"grad_norm": 0.5546376741165042,
"learning_rate": 4.180217593506394e-06,
"loss": 0.3545,
"step": 2970
},
{
"epoch": 1.7985513807152558,
"grad_norm": 0.5882950021870835,
"learning_rate": 4.1455960311186645e-06,
"loss": 0.3578,
"step": 2980
},
{
"epoch": 1.80458729440169,
"grad_norm": 0.6581353476419389,
"learning_rate": 4.111016615415887e-06,
"loss": 0.3545,
"step": 2990
},
{
"epoch": 1.8106232080881244,
"grad_norm": 0.728199708802779,
"learning_rate": 4.076481052157621e-06,
"loss": 0.3567,
"step": 3000
},
{
"epoch": 1.8166591217745585,
"grad_norm": 0.5836951966903218,
"learning_rate": 4.0419910449402385e-06,
"loss": 0.3541,
"step": 3010
},
{
"epoch": 1.8226950354609928,
"grad_norm": 0.583825208842142,
"learning_rate": 4.0075482951128965e-06,
"loss": 0.3557,
"step": 3020
},
{
"epoch": 1.828730949147427,
"grad_norm": 0.627394077298899,
"learning_rate": 3.973154501693597e-06,
"loss": 0.352,
"step": 3030
},
{
"epoch": 1.8347668628338614,
"grad_norm": 0.6500394437203815,
"learning_rate": 3.938811361285386e-06,
"loss": 0.3543,
"step": 3040
},
{
"epoch": 1.8408027765202957,
"grad_norm": 0.5787408936785984,
"learning_rate": 3.904520567992655e-06,
"loss": 0.3539,
"step": 3050
},
{
"epoch": 1.84683869020673,
"grad_norm": 0.6006488260082842,
"learning_rate": 3.870283813337587e-06,
"loss": 0.3534,
"step": 3060
},
{
"epoch": 1.8528746038931643,
"grad_norm": 0.6017706438925717,
"learning_rate": 3.836102786176697e-06,
"loss": 0.3533,
"step": 3070
},
{
"epoch": 1.8589105175795986,
"grad_norm": 0.6160731963284618,
"learning_rate": 3.8019791726175353e-06,
"loss": 0.3537,
"step": 3080
},
{
"epoch": 1.8649464312660329,
"grad_norm": 0.7394723530516694,
"learning_rate": 3.767914655935513e-06,
"loss": 0.3512,
"step": 3090
},
{
"epoch": 1.8709823449524672,
"grad_norm": 0.5969802619046902,
"learning_rate": 3.73391091649086e-06,
"loss": 0.3514,
"step": 3100
},
{
"epoch": 1.8770182586389015,
"grad_norm": 0.6434909203687009,
"learning_rate": 3.6999696316457468e-06,
"loss": 0.3525,
"step": 3110
},
{
"epoch": 1.8830541723253358,
"grad_norm": 0.6185839002292769,
"learning_rate": 3.6660924756815314e-06,
"loss": 0.3516,
"step": 3120
},
{
"epoch": 1.88909008601177,
"grad_norm": 0.5764246370880874,
"learning_rate": 3.63228111971618e-06,
"loss": 0.3543,
"step": 3130
},
{
"epoch": 1.8951259996982044,
"grad_norm": 0.5724269342695871,
"learning_rate": 3.5985372316218187e-06,
"loss": 0.3524,
"step": 3140
},
{
"epoch": 1.9011619133846387,
"grad_norm": 0.5893980753783277,
"learning_rate": 3.5648624759424723e-06,
"loss": 0.3487,
"step": 3150
},
{
"epoch": 1.907197827071073,
"grad_norm": 0.6385286384600478,
"learning_rate": 3.5312585138119503e-06,
"loss": 0.353,
"step": 3160
},
{
"epoch": 1.9132337407575073,
"grad_norm": 0.643587632906283,
"learning_rate": 3.4977270028719013e-06,
"loss": 0.3498,
"step": 3170
},
{
"epoch": 1.9192696544439416,
"grad_norm": 0.6189874783125575,
"learning_rate": 3.4642695971900506e-06,
"loss": 0.3542,
"step": 3180
},
{
"epoch": 1.925305568130376,
"grad_norm": 0.6320316722606764,
"learning_rate": 3.4308879471785986e-06,
"loss": 0.3523,
"step": 3190
},
{
"epoch": 1.93134148181681,
"grad_norm": 0.6715762862677156,
"learning_rate": 3.3975836995128176e-06,
"loss": 0.3505,
"step": 3200
},
{
"epoch": 1.9373773955032443,
"grad_norm": 0.5947951437286136,
"learning_rate": 3.3643584970498166e-06,
"loss": 0.356,
"step": 3210
},
{
"epoch": 1.9434133091896786,
"grad_norm": 0.5953138896683005,
"learning_rate": 3.3312139787474986e-06,
"loss": 0.3552,
"step": 3220
},
{
"epoch": 1.9494492228761129,
"grad_norm": 0.5696476474991146,
"learning_rate": 3.298151779583725e-06,
"loss": 0.3496,
"step": 3230
},
{
"epoch": 1.9554851365625472,
"grad_norm": 0.6131972032987533,
"learning_rate": 3.2651735304756505e-06,
"loss": 0.3536,
"step": 3240
},
{
"epoch": 1.9615210502489815,
"grad_norm": 0.6336317988993604,
"learning_rate": 3.2322808581992825e-06,
"loss": 0.3563,
"step": 3250
},
{
"epoch": 1.9675569639354156,
"grad_norm": 0.6341579320490388,
"learning_rate": 3.1994753853092284e-06,
"loss": 0.3482,
"step": 3260
},
{
"epoch": 1.9735928776218499,
"grad_norm": 0.5954681993221721,
"learning_rate": 3.166758730058653e-06,
"loss": 0.3518,
"step": 3270
},
{
"epoch": 1.9796287913082842,
"grad_norm": 0.5893599270303087,
"learning_rate": 3.134132506319467e-06,
"loss": 0.3536,
"step": 3280
},
{
"epoch": 1.9856647049947185,
"grad_norm": 0.5689301232875419,
"learning_rate": 3.101598323502698e-06,
"loss": 0.3537,
"step": 3290
},
{
"epoch": 1.9917006186811528,
"grad_norm": 0.6116819898452338,
"learning_rate": 3.0691577864791176e-06,
"loss": 0.3515,
"step": 3300
},
{
"epoch": 1.997736532367587,
"grad_norm": 0.5926997741101551,
"learning_rate": 3.036812495500058e-06,
"loss": 0.3504,
"step": 3310
},
{
"epoch": 2.0036215482118607,
"grad_norm": 0.5928785278377309,
"learning_rate": 3.0045640461184917e-06,
"loss": 0.339,
"step": 3320
},
{
"epoch": 2.009657461898295,
"grad_norm": 0.6039984062866832,
"learning_rate": 2.97241402911031e-06,
"loss": 0.3325,
"step": 3330
},
{
"epoch": 2.0156933755847293,
"grad_norm": 0.6671960610879556,
"learning_rate": 2.940364030395856e-06,
"loss": 0.3284,
"step": 3340
},
{
"epoch": 2.0217292892711636,
"grad_norm": 0.5808483500966948,
"learning_rate": 2.908415630961702e-06,
"loss": 0.3265,
"step": 3350
},
{
"epoch": 2.027765202957598,
"grad_norm": 0.6017580883286716,
"learning_rate": 2.876570406782645e-06,
"loss": 0.3296,
"step": 3360
},
{
"epoch": 2.033801116644032,
"grad_norm": 0.6067555273933171,
"learning_rate": 2.844829928743987e-06,
"loss": 0.3315,
"step": 3370
},
{
"epoch": 2.0398370303304665,
"grad_norm": 0.5774545226545359,
"learning_rate": 2.813195762564018e-06,
"loss": 0.3268,
"step": 3380
},
{
"epoch": 2.0458729440169003,
"grad_norm": 0.5888748284507602,
"learning_rate": 2.781669468716811e-06,
"loss": 0.3292,
"step": 3390
},
{
"epoch": 2.0519088577033346,
"grad_norm": 0.6137376399757654,
"learning_rate": 2.7502526023552227e-06,
"loss": 0.3258,
"step": 3400
},
{
"epoch": 2.057944771389769,
"grad_norm": 0.59390579398881,
"learning_rate": 2.718946713234185e-06,
"loss": 0.3295,
"step": 3410
},
{
"epoch": 2.0639806850762032,
"grad_norm": 0.6555105104152712,
"learning_rate": 2.6877533456342714e-06,
"loss": 0.3301,
"step": 3420
},
{
"epoch": 2.0700165987626375,
"grad_norm": 0.6048063575727766,
"learning_rate": 2.6566740382855005e-06,
"loss": 0.3289,
"step": 3430
},
{
"epoch": 2.076052512449072,
"grad_norm": 0.6014841818951663,
"learning_rate": 2.625710324291442e-06,
"loss": 0.3325,
"step": 3440
},
{
"epoch": 2.082088426135506,
"grad_norm": 0.6035697169885135,
"learning_rate": 2.5948637310535886e-06,
"loss": 0.3296,
"step": 3450
},
{
"epoch": 2.0881243398219405,
"grad_norm": 0.6112233467387164,
"learning_rate": 2.5641357801960186e-06,
"loss": 0.3278,
"step": 3460
},
{
"epoch": 2.0941602535083748,
"grad_norm": 0.5870217829586826,
"learning_rate": 2.5335279874903185e-06,
"loss": 0.3313,
"step": 3470
},
{
"epoch": 2.100196167194809,
"grad_norm": 0.5897131296840935,
"learning_rate": 2.503041862780827e-06,
"loss": 0.3296,
"step": 3480
},
{
"epoch": 2.1062320808812434,
"grad_norm": 0.5718259687035243,
"learning_rate": 2.47267890991016e-06,
"loss": 0.3281,
"step": 3490
},
{
"epoch": 2.1122679945676777,
"grad_norm": 0.5777856500315681,
"learning_rate": 2.4424406266450045e-06,
"loss": 0.3296,
"step": 3500
},
{
"epoch": 2.118303908254112,
"grad_norm": 0.6262457739159312,
"learning_rate": 2.412328504602264e-06,
"loss": 0.3336,
"step": 3510
},
{
"epoch": 2.1243398219405463,
"grad_norm": 0.589194023665236,
"learning_rate": 2.382344029175462e-06,
"loss": 0.3349,
"step": 3520
},
{
"epoch": 2.1303757356269806,
"grad_norm": 0.6140628916832596,
"learning_rate": 2.3524886794614653e-06,
"loss": 0.331,
"step": 3530
},
{
"epoch": 2.136411649313415,
"grad_norm": 0.6028871935735021,
"learning_rate": 2.322763928187543e-06,
"loss": 0.3307,
"step": 3540
},
{
"epoch": 2.142447562999849,
"grad_norm": 0.5798390235554982,
"learning_rate": 2.293171241638698e-06,
"loss": 0.3298,
"step": 3550
},
{
"epoch": 2.1484834766862835,
"grad_norm": 0.5950496656474389,
"learning_rate": 2.263712079585345e-06,
"loss": 0.3305,
"step": 3560
},
{
"epoch": 2.154519390372718,
"grad_norm": 0.5926734664470145,
"learning_rate": 2.2343878952113012e-06,
"loss": 0.3276,
"step": 3570
},
{
"epoch": 2.160555304059152,
"grad_norm": 0.5877698580097848,
"learning_rate": 2.2052001350421096e-06,
"loss": 0.3268,
"step": 3580
},
{
"epoch": 2.1665912177455864,
"grad_norm": 0.5888247000199527,
"learning_rate": 2.1761502388736655e-06,
"loss": 0.3327,
"step": 3590
},
{
"epoch": 2.1726271314320207,
"grad_norm": 0.5807991121980183,
"learning_rate": 2.14723963970121e-06,
"loss": 0.3315,
"step": 3600
},
{
"epoch": 2.178663045118455,
"grad_norm": 0.5763459777490838,
"learning_rate": 2.118469763648643e-06,
"loss": 0.3278,
"step": 3610
},
{
"epoch": 2.1846989588048893,
"grad_norm": 0.5588744726618396,
"learning_rate": 2.0898420298981537e-06,
"loss": 0.3296,
"step": 3620
},
{
"epoch": 2.1907348724913236,
"grad_norm": 0.6040859182215225,
"learning_rate": 2.061357850620243e-06,
"loss": 0.3279,
"step": 3630
},
{
"epoch": 2.1967707861777575,
"grad_norm": 0.6083091005217864,
"learning_rate": 2.0330186309040394e-06,
"loss": 0.3298,
"step": 3640
},
{
"epoch": 2.2028066998641918,
"grad_norm": 0.568667447432841,
"learning_rate": 2.0048257686879997e-06,
"loss": 0.3286,
"step": 3650
},
{
"epoch": 2.208842613550626,
"grad_norm": 0.586169393672314,
"learning_rate": 1.9767806546909457e-06,
"loss": 0.3316,
"step": 3660
},
{
"epoch": 2.2148785272370604,
"grad_norm": 0.5855668928973393,
"learning_rate": 1.9488846723434646e-06,
"loss": 0.3262,
"step": 3670
},
{
"epoch": 2.2209144409234947,
"grad_norm": 0.5920501956876788,
"learning_rate": 1.921139197719664e-06,
"loss": 0.3298,
"step": 3680
},
{
"epoch": 2.226950354609929,
"grad_norm": 0.6343784219115092,
"learning_rate": 1.893545599469292e-06,
"loss": 0.3316,
"step": 3690
},
{
"epoch": 2.2329862682963633,
"grad_norm": 0.56167618088226,
"learning_rate": 1.86610523875023e-06,
"loss": 0.3288,
"step": 3700
},
{
"epoch": 2.2390221819827976,
"grad_norm": 0.5937195941687996,
"learning_rate": 1.8388194691613308e-06,
"loss": 0.3285,
"step": 3710
},
{
"epoch": 2.245058095669232,
"grad_norm": 0.6068056100462802,
"learning_rate": 1.811689636675672e-06,
"loss": 0.3295,
"step": 3720
},
{
"epoch": 2.251094009355666,
"grad_norm": 0.5998463693882512,
"learning_rate": 1.7847170795741414e-06,
"loss": 0.33,
"step": 3730
},
{
"epoch": 2.2571299230421005,
"grad_norm": 0.5707846476820784,
"learning_rate": 1.7579031283794234e-06,
"loss": 0.3324,
"step": 3740
},
{
"epoch": 2.2631658367285348,
"grad_norm": 0.6070101386107148,
"learning_rate": 1.7312491057903808e-06,
"loss": 0.3288,
"step": 3750
},
{
"epoch": 2.269201750414969,
"grad_norm": 0.5684370763425239,
"learning_rate": 1.7047563266167888e-06,
"loss": 0.3291,
"step": 3760
},
{
"epoch": 2.2752376641014034,
"grad_norm": 0.5367883177519198,
"learning_rate": 1.678426097714489e-06,
"loss": 0.3265,
"step": 3770
},
{
"epoch": 2.2812735777878377,
"grad_norm": 0.5853244396608877,
"learning_rate": 1.6522597179209187e-06,
"loss": 0.3259,
"step": 3780
},
{
"epoch": 2.287309491474272,
"grad_norm": 0.5641343283784108,
"learning_rate": 1.6262584779910472e-06,
"loss": 0.3286,
"step": 3790
},
{
"epoch": 2.2933454051607063,
"grad_norm": 0.5563209895809159,
"learning_rate": 1.600423660533692e-06,
"loss": 0.3281,
"step": 3800
},
{
"epoch": 2.2993813188471406,
"grad_norm": 0.5805361043294971,
"learning_rate": 1.5747565399482605e-06,
"loss": 0.3299,
"step": 3810
},
{
"epoch": 2.305417232533575,
"grad_norm": 0.5811803574606669,
"learning_rate": 1.5492583823618878e-06,
"loss": 0.3289,
"step": 3820
},
{
"epoch": 2.311453146220009,
"grad_norm": 0.6040233888147246,
"learning_rate": 1.523930445566963e-06,
"loss": 0.3308,
"step": 3830
},
{
"epoch": 2.3174890599064435,
"grad_norm": 0.6059976475921155,
"learning_rate": 1.4987739789591056e-06,
"loss": 0.3294,
"step": 3840
},
{
"epoch": 2.323524973592878,
"grad_norm": 0.5905386095910952,
"learning_rate": 1.4737902234755203e-06,
"loss": 0.3301,
"step": 3850
},
{
"epoch": 2.329560887279312,
"grad_norm": 0.5747067002149818,
"learning_rate": 1.448980411533782e-06,
"loss": 0.3278,
"step": 3860
},
{
"epoch": 2.335596800965746,
"grad_norm": 0.5732211405787891,
"learning_rate": 1.4243457669710564e-06,
"loss": 0.3245,
"step": 3870
},
{
"epoch": 2.3416327146521807,
"grad_norm": 0.6079651710560006,
"learning_rate": 1.3998875049837141e-06,
"loss": 0.3268,
"step": 3880
},
{
"epoch": 2.3476686283386146,
"grad_norm": 0.5783578941572416,
"learning_rate": 1.3756068320673938e-06,
"loss": 0.3283,
"step": 3890
},
{
"epoch": 2.353704542025049,
"grad_norm": 0.5532376575030373,
"learning_rate": 1.3515049459574847e-06,
"loss": 0.3254,
"step": 3900
},
{
"epoch": 2.359740455711483,
"grad_norm": 0.5467274114487632,
"learning_rate": 1.3275830355700519e-06,
"loss": 0.3257,
"step": 3910
},
{
"epoch": 2.3657763693979175,
"grad_norm": 0.5922264462167515,
"learning_rate": 1.3038422809431733e-06,
"loss": 0.3291,
"step": 3920
},
{
"epoch": 2.3718122830843518,
"grad_norm": 0.5807751637804499,
"learning_rate": 1.280283853178742e-06,
"loss": 0.3281,
"step": 3930
},
{
"epoch": 2.377848196770786,
"grad_norm": 0.5751202261036737,
"learning_rate": 1.256908914384698e-06,
"loss": 0.3321,
"step": 3940
},
{
"epoch": 2.3838841104572204,
"grad_norm": 0.5829573981972134,
"learning_rate": 1.233718617617689e-06,
"loss": 0.3303,
"step": 3950
},
{
"epoch": 2.3899200241436547,
"grad_norm": 0.5614143554083199,
"learning_rate": 1.2107141068262119e-06,
"loss": 0.3276,
"step": 3960
},
{
"epoch": 2.395955937830089,
"grad_norm": 0.5657826082869326,
"learning_rate": 1.1878965167941658e-06,
"loss": 0.3279,
"step": 3970
},
{
"epoch": 2.4019918515165233,
"grad_norm": 0.5583977788315128,
"learning_rate": 1.1652669730848837e-06,
"loss": 0.3259,
"step": 3980
},
{
"epoch": 2.4080277652029576,
"grad_norm": 0.5670227130617606,
"learning_rate": 1.1428265919856057e-06,
"loss": 0.3319,
"step": 3990
},
{
"epoch": 2.414063678889392,
"grad_norm": 0.5345020446470288,
"learning_rate": 1.1205764804524172e-06,
"loss": 0.3258,
"step": 4000
},
{
"epoch": 2.420099592575826,
"grad_norm": 0.5742530447532448,
"learning_rate": 1.0985177360556421e-06,
"loss": 0.3281,
"step": 4010
},
{
"epoch": 2.4261355062622605,
"grad_norm": 0.5681633515485598,
"learning_rate": 1.0766514469257006e-06,
"loss": 0.33,
"step": 4020
},
{
"epoch": 2.432171419948695,
"grad_norm": 0.5469547021834809,
"learning_rate": 1.0549786916994387e-06,
"loss": 0.3271,
"step": 4030
},
{
"epoch": 2.438207333635129,
"grad_norm": 0.5467836338693935,
"learning_rate": 1.0335005394669062e-06,
"loss": 0.3282,
"step": 4040
},
{
"epoch": 2.4442432473215634,
"grad_norm": 0.5496370736783344,
"learning_rate": 1.012218049718639e-06,
"loss": 0.3267,
"step": 4050
},
{
"epoch": 2.4502791610079977,
"grad_norm": 0.5532695447765059,
"learning_rate": 9.911322722933825e-07,
"loss": 0.3267,
"step": 4060
},
{
"epoch": 2.456315074694432,
"grad_norm": 0.5593061519759683,
"learning_rate": 9.702442473263035e-07,
"loss": 0.3261,
"step": 4070
},
{
"epoch": 2.4623509883808663,
"grad_norm": 0.5675718980431652,
"learning_rate": 9.495550051976937e-07,
"loss": 0.33,
"step": 4080
},
{
"epoch": 2.4683869020673006,
"grad_norm": 0.55036807877547,
"learning_rate": 9.290655664821296e-07,
"loss": 0.326,
"step": 4090
},
{
"epoch": 2.474422815753735,
"grad_norm": 0.6047393707132771,
"learning_rate": 9.087769418981352e-07,
"loss": 0.3294,
"step": 4100
},
{
"epoch": 2.480458729440169,
"grad_norm": 0.5377856224781872,
"learning_rate": 8.88690132258323e-07,
"loss": 0.3301,
"step": 4110
},
{
"epoch": 2.486494643126603,
"grad_norm": 0.5404023215833121,
"learning_rate": 8.688061284200266e-07,
"loss": 0.3308,
"step": 4120
},
{
"epoch": 2.492530556813038,
"grad_norm": 0.5429038964087051,
"learning_rate": 8.491259112364192e-07,
"loss": 0.3277,
"step": 4130
},
{
"epoch": 2.4985664704994717,
"grad_norm": 0.5556392061166345,
"learning_rate": 8.296504515081333e-07,
"loss": 0.328,
"step": 4140
},
{
"epoch": 2.5046023841859064,
"grad_norm": 0.5550852188468128,
"learning_rate": 8.103807099353733e-07,
"loss": 0.3303,
"step": 4150
},
{
"epoch": 2.5106382978723403,
"grad_norm": 0.5683960534884703,
"learning_rate": 7.913176370705166e-07,
"loss": 0.3303,
"step": 4160
},
{
"epoch": 2.5166742115587746,
"grad_norm": 0.5647058376594801,
"learning_rate": 7.724621732712373e-07,
"loss": 0.3281,
"step": 4170
},
{
"epoch": 2.522710125245209,
"grad_norm": 0.5396463872633352,
"learning_rate": 7.538152486541078e-07,
"loss": 0.3224,
"step": 4180
},
{
"epoch": 2.528746038931643,
"grad_norm": 0.5769965957501234,
"learning_rate": 7.353777830487247e-07,
"loss": 0.3298,
"step": 4190
},
{
"epoch": 2.5347819526180775,
"grad_norm": 0.5617546845646423,
"learning_rate": 7.171506859523298e-07,
"loss": 0.3284,
"step": 4200
},
{
"epoch": 2.540817866304512,
"grad_norm": 0.5370456459767287,
"learning_rate": 6.991348564849504e-07,
"loss": 0.3272,
"step": 4210
},
{
"epoch": 2.546853779990946,
"grad_norm": 0.5449920129863155,
"learning_rate": 6.813311833450426e-07,
"loss": 0.3244,
"step": 4220
},
{
"epoch": 2.5528896936773804,
"grad_norm": 0.5814796250543772,
"learning_rate": 6.637405447656542e-07,
"loss": 0.3286,
"step": 4230
},
{
"epoch": 2.5589256073638147,
"grad_norm": 0.5802300234417045,
"learning_rate": 6.463638084711088e-07,
"loss": 0.3303,
"step": 4240
},
{
"epoch": 2.564961521050249,
"grad_norm": 0.5682016106324166,
"learning_rate": 6.29201831634188e-07,
"loss": 0.3275,
"step": 4250
},
{
"epoch": 2.5709974347366833,
"grad_norm": 0.628799960343276,
"learning_rate": 6.122554608338605e-07,
"loss": 0.3278,
"step": 4260
},
{
"epoch": 2.5770333484231176,
"grad_norm": 0.5261749879449605,
"learning_rate": 5.955255320135195e-07,
"loss": 0.3287,
"step": 4270
},
{
"epoch": 2.583069262109552,
"grad_norm": 0.5365103226953842,
"learning_rate": 5.790128704397424e-07,
"loss": 0.3242,
"step": 4280
},
{
"epoch": 2.589105175795986,
"grad_norm": 0.5482210552849281,
"learning_rate": 5.627182906615825e-07,
"loss": 0.3254,
"step": 4290
},
{
"epoch": 2.5951410894824205,
"grad_norm": 0.5270093070193902,
"learning_rate": 5.466425964703914e-07,
"loss": 0.3268,
"step": 4300
},
{
"epoch": 2.601177003168855,
"grad_norm": 0.5351843851712077,
"learning_rate": 5.307865808601664e-07,
"loss": 0.3267,
"step": 4310
},
{
"epoch": 2.607212916855289,
"grad_norm": 0.5551045883829538,
"learning_rate": 5.151510259884329e-07,
"loss": 0.3261,
"step": 4320
},
{
"epoch": 2.6132488305417234,
"grad_norm": 0.5716515174477422,
"learning_rate": 4.997367031376627e-07,
"loss": 0.3283,
"step": 4330
},
{
"epoch": 2.6192847442281577,
"grad_norm": 0.5484469831279773,
"learning_rate": 4.84544372677228e-07,
"loss": 0.3279,
"step": 4340
},
{
"epoch": 2.6253206579145916,
"grad_norm": 0.5627722024643765,
"learning_rate": 4.6957478402589076e-07,
"loss": 0.3285,
"step": 4350
},
{
"epoch": 2.6313565716010263,
"grad_norm": 0.54030007506572,
"learning_rate": 4.548286756148401e-07,
"loss": 0.328,
"step": 4360
},
{
"epoch": 2.63739248528746,
"grad_norm": 0.5688872966757411,
"learning_rate": 4.4030677485125906e-07,
"loss": 0.3291,
"step": 4370
},
{
"epoch": 2.643428398973895,
"grad_norm": 0.5611453338620043,
"learning_rate": 4.2600979808244627e-07,
"loss": 0.3267,
"step": 4380
},
{
"epoch": 2.649464312660329,
"grad_norm": 0.5591585705521456,
"learning_rate": 4.119384505604834e-07,
"loss": 0.3285,
"step": 4390
},
{
"epoch": 2.655500226346763,
"grad_norm": 0.5403567309346599,
"learning_rate": 3.980934264074393e-07,
"loss": 0.3234,
"step": 4400
},
{
"epoch": 2.6615361400331974,
"grad_norm": 0.5366841662024877,
"learning_rate": 3.8447540858113197e-07,
"loss": 0.3289,
"step": 4410
},
{
"epoch": 2.6675720537196317,
"grad_norm": 0.5505493242335168,
"learning_rate": 3.710850688414419e-07,
"loss": 0.329,
"step": 4420
},
{
"epoch": 2.673607967406066,
"grad_norm": 0.5572305600353893,
"learning_rate": 3.579230677171702e-07,
"loss": 0.326,
"step": 4430
},
{
"epoch": 2.6796438810925003,
"grad_norm": 0.5320801899819191,
"learning_rate": 3.4499005447346024e-07,
"loss": 0.3272,
"step": 4440
},
{
"epoch": 2.6856797947789346,
"grad_norm": 0.5621605698475473,
"learning_rate": 3.32286667079767e-07,
"loss": 0.3232,
"step": 4450
},
{
"epoch": 2.691715708465369,
"grad_norm": 0.5535800034831663,
"learning_rate": 3.1981353217838853e-07,
"loss": 0.3267,
"step": 4460
},
{
"epoch": 2.697751622151803,
"grad_norm": 0.5541989505631728,
"learning_rate": 3.0757126505355284e-07,
"loss": 0.3271,
"step": 4470
},
{
"epoch": 2.7037875358382375,
"grad_norm": 0.554309743511386,
"learning_rate": 2.9556046960106997e-07,
"loss": 0.3275,
"step": 4480
},
{
"epoch": 2.709823449524672,
"grad_norm": 0.5441084268121339,
"learning_rate": 2.837817382985375e-07,
"loss": 0.3265,
"step": 4490
},
{
"epoch": 2.715859363211106,
"grad_norm": 0.5661752729331364,
"learning_rate": 2.722356521761188e-07,
"loss": 0.3251,
"step": 4500
},
{
"epoch": 2.7218952768975404,
"grad_norm": 0.5284497315283775,
"learning_rate": 2.6092278078788004e-07,
"loss": 0.3249,
"step": 4510
},
{
"epoch": 2.7279311905839747,
"grad_norm": 0.52978683625873,
"learning_rate": 2.4984368218369305e-07,
"loss": 0.3282,
"step": 4520
},
{
"epoch": 2.733967104270409,
"grad_norm": 0.5435219044017648,
"learning_rate": 2.389989028817108e-07,
"loss": 0.3283,
"step": 4530
},
{
"epoch": 2.7400030179568433,
"grad_norm": 0.5516780362582209,
"learning_rate": 2.2838897784140612e-07,
"loss": 0.3274,
"step": 4540
},
{
"epoch": 2.7460389316432776,
"grad_norm": 0.546719555306795,
"learning_rate": 2.1801443043718285e-07,
"loss": 0.3298,
"step": 4550
},
{
"epoch": 2.752074845329712,
"grad_norm": 0.5563980632574993,
"learning_rate": 2.0787577243255807e-07,
"loss": 0.3267,
"step": 4560
},
{
"epoch": 2.758110759016146,
"grad_norm": 0.535638021015215,
"learning_rate": 1.9797350395492077e-07,
"loss": 0.3253,
"step": 4570
},
{
"epoch": 2.76414667270258,
"grad_norm": 0.5347609121819951,
"learning_rate": 1.8830811347085697e-07,
"loss": 0.3252,
"step": 4580
},
{
"epoch": 2.770182586389015,
"grad_norm": 0.5415863482391344,
"learning_rate": 1.788800777620542e-07,
"loss": 0.3276,
"step": 4590
},
{
"epoch": 2.7762185000754487,
"grad_norm": 0.5466212280219622,
"learning_rate": 1.6968986190178728e-07,
"loss": 0.326,
"step": 4600
},
{
"epoch": 2.7822544137618834,
"grad_norm": 0.5391843573715891,
"learning_rate": 1.60737919231973e-07,
"loss": 0.3265,
"step": 4610
},
{
"epoch": 2.7882903274483173,
"grad_norm": 0.5465887305789703,
"learning_rate": 1.5202469134080633e-07,
"loss": 0.3291,
"step": 4620
},
{
"epoch": 2.794326241134752,
"grad_norm": 0.5447449635613493,
"learning_rate": 1.4355060804098043e-07,
"loss": 0.3254,
"step": 4630
},
{
"epoch": 2.800362154821186,
"grad_norm": 0.5376834372862567,
"learning_rate": 1.3531608734848433e-07,
"loss": 0.3252,
"step": 4640
},
{
"epoch": 2.80639806850762,
"grad_norm": 0.5419447242645747,
"learning_rate": 1.273215354619789e-07,
"loss": 0.3277,
"step": 4650
},
{
"epoch": 2.8124339821940545,
"grad_norm": 0.521436211709283,
"learning_rate": 1.1956734674276492e-07,
"loss": 0.3267,
"step": 4660
},
{
"epoch": 2.818469895880489,
"grad_norm": 0.5443036316275357,
"learning_rate": 1.1205390369532553e-07,
"loss": 0.328,
"step": 4670
},
{
"epoch": 2.824505809566923,
"grad_norm": 0.5736771187575125,
"learning_rate": 1.0478157694846002e-07,
"loss": 0.3269,
"step": 4680
},
{
"epoch": 2.8305417232533574,
"grad_norm": 0.5533030963421177,
"learning_rate": 9.775072523700135e-08,
"loss": 0.3274,
"step": 4690
},
{
"epoch": 2.8365776369397917,
"grad_norm": 0.5297867847542854,
"learning_rate": 9.096169538411747e-08,
"loss": 0.3251,
"step": 4700
},
{
"epoch": 2.842613550626226,
"grad_norm": 0.5603590658940372,
"learning_rate": 8.441482228420505e-08,
"loss": 0.3261,
"step": 4710
},
{
"epoch": 2.8486494643126603,
"grad_norm": 0.5648155137748375,
"learning_rate": 7.81104288863721e-08,
"loss": 0.3238,
"step": 4720
},
{
"epoch": 2.8546853779990946,
"grad_norm": 0.5155233113764542,
"learning_rate": 7.204882617850129e-08,
"loss": 0.3284,
"step": 4730
},
{
"epoch": 2.860721291685529,
"grad_norm": 0.5283055469638852,
"learning_rate": 6.623031317191386e-08,
"loss": 0.3243,
"step": 4740
},
{
"epoch": 2.866757205371963,
"grad_norm": 0.5324757215458941,
"learning_rate": 6.065517688661926e-08,
"loss": 0.3266,
"step": 4750
},
{
"epoch": 2.8727931190583975,
"grad_norm": 0.5270694862009192,
"learning_rate": 5.532369233715418e-08,
"loss": 0.3263,
"step": 4760
},
{
"epoch": 2.878829032744832,
"grad_norm": 0.5184850936640313,
"learning_rate": 5.02361225190201e-08,
"loss": 0.325,
"step": 4770
},
{
"epoch": 2.884864946431266,
"grad_norm": 0.5522807685327075,
"learning_rate": 4.539271839570702e-08,
"loss": 0.3303,
"step": 4780
},
{
"epoch": 2.8909008601177004,
"grad_norm": 0.5833975533295399,
"learning_rate": 4.079371888631667e-08,
"loss": 0.3287,
"step": 4790
},
{
"epoch": 2.8969367738041347,
"grad_norm": 0.5435014494666157,
"learning_rate": 3.643935085377193e-08,
"loss": 0.3291,
"step": 4800
},
{
"epoch": 2.902972687490569,
"grad_norm": 0.5317951774661862,
"learning_rate": 3.232982909363247e-08,
"loss": 0.3302,
"step": 4810
},
{
"epoch": 2.9090086011770033,
"grad_norm": 0.5470417295465569,
"learning_rate": 2.8465356323494897e-08,
"loss": 0.3293,
"step": 4820
},
{
"epoch": 2.915044514863437,
"grad_norm": 0.5361189628769133,
"learning_rate": 2.4846123172992953e-08,
"loss": 0.3281,
"step": 4830
},
{
"epoch": 2.921080428549872,
"grad_norm": 0.54177713240335,
"learning_rate": 2.147230817439616e-08,
"loss": 0.326,
"step": 4840
},
{
"epoch": 2.927116342236306,
"grad_norm": 0.5323052126594137,
"learning_rate": 1.834407775380187e-08,
"loss": 0.3281,
"step": 4850
},
{
"epoch": 2.9331522559227405,
"grad_norm": 0.5359903004283559,
"learning_rate": 1.5461586222924596e-08,
"loss": 0.3261,
"step": 4860
},
{
"epoch": 2.9391881696091744,
"grad_norm": 0.5543427271655068,
"learning_rate": 1.2824975771486558e-08,
"loss": 0.3264,
"step": 4870
},
{
"epoch": 2.945224083295609,
"grad_norm": 0.5384618149718552,
"learning_rate": 1.0434376460201067e-08,
"loss": 0.3271,
"step": 4880
},
{
"epoch": 2.951259996982043,
"grad_norm": 0.5121547980752482,
"learning_rate": 8.289906214358767e-09,
"loss": 0.3252,
"step": 4890
},
{
"epoch": 2.9572959106684773,
"grad_norm": 0.5126843579972032,
"learning_rate": 6.391670818008955e-09,
"loss": 0.3255,
"step": 4900
},
{
"epoch": 2.9633318243549116,
"grad_norm": 0.5323524113852374,
"learning_rate": 4.7397639087432e-09,
"loss": 0.3267,
"step": 4910
},
{
"epoch": 2.969367738041346,
"grad_norm": 0.5324354968490875,
"learning_rate": 3.3342669730729303e-09,
"loss": 0.3255,
"step": 4920
},
{
"epoch": 2.97540365172778,
"grad_norm": 0.550825930999869,
"learning_rate": 2.1752493424148647e-09,
"loss": 0.328,
"step": 4930
},
{
"epoch": 2.9814395654142145,
"grad_norm": 0.5289245768111625,
"learning_rate": 1.2627681896670852e-09,
"loss": 0.3265,
"step": 4940
},
{
"epoch": 2.987475479100649,
"grad_norm": 0.5548931035545003,
"learning_rate": 5.968685263885165e-10,
"loss": 0.329,
"step": 4950
},
{
"epoch": 2.993511392787083,
"grad_norm": 0.5268141741402684,
"learning_rate": 1.7758320058236522e-10,
"loss": 0.3264,
"step": 4960
},
{
"epoch": 2.9995473064735174,
"grad_norm": 0.5394909098085136,
"learning_rate": 4.932895071863009e-12,
"loss": 0.3267,
"step": 4970
},
{
"epoch": 3.0,
"step": 4971,
"total_flos": 3906508525600768.0,
"train_loss": 0.3714317911300974,
"train_runtime": 271631.3463,
"train_samples_per_second": 4.684,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 4971,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3906508525600768.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}