Files
CC-Zeta-0/checkpoint-6500/trainer_state.json
ModelHub XC ec3bdf1823 初始化项目,由ModelHub XC社区提供模型
Model: geasslabs/CC-Zeta-0
Source: Original Platform
2026-06-20 18:47:01 +08:00

4585 lines
115 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9745218752049776,
"eval_steps": 500,
"global_step": 6500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014992644233922732,
"grad_norm": 1.3515625,
"learning_rate": 3.6e-08,
"loss": 1.6646936416625977,
"step": 10
},
{
"epoch": 0.0029985288467845464,
"grad_norm": 1.0859375,
"learning_rate": 7.599999999999999e-08,
"loss": 1.642629623413086,
"step": 20
},
{
"epoch": 0.004497793270176819,
"grad_norm": 0.953125,
"learning_rate": 1.16e-07,
"loss": 1.6638397216796874,
"step": 30
},
{
"epoch": 0.005997057693569093,
"grad_norm": 0.99609375,
"learning_rate": 1.56e-07,
"loss": 1.648602294921875,
"step": 40
},
{
"epoch": 0.007496322116961366,
"grad_norm": 1.1796875,
"learning_rate": 1.96e-07,
"loss": 1.6447210311889648,
"step": 50
},
{
"epoch": 0.008995586540353638,
"grad_norm": 1.125,
"learning_rate": 2.3599999999999997e-07,
"loss": 1.6022998809814453,
"step": 60
},
{
"epoch": 0.010494850963745913,
"grad_norm": 1.1171875,
"learning_rate": 2.7600000000000004e-07,
"loss": 1.5853511810302734,
"step": 70
},
{
"epoch": 0.011994115387138186,
"grad_norm": 1.078125,
"learning_rate": 3.1599999999999997e-07,
"loss": 1.512193775177002,
"step": 80
},
{
"epoch": 0.013493379810530459,
"grad_norm": 1.0703125,
"learning_rate": 3.5599999999999996e-07,
"loss": 1.4463014602661133,
"step": 90
},
{
"epoch": 0.014992644233922731,
"grad_norm": 1.21875,
"learning_rate": 3.96e-07,
"loss": 1.3526597023010254,
"step": 100
},
{
"epoch": 0.016491908657315004,
"grad_norm": 1.71875,
"learning_rate": 4.36e-07,
"loss": 1.1787323951721191,
"step": 110
},
{
"epoch": 0.017991173080707277,
"grad_norm": 0.9765625,
"learning_rate": 4.76e-07,
"loss": 0.9612628936767578,
"step": 120
},
{
"epoch": 0.01949043750409955,
"grad_norm": 1.2578125,
"learning_rate": 5.16e-07,
"loss": 0.6817054748535156,
"step": 130
},
{
"epoch": 0.020989701927491826,
"grad_norm": 0.703125,
"learning_rate": 5.560000000000001e-07,
"loss": 0.4010798454284668,
"step": 140
},
{
"epoch": 0.0224889663508841,
"grad_norm": 0.44921875,
"learning_rate": 5.96e-07,
"loss": 0.33456034660339357,
"step": 150
},
{
"epoch": 0.02398823077427637,
"grad_norm": 0.447265625,
"learning_rate": 6.36e-07,
"loss": 0.351082444190979,
"step": 160
},
{
"epoch": 0.025487495197668644,
"grad_norm": 0.41015625,
"learning_rate": 6.76e-07,
"loss": 0.3510997772216797,
"step": 170
},
{
"epoch": 0.026986759621060917,
"grad_norm": 0.2734375,
"learning_rate": 7.159999999999999e-07,
"loss": 0.27989749908447265,
"step": 180
},
{
"epoch": 0.02848602404445319,
"grad_norm": 0.484375,
"learning_rate": 7.559999999999999e-07,
"loss": 0.25770542621612547,
"step": 190
},
{
"epoch": 0.029985288467845463,
"grad_norm": 0.361328125,
"learning_rate": 7.96e-07,
"loss": 0.24579532146453859,
"step": 200
},
{
"epoch": 0.031484552891237735,
"grad_norm": 0.357421875,
"learning_rate": 8.359999999999999e-07,
"loss": 0.19708189964294434,
"step": 210
},
{
"epoch": 0.03298381731463001,
"grad_norm": 0.337890625,
"learning_rate": 8.76e-07,
"loss": 0.19723033905029297,
"step": 220
},
{
"epoch": 0.03448308173802228,
"grad_norm": 0.447265625,
"learning_rate": 9.16e-07,
"loss": 0.2601869821548462,
"step": 230
},
{
"epoch": 0.035982346161414554,
"grad_norm": 0.2431640625,
"learning_rate": 9.559999999999998e-07,
"loss": 0.22059969902038573,
"step": 240
},
{
"epoch": 0.037481610584806827,
"grad_norm": 0.291015625,
"learning_rate": 9.959999999999999e-07,
"loss": 0.23547093868255614,
"step": 250
},
{
"epoch": 0.0389808750081991,
"grad_norm": 0.396484375,
"learning_rate": 1.036e-06,
"loss": 0.23184664249420167,
"step": 260
},
{
"epoch": 0.04048013943159137,
"grad_norm": 0.2373046875,
"learning_rate": 1.076e-06,
"loss": 0.1775584936141968,
"step": 270
},
{
"epoch": 0.04197940385498365,
"grad_norm": 0.3046875,
"learning_rate": 1.116e-06,
"loss": 0.1842654228210449,
"step": 280
},
{
"epoch": 0.043478668278375925,
"grad_norm": 0.29296875,
"learning_rate": 1.1559999999999998e-06,
"loss": 0.1996519684791565,
"step": 290
},
{
"epoch": 0.0449779327017682,
"grad_norm": 0.26171875,
"learning_rate": 1.1959999999999999e-06,
"loss": 0.2009890556335449,
"step": 300
},
{
"epoch": 0.04647719712516047,
"grad_norm": 0.359375,
"learning_rate": 1.236e-06,
"loss": 0.20845539569854737,
"step": 310
},
{
"epoch": 0.04797646154855274,
"grad_norm": 0.474609375,
"learning_rate": 1.276e-06,
"loss": 0.18711086511611938,
"step": 320
},
{
"epoch": 0.049475725971945016,
"grad_norm": 0.287109375,
"learning_rate": 1.316e-06,
"loss": 0.17694923877716065,
"step": 330
},
{
"epoch": 0.05097499039533729,
"grad_norm": 0.4921875,
"learning_rate": 1.356e-06,
"loss": 0.2112422227859497,
"step": 340
},
{
"epoch": 0.05247425481872956,
"grad_norm": 0.49609375,
"learning_rate": 1.3959999999999998e-06,
"loss": 0.16929364204406738,
"step": 350
},
{
"epoch": 0.053973519242121834,
"grad_norm": 0.216796875,
"learning_rate": 1.4359999999999999e-06,
"loss": 0.14373246431350709,
"step": 360
},
{
"epoch": 0.05547278366551411,
"grad_norm": 0.251953125,
"learning_rate": 1.476e-06,
"loss": 0.19208487272262573,
"step": 370
},
{
"epoch": 0.05697204808890638,
"grad_norm": 0.490234375,
"learning_rate": 1.516e-06,
"loss": 0.21074600219726564,
"step": 380
},
{
"epoch": 0.05847131251229865,
"grad_norm": 0.404296875,
"learning_rate": 1.556e-06,
"loss": 0.23820483684539795,
"step": 390
},
{
"epoch": 0.059970576935690925,
"grad_norm": 0.3828125,
"learning_rate": 1.596e-06,
"loss": 0.1559612274169922,
"step": 400
},
{
"epoch": 0.0614698413590832,
"grad_norm": 0.263671875,
"learning_rate": 1.6359999999999999e-06,
"loss": 0.17463357448577882,
"step": 410
},
{
"epoch": 0.06296910578247547,
"grad_norm": 0.2294921875,
"learning_rate": 1.676e-06,
"loss": 0.17741453647613525,
"step": 420
},
{
"epoch": 0.06446837020586775,
"grad_norm": 0.37890625,
"learning_rate": 1.716e-06,
"loss": 0.16474447250366211,
"step": 430
},
{
"epoch": 0.06596763462926002,
"grad_norm": 0.1826171875,
"learning_rate": 1.756e-06,
"loss": 0.15805156230926515,
"step": 440
},
{
"epoch": 0.0674668990526523,
"grad_norm": 0.357421875,
"learning_rate": 1.796e-06,
"loss": 0.1870645046234131,
"step": 450
},
{
"epoch": 0.06896616347604456,
"grad_norm": 0.3203125,
"learning_rate": 1.836e-06,
"loss": 0.17467626333236694,
"step": 460
},
{
"epoch": 0.07046542789943684,
"grad_norm": 0.296875,
"learning_rate": 1.8759999999999997e-06,
"loss": 0.1839710831642151,
"step": 470
},
{
"epoch": 0.07196469232282911,
"grad_norm": 0.271484375,
"learning_rate": 1.916e-06,
"loss": 0.15121291875839232,
"step": 480
},
{
"epoch": 0.07346395674622139,
"grad_norm": 0.1962890625,
"learning_rate": 1.956e-06,
"loss": 0.16356420516967773,
"step": 490
},
{
"epoch": 0.07496322116961365,
"grad_norm": 0.2294921875,
"learning_rate": 1.996e-06,
"loss": 0.18490909337997435,
"step": 500
},
{
"epoch": 0.07646248559300593,
"grad_norm": 0.306640625,
"learning_rate": 1.9999895001358395e-06,
"loss": 0.17003339529037476,
"step": 510
},
{
"epoch": 0.0779617500163982,
"grad_norm": 0.314453125,
"learning_rate": 1.9999532045921925e-06,
"loss": 0.14626400470733641,
"step": 520
},
{
"epoch": 0.07946101443979048,
"grad_norm": 0.1728515625,
"learning_rate": 1.9998909846818658e-06,
"loss": 0.1461304545402527,
"step": 530
},
{
"epoch": 0.08096027886318274,
"grad_norm": 0.1572265625,
"learning_rate": 1.9998028420179468e-06,
"loss": 0.1631840229034424,
"step": 540
},
{
"epoch": 0.08245954328657502,
"grad_norm": 0.255859375,
"learning_rate": 1.9996887788855846e-06,
"loss": 0.14891616106033326,
"step": 550
},
{
"epoch": 0.0839588077099673,
"grad_norm": 0.341796875,
"learning_rate": 1.999548798241933e-06,
"loss": 0.1451740264892578,
"step": 560
},
{
"epoch": 0.08545807213335957,
"grad_norm": 0.26953125,
"learning_rate": 1.9993829037160704e-06,
"loss": 0.13687235116958618,
"step": 570
},
{
"epoch": 0.08695733655675185,
"grad_norm": 0.19921875,
"learning_rate": 1.9991910996089085e-06,
"loss": 0.15143134593963622,
"step": 580
},
{
"epoch": 0.08845660098014411,
"grad_norm": 0.322265625,
"learning_rate": 1.998973390893081e-06,
"loss": 0.15538901090621948,
"step": 590
},
{
"epoch": 0.0899558654035364,
"grad_norm": 0.267578125,
"learning_rate": 1.998729783212812e-06,
"loss": 0.17548735141754152,
"step": 600
},
{
"epoch": 0.09145512982692866,
"grad_norm": 0.234375,
"learning_rate": 1.998460282883772e-06,
"loss": 0.1454736351966858,
"step": 610
},
{
"epoch": 0.09295439425032094,
"grad_norm": 0.392578125,
"learning_rate": 1.998164896892913e-06,
"loss": 0.13865782022476197,
"step": 620
},
{
"epoch": 0.0944536586737132,
"grad_norm": 0.259765625,
"learning_rate": 1.9978436328982882e-06,
"loss": 0.16720572710037232,
"step": 630
},
{
"epoch": 0.09595292309710549,
"grad_norm": 0.2197265625,
"learning_rate": 1.997496499228853e-06,
"loss": 0.14800021648406983,
"step": 640
},
{
"epoch": 0.09745218752049775,
"grad_norm": 0.1708984375,
"learning_rate": 1.9971235048842495e-06,
"loss": 0.14826395511627197,
"step": 650
},
{
"epoch": 0.09895145194389003,
"grad_norm": 0.1396484375,
"learning_rate": 1.996724659534572e-06,
"loss": 0.12433024644851684,
"step": 660
},
{
"epoch": 0.1004507163672823,
"grad_norm": 0.28125,
"learning_rate": 1.9962999735201173e-06,
"loss": 0.1702478051185608,
"step": 670
},
{
"epoch": 0.10194998079067458,
"grad_norm": 0.251953125,
"learning_rate": 1.9958494578511167e-06,
"loss": 0.1259335994720459,
"step": 680
},
{
"epoch": 0.10344924521406684,
"grad_norm": 0.2353515625,
"learning_rate": 1.99537312420745e-06,
"loss": 0.20034666061401368,
"step": 690
},
{
"epoch": 0.10494850963745912,
"grad_norm": 0.25390625,
"learning_rate": 1.994870984938344e-06,
"loss": 0.12428268194198608,
"step": 700
},
{
"epoch": 0.10644777406085139,
"grad_norm": 0.2578125,
"learning_rate": 1.9943430530620497e-06,
"loss": 0.11142982244491577,
"step": 710
},
{
"epoch": 0.10794703848424367,
"grad_norm": 0.361328125,
"learning_rate": 1.993789342265507e-06,
"loss": 0.1445391893386841,
"step": 720
},
{
"epoch": 0.10944630290763595,
"grad_norm": 0.353515625,
"learning_rate": 1.99320986690399e-06,
"loss": 0.1293397307395935,
"step": 730
},
{
"epoch": 0.11094556733102821,
"grad_norm": 0.26171875,
"learning_rate": 1.9926046420007326e-06,
"loss": 0.11696268320083618,
"step": 740
},
{
"epoch": 0.1124448317544205,
"grad_norm": 0.2490234375,
"learning_rate": 1.9919736832465417e-06,
"loss": 0.12922875881195067,
"step": 750
},
{
"epoch": 0.11394409617781276,
"grad_norm": 0.3046875,
"learning_rate": 1.9913170069993896e-06,
"loss": 0.13306174278259278,
"step": 760
},
{
"epoch": 0.11544336060120504,
"grad_norm": 0.1982421875,
"learning_rate": 1.9906346302839882e-06,
"loss": 0.13486032485961913,
"step": 770
},
{
"epoch": 0.1169426250245973,
"grad_norm": 0.1982421875,
"learning_rate": 1.9899265707913492e-06,
"loss": 0.13135333061218263,
"step": 780
},
{
"epoch": 0.11844188944798958,
"grad_norm": 0.294921875,
"learning_rate": 1.989192846878326e-06,
"loss": 0.12307331562042237,
"step": 790
},
{
"epoch": 0.11994115387138185,
"grad_norm": 0.29296875,
"learning_rate": 1.988433477567137e-06,
"loss": 0.11497733592987061,
"step": 800
},
{
"epoch": 0.12144041829477413,
"grad_norm": 0.1865234375,
"learning_rate": 1.9876484825448706e-06,
"loss": 0.13883528709411622,
"step": 810
},
{
"epoch": 0.1229396827181664,
"grad_norm": 0.37109375,
"learning_rate": 1.9868378821629795e-06,
"loss": 0.13286290168762208,
"step": 820
},
{
"epoch": 0.12443894714155868,
"grad_norm": 0.3046875,
"learning_rate": 1.9860016974367474e-06,
"loss": 0.1608394503593445,
"step": 830
},
{
"epoch": 0.12593821156495094,
"grad_norm": 0.27734375,
"learning_rate": 1.985139950044749e-06,
"loss": 0.1350063681602478,
"step": 840
},
{
"epoch": 0.1274374759883432,
"grad_norm": 0.2236328125,
"learning_rate": 1.9842526623282844e-06,
"loss": 0.14678356647491456,
"step": 850
},
{
"epoch": 0.1289367404117355,
"grad_norm": 0.1455078125,
"learning_rate": 1.9833398572908027e-06,
"loss": 0.13124724626541137,
"step": 860
},
{
"epoch": 0.13043600483512777,
"grad_norm": 0.22265625,
"learning_rate": 1.9824015585973037e-06,
"loss": 0.1295769214630127,
"step": 870
},
{
"epoch": 0.13193526925852003,
"grad_norm": 0.1455078125,
"learning_rate": 1.9814377905737253e-06,
"loss": 0.14678038358688356,
"step": 880
},
{
"epoch": 0.1334345336819123,
"grad_norm": 0.27734375,
"learning_rate": 1.980448578206312e-06,
"loss": 0.12379497289657593,
"step": 890
},
{
"epoch": 0.1349337981053046,
"grad_norm": 0.322265625,
"learning_rate": 1.9794339471409684e-06,
"loss": 0.1308390736579895,
"step": 900
},
{
"epoch": 0.13643306252869686,
"grad_norm": 0.322265625,
"learning_rate": 1.978393923682593e-06,
"loss": 0.1078214168548584,
"step": 910
},
{
"epoch": 0.13793232695208912,
"grad_norm": 0.203125,
"learning_rate": 1.9773285347943975e-06,
"loss": 0.12421451807022095,
"step": 920
},
{
"epoch": 0.13943159137548142,
"grad_norm": 0.3125,
"learning_rate": 1.976237808097206e-06,
"loss": 0.11592028141021729,
"step": 930
},
{
"epoch": 0.14093085579887368,
"grad_norm": 0.2158203125,
"learning_rate": 1.975121771868741e-06,
"loss": 0.11567631959915162,
"step": 940
},
{
"epoch": 0.14243012022226595,
"grad_norm": 0.1904296875,
"learning_rate": 1.9739804550428887e-06,
"loss": 0.13639799356460572,
"step": 950
},
{
"epoch": 0.14392938464565821,
"grad_norm": 0.349609375,
"learning_rate": 1.9728138872089495e-06,
"loss": 0.12592445611953734,
"step": 960
},
{
"epoch": 0.1454286490690505,
"grad_norm": 0.1826171875,
"learning_rate": 1.9716220986108715e-06,
"loss": 0.12377442121505737,
"step": 970
},
{
"epoch": 0.14692791349244277,
"grad_norm": 0.2578125,
"learning_rate": 1.9704051201464644e-06,
"loss": 0.14418370723724366,
"step": 980
},
{
"epoch": 0.14842717791583504,
"grad_norm": 0.2109375,
"learning_rate": 1.9691629833666016e-06,
"loss": 0.1573760986328125,
"step": 990
},
{
"epoch": 0.1499264423392273,
"grad_norm": 0.275390625,
"learning_rate": 1.9678957204743986e-06,
"loss": 0.1386464238166809,
"step": 1000
},
{
"epoch": 0.1514257067626196,
"grad_norm": 0.287109375,
"learning_rate": 1.966603364324381e-06,
"loss": 0.13971794843673707,
"step": 1010
},
{
"epoch": 0.15292497118601187,
"grad_norm": 0.1669921875,
"learning_rate": 1.965285948421631e-06,
"loss": 0.13169209957122802,
"step": 1020
},
{
"epoch": 0.15442423560940413,
"grad_norm": 0.31640625,
"learning_rate": 1.963943506920921e-06,
"loss": 0.1507979989051819,
"step": 1030
},
{
"epoch": 0.1559235000327964,
"grad_norm": 0.1748046875,
"learning_rate": 1.962576074625824e-06,
"loss": 0.11561447381973267,
"step": 1040
},
{
"epoch": 0.1574227644561887,
"grad_norm": 0.25390625,
"learning_rate": 1.961183686987816e-06,
"loss": 0.14605475664138795,
"step": 1050
},
{
"epoch": 0.15892202887958096,
"grad_norm": 0.421875,
"learning_rate": 1.9597663801053534e-06,
"loss": 0.13819440603256225,
"step": 1060
},
{
"epoch": 0.16042129330297322,
"grad_norm": 0.16015625,
"learning_rate": 1.9583241907229395e-06,
"loss": 0.14112586975097657,
"step": 1070
},
{
"epoch": 0.1619205577263655,
"grad_norm": 0.158203125,
"learning_rate": 1.95685715623017e-06,
"loss": 0.1168364405632019,
"step": 1080
},
{
"epoch": 0.16341982214975778,
"grad_norm": 0.390625,
"learning_rate": 1.955365314660765e-06,
"loss": 0.11267675161361694,
"step": 1090
},
{
"epoch": 0.16491908657315005,
"grad_norm": 0.30078125,
"learning_rate": 1.9538487046915824e-06,
"loss": 0.12178796529769897,
"step": 1100
},
{
"epoch": 0.1664183509965423,
"grad_norm": 0.2353515625,
"learning_rate": 1.952307365641615e-06,
"loss": 0.10850706100463867,
"step": 1110
},
{
"epoch": 0.1679176154199346,
"grad_norm": 0.248046875,
"learning_rate": 1.950741337470971e-06,
"loss": 0.12071930170059204,
"step": 1120
},
{
"epoch": 0.16941687984332687,
"grad_norm": 0.291015625,
"learning_rate": 1.949150660779839e-06,
"loss": 0.12768586874008178,
"step": 1130
},
{
"epoch": 0.17091614426671914,
"grad_norm": 0.2041015625,
"learning_rate": 1.9475353768074354e-06,
"loss": 0.12366677522659301,
"step": 1140
},
{
"epoch": 0.1724154086901114,
"grad_norm": 0.216796875,
"learning_rate": 1.9458955274309334e-06,
"loss": 0.12472466230392457,
"step": 1150
},
{
"epoch": 0.1739146731135037,
"grad_norm": 0.61328125,
"learning_rate": 1.944231155164378e-06,
"loss": 0.10178214311599731,
"step": 1160
},
{
"epoch": 0.17541393753689596,
"grad_norm": 0.1884765625,
"learning_rate": 1.942542303157587e-06,
"loss": 0.10434643030166627,
"step": 1170
},
{
"epoch": 0.17691320196028823,
"grad_norm": 0.1728515625,
"learning_rate": 1.940829015195027e-06,
"loss": 0.12654454708099366,
"step": 1180
},
{
"epoch": 0.1784124663836805,
"grad_norm": 0.2099609375,
"learning_rate": 1.939091335694682e-06,
"loss": 0.14714936017990113,
"step": 1190
},
{
"epoch": 0.1799117308070728,
"grad_norm": 0.220703125,
"learning_rate": 1.9373293097069006e-06,
"loss": 0.12481101751327514,
"step": 1200
},
{
"epoch": 0.18141099523046506,
"grad_norm": 0.177734375,
"learning_rate": 1.935542982913229e-06,
"loss": 0.126925528049469,
"step": 1210
},
{
"epoch": 0.18291025965385732,
"grad_norm": 0.216796875,
"learning_rate": 1.9337324016252246e-06,
"loss": 0.12335828542709351,
"step": 1220
},
{
"epoch": 0.1844095240772496,
"grad_norm": 0.255859375,
"learning_rate": 1.931897612783257e-06,
"loss": 0.1198701024055481,
"step": 1230
},
{
"epoch": 0.18590878850064188,
"grad_norm": 0.1884765625,
"learning_rate": 1.9300386639552917e-06,
"loss": 0.10855865478515625,
"step": 1240
},
{
"epoch": 0.18740805292403415,
"grad_norm": 0.169921875,
"learning_rate": 1.928155603335654e-06,
"loss": 0.11242524385452271,
"step": 1250
},
{
"epoch": 0.1889073173474264,
"grad_norm": 0.2021484375,
"learning_rate": 1.9262484797437835e-06,
"loss": 0.10338661670684815,
"step": 1260
},
{
"epoch": 0.1904065817708187,
"grad_norm": 0.275390625,
"learning_rate": 1.924317342622964e-06,
"loss": 0.13085209131240844,
"step": 1270
},
{
"epoch": 0.19190584619421097,
"grad_norm": 0.228515625,
"learning_rate": 1.922362242039046e-06,
"loss": 0.13100965023040773,
"step": 1280
},
{
"epoch": 0.19340511061760324,
"grad_norm": 0.318359375,
"learning_rate": 1.920383228679146e-06,
"loss": 0.11286605596542358,
"step": 1290
},
{
"epoch": 0.1949043750409955,
"grad_norm": 0.177734375,
"learning_rate": 1.9183803538503325e-06,
"loss": 0.10787241458892823,
"step": 1300
},
{
"epoch": 0.1964036394643878,
"grad_norm": 0.2041015625,
"learning_rate": 1.916353669478297e-06,
"loss": 0.12694379091262817,
"step": 1310
},
{
"epoch": 0.19790290388778006,
"grad_norm": 0.275390625,
"learning_rate": 1.914303228106007e-06,
"loss": 0.12459377050399781,
"step": 1320
},
{
"epoch": 0.19940216831117233,
"grad_norm": 0.1953125,
"learning_rate": 1.912229082892344e-06,
"loss": 0.11015371084213257,
"step": 1330
},
{
"epoch": 0.2009014327345646,
"grad_norm": 0.166015625,
"learning_rate": 1.910131287610726e-06,
"loss": 0.10224473476409912,
"step": 1340
},
{
"epoch": 0.2024006971579569,
"grad_norm": 0.453125,
"learning_rate": 1.9080098966477114e-06,
"loss": 0.1472551107406616,
"step": 1350
},
{
"epoch": 0.20389996158134915,
"grad_norm": 0.28515625,
"learning_rate": 1.9058649650015913e-06,
"loss": 0.12049105167388915,
"step": 1360
},
{
"epoch": 0.20539922600474142,
"grad_norm": 0.2236328125,
"learning_rate": 1.9036965482809624e-06,
"loss": 0.10829113721847534,
"step": 1370
},
{
"epoch": 0.20689849042813369,
"grad_norm": 0.291015625,
"learning_rate": 1.9015047027032858e-06,
"loss": 0.09630746841430664,
"step": 1380
},
{
"epoch": 0.20839775485152598,
"grad_norm": 0.25,
"learning_rate": 1.8992894850934288e-06,
"loss": 0.10639712810516358,
"step": 1390
},
{
"epoch": 0.20989701927491825,
"grad_norm": 0.251953125,
"learning_rate": 1.8970509528821933e-06,
"loss": 0.1108583927154541,
"step": 1400
},
{
"epoch": 0.2113962836983105,
"grad_norm": 0.267578125,
"learning_rate": 1.8947891641048236e-06,
"loss": 0.1440010905265808,
"step": 1410
},
{
"epoch": 0.21289554812170278,
"grad_norm": 0.2734375,
"learning_rate": 1.8925041773995066e-06,
"loss": 0.11479418277740479,
"step": 1420
},
{
"epoch": 0.21439481254509507,
"grad_norm": 0.30078125,
"learning_rate": 1.8901960520058466e-06,
"loss": 0.1372006893157959,
"step": 1430
},
{
"epoch": 0.21589407696848734,
"grad_norm": 0.130859375,
"learning_rate": 1.8878648477633338e-06,
"loss": 0.1048818826675415,
"step": 1440
},
{
"epoch": 0.2173933413918796,
"grad_norm": 0.2353515625,
"learning_rate": 1.8855106251097893e-06,
"loss": 0.11379430294036866,
"step": 1450
},
{
"epoch": 0.2188926058152719,
"grad_norm": 0.158203125,
"learning_rate": 1.8831334450798008e-06,
"loss": 0.11848256587982178,
"step": 1460
},
{
"epoch": 0.22039187023866416,
"grad_norm": 0.2080078125,
"learning_rate": 1.8807333693031394e-06,
"loss": 0.11757129430770874,
"step": 1470
},
{
"epoch": 0.22189113466205643,
"grad_norm": 0.255859375,
"learning_rate": 1.8783104600031608e-06,
"loss": 0.1077274203300476,
"step": 1480
},
{
"epoch": 0.2233903990854487,
"grad_norm": 0.1875,
"learning_rate": 1.8758647799951936e-06,
"loss": 0.13631620407104492,
"step": 1490
},
{
"epoch": 0.224889663508841,
"grad_norm": 0.1787109375,
"learning_rate": 1.8733963926849108e-06,
"loss": 0.11129487752914428,
"step": 1500
},
{
"epoch": 0.22638892793223325,
"grad_norm": 0.234375,
"learning_rate": 1.870905362066684e-06,
"loss": 0.10358604192733764,
"step": 1510
},
{
"epoch": 0.22788819235562552,
"grad_norm": 0.2275390625,
"learning_rate": 1.8683917527219274e-06,
"loss": 0.10696442127227783,
"step": 1520
},
{
"epoch": 0.22938745677901778,
"grad_norm": 0.244140625,
"learning_rate": 1.86585562981742e-06,
"loss": 0.1079567551612854,
"step": 1530
},
{
"epoch": 0.23088672120241008,
"grad_norm": 0.19140625,
"learning_rate": 1.863297059103619e-06,
"loss": 0.08297246098518371,
"step": 1540
},
{
"epoch": 0.23238598562580234,
"grad_norm": 0.2314453125,
"learning_rate": 1.860716106912954e-06,
"loss": 0.11826142072677612,
"step": 1550
},
{
"epoch": 0.2338852500491946,
"grad_norm": 0.1689453125,
"learning_rate": 1.858112840158107e-06,
"loss": 0.11677643060684204,
"step": 1560
},
{
"epoch": 0.23538451447258688,
"grad_norm": 0.220703125,
"learning_rate": 1.8554873263302783e-06,
"loss": 0.10421488285064698,
"step": 1570
},
{
"epoch": 0.23688377889597917,
"grad_norm": 0.33203125,
"learning_rate": 1.8528396334974364e-06,
"loss": 0.10596433877944947,
"step": 1580
},
{
"epoch": 0.23838304331937143,
"grad_norm": 0.150390625,
"learning_rate": 1.850169830302553e-06,
"loss": 0.09852623343467712,
"step": 1590
},
{
"epoch": 0.2398823077427637,
"grad_norm": 0.2392578125,
"learning_rate": 1.8474779859618245e-06,
"loss": 0.13672434091567992,
"step": 1600
},
{
"epoch": 0.24138157216615597,
"grad_norm": 0.177734375,
"learning_rate": 1.8447641702628762e-06,
"loss": 0.11511225700378418,
"step": 1610
},
{
"epoch": 0.24288083658954826,
"grad_norm": 0.27734375,
"learning_rate": 1.8420284535629539e-06,
"loss": 0.11240946054458618,
"step": 1620
},
{
"epoch": 0.24438010101294053,
"grad_norm": 0.1484375,
"learning_rate": 1.839270906787099e-06,
"loss": 0.07973622083663941,
"step": 1630
},
{
"epoch": 0.2458793654363328,
"grad_norm": 0.23046875,
"learning_rate": 1.8364916014263115e-06,
"loss": 0.10506463050842285,
"step": 1640
},
{
"epoch": 0.24737862985972509,
"grad_norm": 0.287109375,
"learning_rate": 1.8336906095356937e-06,
"loss": 0.1416532278060913,
"step": 1650
},
{
"epoch": 0.24887789428311735,
"grad_norm": 0.294921875,
"learning_rate": 1.830868003732585e-06,
"loss": 0.10021046400070191,
"step": 1660
},
{
"epoch": 0.2503771587065096,
"grad_norm": 0.2158203125,
"learning_rate": 1.8280238571946773e-06,
"loss": 0.09624313712120056,
"step": 1670
},
{
"epoch": 0.2518764231299019,
"grad_norm": 0.19921875,
"learning_rate": 1.8251582436581193e-06,
"loss": 0.09360762238502503,
"step": 1680
},
{
"epoch": 0.25337568755329415,
"grad_norm": 0.248046875,
"learning_rate": 1.8222712374156038e-06,
"loss": 0.10825358629226685,
"step": 1690
},
{
"epoch": 0.2548749519766864,
"grad_norm": 0.2578125,
"learning_rate": 1.8193629133144412e-06,
"loss": 0.09739000201225281,
"step": 1700
},
{
"epoch": 0.25637421640007874,
"grad_norm": 0.2734375,
"learning_rate": 1.8164333467546205e-06,
"loss": 0.13052973747253419,
"step": 1710
},
{
"epoch": 0.257873480823471,
"grad_norm": 0.314453125,
"learning_rate": 1.8134826136868533e-06,
"loss": 0.1281905174255371,
"step": 1720
},
{
"epoch": 0.25937274524686327,
"grad_norm": 0.1611328125,
"learning_rate": 1.810510790610606e-06,
"loss": 0.1224624514579773,
"step": 1730
},
{
"epoch": 0.26087200967025553,
"grad_norm": 0.2236328125,
"learning_rate": 1.8075179545721148e-06,
"loss": 0.11144398450851441,
"step": 1740
},
{
"epoch": 0.2623712740936478,
"grad_norm": 0.3125,
"learning_rate": 1.8045041831623892e-06,
"loss": 0.07502882480621338,
"step": 1750
},
{
"epoch": 0.26387053851704007,
"grad_norm": 0.2578125,
"learning_rate": 1.8014695545152014e-06,
"loss": 0.11576559543609619,
"step": 1760
},
{
"epoch": 0.26536980294043233,
"grad_norm": 0.16015625,
"learning_rate": 1.7984141473050583e-06,
"loss": 0.10232355594635009,
"step": 1770
},
{
"epoch": 0.2668690673638246,
"grad_norm": 0.267578125,
"learning_rate": 1.7953380407451632e-06,
"loss": 0.10430169105529785,
"step": 1780
},
{
"epoch": 0.2683683317872169,
"grad_norm": 0.279296875,
"learning_rate": 1.7922413145853632e-06,
"loss": 0.10129927396774292,
"step": 1790
},
{
"epoch": 0.2698675962106092,
"grad_norm": 0.1787109375,
"learning_rate": 1.7891240491100794e-06,
"loss": 0.1479990601539612,
"step": 1800
},
{
"epoch": 0.27136686063400145,
"grad_norm": 0.265625,
"learning_rate": 1.7859863251362268e-06,
"loss": 0.09153670072555542,
"step": 1810
},
{
"epoch": 0.2728661250573937,
"grad_norm": 0.185546875,
"learning_rate": 1.7828282240111188e-06,
"loss": 0.10302189588546753,
"step": 1820
},
{
"epoch": 0.274365389480786,
"grad_norm": 0.279296875,
"learning_rate": 1.779649827610359e-06,
"loss": 0.10783896446228028,
"step": 1830
},
{
"epoch": 0.27586465390417825,
"grad_norm": 0.2099609375,
"learning_rate": 1.7764512183357161e-06,
"loss": 0.10202981233596801,
"step": 1840
},
{
"epoch": 0.2773639183275705,
"grad_norm": 0.279296875,
"learning_rate": 1.7732324791129914e-06,
"loss": 0.09905132055282592,
"step": 1850
},
{
"epoch": 0.27886318275096283,
"grad_norm": 0.1611328125,
"learning_rate": 1.769993693389865e-06,
"loss": 0.10531445741653442,
"step": 1860
},
{
"epoch": 0.2803624471743551,
"grad_norm": 0.2138671875,
"learning_rate": 1.7667349451337353e-06,
"loss": 0.08846319317817689,
"step": 1870
},
{
"epoch": 0.28186171159774737,
"grad_norm": 0.2392578125,
"learning_rate": 1.7634563188295403e-06,
"loss": 0.0975230872631073,
"step": 1880
},
{
"epoch": 0.28336097602113963,
"grad_norm": 0.31640625,
"learning_rate": 1.7601578994775684e-06,
"loss": 0.09964791536331177,
"step": 1890
},
{
"epoch": 0.2848602404445319,
"grad_norm": 0.2373046875,
"learning_rate": 1.756839772591254e-06,
"loss": 0.1272280693054199,
"step": 1900
},
{
"epoch": 0.28635950486792416,
"grad_norm": 0.353515625,
"learning_rate": 1.7535020241949598e-06,
"loss": 0.11281530857086182,
"step": 1910
},
{
"epoch": 0.28785876929131643,
"grad_norm": 0.19921875,
"learning_rate": 1.7501447408217497e-06,
"loss": 0.12100661993026733,
"step": 1920
},
{
"epoch": 0.2893580337147087,
"grad_norm": 0.16015625,
"learning_rate": 1.7467680095111414e-06,
"loss": 0.10090996026992798,
"step": 1930
},
{
"epoch": 0.290857298138101,
"grad_norm": 0.28125,
"learning_rate": 1.7433719178068524e-06,
"loss": 0.13152073621749877,
"step": 1940
},
{
"epoch": 0.2923565625614933,
"grad_norm": 0.193359375,
"learning_rate": 1.739956553754529e-06,
"loss": 0.09162830114364624,
"step": 1950
},
{
"epoch": 0.29385582698488555,
"grad_norm": 0.396484375,
"learning_rate": 1.7365220058994655e-06,
"loss": 0.1236315131187439,
"step": 1960
},
{
"epoch": 0.2953550914082778,
"grad_norm": 0.2119140625,
"learning_rate": 1.7330683632843059e-06,
"loss": 0.09788467288017273,
"step": 1970
},
{
"epoch": 0.2968543558316701,
"grad_norm": 0.25390625,
"learning_rate": 1.7295957154467382e-06,
"loss": 0.09465370178222657,
"step": 1980
},
{
"epoch": 0.29835362025506235,
"grad_norm": 0.1279296875,
"learning_rate": 1.726104152417171e-06,
"loss": 0.1005245327949524,
"step": 1990
},
{
"epoch": 0.2998528846784546,
"grad_norm": 0.28515625,
"learning_rate": 1.722593764716401e-06,
"loss": 0.11565471887588501,
"step": 2000
},
{
"epoch": 0.30135214910184693,
"grad_norm": 0.1865234375,
"learning_rate": 1.7190646433532644e-06,
"loss": 0.10114152431488037,
"step": 2010
},
{
"epoch": 0.3028514135252392,
"grad_norm": 0.349609375,
"learning_rate": 1.7155168798222789e-06,
"loss": 0.11758486032485962,
"step": 2020
},
{
"epoch": 0.30435067794863147,
"grad_norm": 0.26953125,
"learning_rate": 1.7119505661012718e-06,
"loss": 0.12670440673828126,
"step": 2030
},
{
"epoch": 0.30584994237202373,
"grad_norm": 0.2138671875,
"learning_rate": 1.7083657946489941e-06,
"loss": 0.09111065268516541,
"step": 2040
},
{
"epoch": 0.307349206795416,
"grad_norm": 0.244140625,
"learning_rate": 1.7047626584027248e-06,
"loss": 0.10659761428833008,
"step": 2050
},
{
"epoch": 0.30884847121880826,
"grad_norm": 0.203125,
"learning_rate": 1.7011412507758598e-06,
"loss": 0.09141663908958435,
"step": 2060
},
{
"epoch": 0.31034773564220053,
"grad_norm": 0.2158203125,
"learning_rate": 1.6975016656554924e-06,
"loss": 0.1156761646270752,
"step": 2070
},
{
"epoch": 0.3118470000655928,
"grad_norm": 0.1796875,
"learning_rate": 1.693843997399977e-06,
"loss": 0.1171414852142334,
"step": 2080
},
{
"epoch": 0.3133462644889851,
"grad_norm": 0.158203125,
"learning_rate": 1.690168340836484e-06,
"loss": 0.10372446775436402,
"step": 2090
},
{
"epoch": 0.3148455289123774,
"grad_norm": 0.3515625,
"learning_rate": 1.6864747912585416e-06,
"loss": 0.11128904819488525,
"step": 2100
},
{
"epoch": 0.31634479333576965,
"grad_norm": 0.2138671875,
"learning_rate": 1.6827634444235643e-06,
"loss": 0.11956160068511963,
"step": 2110
},
{
"epoch": 0.3178440577591619,
"grad_norm": 0.1640625,
"learning_rate": 1.6790343965503709e-06,
"loss": 0.08641130924224853,
"step": 2120
},
{
"epoch": 0.3193433221825542,
"grad_norm": 0.318359375,
"learning_rate": 1.67528774431669e-06,
"loss": 0.11216531991958618,
"step": 2130
},
{
"epoch": 0.32084258660594644,
"grad_norm": 0.2890625,
"learning_rate": 1.6715235848566533e-06,
"loss": 0.09440256357192993,
"step": 2140
},
{
"epoch": 0.3223418510293387,
"grad_norm": 0.2373046875,
"learning_rate": 1.6677420157582774e-06,
"loss": 0.08534490466117858,
"step": 2150
},
{
"epoch": 0.323841115452731,
"grad_norm": 0.30859375,
"learning_rate": 1.663943135060934e-06,
"loss": 0.0956838846206665,
"step": 2160
},
{
"epoch": 0.3253403798761233,
"grad_norm": 0.1767578125,
"learning_rate": 1.6601270412528084e-06,
"loss": 0.1049761414527893,
"step": 2170
},
{
"epoch": 0.32683964429951556,
"grad_norm": 0.189453125,
"learning_rate": 1.6562938332683454e-06,
"loss": 0.10431164503097534,
"step": 2180
},
{
"epoch": 0.32833890872290783,
"grad_norm": 0.169921875,
"learning_rate": 1.6524436104856845e-06,
"loss": 0.09506284594535827,
"step": 2190
},
{
"epoch": 0.3298381731463001,
"grad_norm": 0.23828125,
"learning_rate": 1.648576472724084e-06,
"loss": 0.1192029595375061,
"step": 2200
},
{
"epoch": 0.33133743756969236,
"grad_norm": 0.201171875,
"learning_rate": 1.6446925202413331e-06,
"loss": 0.09638182520866394,
"step": 2210
},
{
"epoch": 0.3328367019930846,
"grad_norm": 0.19921875,
"learning_rate": 1.640791853731152e-06,
"loss": 0.090701824426651,
"step": 2220
},
{
"epoch": 0.3343359664164769,
"grad_norm": 0.220703125,
"learning_rate": 1.6368745743205821e-06,
"loss": 0.09149349331855774,
"step": 2230
},
{
"epoch": 0.3358352308398692,
"grad_norm": 0.310546875,
"learning_rate": 1.6329407835673635e-06,
"loss": 0.13018569946289063,
"step": 2240
},
{
"epoch": 0.3373344952632615,
"grad_norm": 0.296875,
"learning_rate": 1.628990583457302e-06,
"loss": 0.1057326078414917,
"step": 2250
},
{
"epoch": 0.33883375968665375,
"grad_norm": 0.212890625,
"learning_rate": 1.6250240764016272e-06,
"loss": 0.1026038646697998,
"step": 2260
},
{
"epoch": 0.340333024110046,
"grad_norm": 0.32421875,
"learning_rate": 1.6210413652343338e-06,
"loss": 0.08930633664131164,
"step": 2270
},
{
"epoch": 0.3418322885334383,
"grad_norm": 0.38671875,
"learning_rate": 1.6170425532095187e-06,
"loss": 0.10358338356018067,
"step": 2280
},
{
"epoch": 0.34333155295683054,
"grad_norm": 0.271484375,
"learning_rate": 1.6130277439987022e-06,
"loss": 0.09695777893066407,
"step": 2290
},
{
"epoch": 0.3448308173802228,
"grad_norm": 0.296875,
"learning_rate": 1.6089970416881414e-06,
"loss": 0.10922973155975342,
"step": 2300
},
{
"epoch": 0.3463300818036151,
"grad_norm": 0.3046875,
"learning_rate": 1.6049505507761309e-06,
"loss": 0.10175033807754516,
"step": 2310
},
{
"epoch": 0.3478293462270074,
"grad_norm": 0.2890625,
"learning_rate": 1.600888376170294e-06,
"loss": 0.10103652477264405,
"step": 2320
},
{
"epoch": 0.34932861065039966,
"grad_norm": 0.1904296875,
"learning_rate": 1.5968106231848632e-06,
"loss": 0.07333493828773499,
"step": 2330
},
{
"epoch": 0.35082787507379193,
"grad_norm": 0.1875,
"learning_rate": 1.5927173975379488e-06,
"loss": 0.08524224758148194,
"step": 2340
},
{
"epoch": 0.3523271394971842,
"grad_norm": 0.220703125,
"learning_rate": 1.5886088053488e-06,
"loss": 0.09646062850952149,
"step": 2350
},
{
"epoch": 0.35382640392057646,
"grad_norm": 0.265625,
"learning_rate": 1.584484953135051e-06,
"loss": 0.0860047996044159,
"step": 2360
},
{
"epoch": 0.3553256683439687,
"grad_norm": 0.177734375,
"learning_rate": 1.580345947809962e-06,
"loss": 0.09231213331222535,
"step": 2370
},
{
"epoch": 0.356824932767361,
"grad_norm": 0.1845703125,
"learning_rate": 1.5761918966796462e-06,
"loss": 0.08510161638259887,
"step": 2380
},
{
"epoch": 0.3583241971907533,
"grad_norm": 0.171875,
"learning_rate": 1.5720229074402883e-06,
"loss": 0.10984573364257813,
"step": 2390
},
{
"epoch": 0.3598234616141456,
"grad_norm": 0.26171875,
"learning_rate": 1.5678390881753512e-06,
"loss": 0.11594033241271973,
"step": 2400
},
{
"epoch": 0.36132272603753784,
"grad_norm": 0.376953125,
"learning_rate": 1.5636405473527763e-06,
"loss": 0.09002584218978882,
"step": 2410
},
{
"epoch": 0.3628219904609301,
"grad_norm": 0.216796875,
"learning_rate": 1.5594273938221683e-06,
"loss": 0.09397087097167969,
"step": 2420
},
{
"epoch": 0.3643212548843224,
"grad_norm": 0.3515625,
"learning_rate": 1.5551997368119758e-06,
"loss": 0.10535862445831298,
"step": 2430
},
{
"epoch": 0.36582051930771464,
"grad_norm": 0.1787109375,
"learning_rate": 1.5509576859266589e-06,
"loss": 0.09418719410896301,
"step": 2440
},
{
"epoch": 0.3673197837311069,
"grad_norm": 0.19921875,
"learning_rate": 1.5467013511438455e-06,
"loss": 0.10402942895889282,
"step": 2450
},
{
"epoch": 0.3688190481544992,
"grad_norm": 0.171875,
"learning_rate": 1.5424308428114842e-06,
"loss": 0.09072368144989014,
"step": 2460
},
{
"epoch": 0.3703183125778915,
"grad_norm": 0.2177734375,
"learning_rate": 1.5381462716449793e-06,
"loss": 0.12782552242279052,
"step": 2470
},
{
"epoch": 0.37181757700128376,
"grad_norm": 0.271484375,
"learning_rate": 1.5338477487243229e-06,
"loss": 0.12468627691268921,
"step": 2480
},
{
"epoch": 0.373316841424676,
"grad_norm": 0.19921875,
"learning_rate": 1.5295353854912142e-06,
"loss": 0.08745025396347046,
"step": 2490
},
{
"epoch": 0.3748161058480683,
"grad_norm": 0.177734375,
"learning_rate": 1.5252092937461708e-06,
"loss": 0.11175857782363892,
"step": 2500
},
{
"epoch": 0.37631537027146056,
"grad_norm": 0.2412109375,
"learning_rate": 1.52086958564563e-06,
"loss": 0.09319526553153992,
"step": 2510
},
{
"epoch": 0.3778146346948528,
"grad_norm": 0.2890625,
"learning_rate": 1.5165163736990402e-06,
"loss": 0.09921846985816955,
"step": 2520
},
{
"epoch": 0.3793138991182451,
"grad_norm": 0.228515625,
"learning_rate": 1.5121497707659459e-06,
"loss": 0.13923016786575318,
"step": 2530
},
{
"epoch": 0.3808131635416374,
"grad_norm": 0.177734375,
"learning_rate": 1.5077698900530605e-06,
"loss": 0.09786847829818726,
"step": 2540
},
{
"epoch": 0.3823124279650297,
"grad_norm": 0.185546875,
"learning_rate": 1.5033768451113309e-06,
"loss": 0.09633988738059998,
"step": 2550
},
{
"epoch": 0.38381169238842194,
"grad_norm": 0.2578125,
"learning_rate": 1.4989707498329943e-06,
"loss": 0.14051291942596436,
"step": 2560
},
{
"epoch": 0.3853109568118142,
"grad_norm": 0.1962890625,
"learning_rate": 1.4945517184486266e-06,
"loss": 0.09283372163772582,
"step": 2570
},
{
"epoch": 0.3868102212352065,
"grad_norm": 0.18359375,
"learning_rate": 1.4901198655241784e-06,
"loss": 0.09845755696296692,
"step": 2580
},
{
"epoch": 0.38830948565859874,
"grad_norm": 0.2216796875,
"learning_rate": 1.4856753059580065e-06,
"loss": 0.09300137758255005,
"step": 2590
},
{
"epoch": 0.389808750081991,
"grad_norm": 0.2138671875,
"learning_rate": 1.4812181549778956e-06,
"loss": 0.0833775520324707,
"step": 2600
},
{
"epoch": 0.3913080145053833,
"grad_norm": 0.24609375,
"learning_rate": 1.4767485281380694e-06,
"loss": 0.09278824925422668,
"step": 2610
},
{
"epoch": 0.3928072789287756,
"grad_norm": 0.2353515625,
"learning_rate": 1.4722665413161948e-06,
"loss": 0.09754594564437866,
"step": 2620
},
{
"epoch": 0.39430654335216786,
"grad_norm": 0.26953125,
"learning_rate": 1.46777231071038e-06,
"loss": 0.1008460521697998,
"step": 2630
},
{
"epoch": 0.3958058077755601,
"grad_norm": 0.28515625,
"learning_rate": 1.4632659528361591e-06,
"loss": 0.0745955765247345,
"step": 2640
},
{
"epoch": 0.3973050721989524,
"grad_norm": 0.2470703125,
"learning_rate": 1.4587475845234729e-06,
"loss": 0.11444522142410278,
"step": 2650
},
{
"epoch": 0.39880433662234466,
"grad_norm": 0.23046875,
"learning_rate": 1.454217322913641e-06,
"loss": 0.09638299942016601,
"step": 2660
},
{
"epoch": 0.4003036010457369,
"grad_norm": 0.16015625,
"learning_rate": 1.4496752854563217e-06,
"loss": 0.0774892508983612,
"step": 2670
},
{
"epoch": 0.4018028654691292,
"grad_norm": 0.263671875,
"learning_rate": 1.4451215899064699e-06,
"loss": 0.10078433752059937,
"step": 2680
},
{
"epoch": 0.40330212989252145,
"grad_norm": 0.2001953125,
"learning_rate": 1.4405563543212841e-06,
"loss": 0.0878619134426117,
"step": 2690
},
{
"epoch": 0.4048013943159138,
"grad_norm": 0.1982421875,
"learning_rate": 1.4359796970571434e-06,
"loss": 0.08299956321716309,
"step": 2700
},
{
"epoch": 0.40630065873930604,
"grad_norm": 0.244140625,
"learning_rate": 1.4313917367665414e-06,
"loss": 0.11845102310180664,
"step": 2710
},
{
"epoch": 0.4077999231626983,
"grad_norm": 0.26953125,
"learning_rate": 1.4267925923950094e-06,
"loss": 0.1439320921897888,
"step": 2720
},
{
"epoch": 0.4092991875860906,
"grad_norm": 0.248046875,
"learning_rate": 1.422182383178032e-06,
"loss": 0.09109203219413757,
"step": 2730
},
{
"epoch": 0.41079845200948284,
"grad_norm": 0.2099609375,
"learning_rate": 1.4175612286379562e-06,
"loss": 0.07972334623336792,
"step": 2740
},
{
"epoch": 0.4122977164328751,
"grad_norm": 0.1748046875,
"learning_rate": 1.412929248580894e-06,
"loss": 0.08981594443321228,
"step": 2750
},
{
"epoch": 0.41379698085626737,
"grad_norm": 0.201171875,
"learning_rate": 1.4082865630936134e-06,
"loss": 0.10788861513137818,
"step": 2760
},
{
"epoch": 0.4152962452796597,
"grad_norm": 0.251953125,
"learning_rate": 1.4036332925404283e-06,
"loss": 0.08774803280830383,
"step": 2770
},
{
"epoch": 0.41679550970305196,
"grad_norm": 0.1806640625,
"learning_rate": 1.3989695575600763e-06,
"loss": 0.0800628900527954,
"step": 2780
},
{
"epoch": 0.4182947741264442,
"grad_norm": 0.216796875,
"learning_rate": 1.3942954790625904e-06,
"loss": 0.11887997388839722,
"step": 2790
},
{
"epoch": 0.4197940385498365,
"grad_norm": 0.1650390625,
"learning_rate": 1.3896111782261668e-06,
"loss": 0.09116448163986206,
"step": 2800
},
{
"epoch": 0.42129330297322876,
"grad_norm": 0.400390625,
"learning_rate": 1.3849167764940211e-06,
"loss": 0.11099686622619628,
"step": 2810
},
{
"epoch": 0.422792567396621,
"grad_norm": 0.228515625,
"learning_rate": 1.38021239557124e-06,
"loss": 0.09188846349716187,
"step": 2820
},
{
"epoch": 0.4242918318200133,
"grad_norm": 0.1748046875,
"learning_rate": 1.3754981574216267e-06,
"loss": 0.09292811751365662,
"step": 2830
},
{
"epoch": 0.42579109624340555,
"grad_norm": 0.30078125,
"learning_rate": 1.3707741842645392e-06,
"loss": 0.0990601897239685,
"step": 2840
},
{
"epoch": 0.4272903606667979,
"grad_norm": 0.21875,
"learning_rate": 1.3660405985717212e-06,
"loss": 0.0773146092891693,
"step": 2850
},
{
"epoch": 0.42878962509019014,
"grad_norm": 0.224609375,
"learning_rate": 1.361297523064126e-06,
"loss": 0.09871623516082764,
"step": 2860
},
{
"epoch": 0.4302888895135824,
"grad_norm": 0.224609375,
"learning_rate": 1.3565450807087373e-06,
"loss": 0.09449006915092469,
"step": 2870
},
{
"epoch": 0.4317881539369747,
"grad_norm": 0.2265625,
"learning_rate": 1.3517833947153782e-06,
"loss": 0.09626795053482055,
"step": 2880
},
{
"epoch": 0.43328741836036694,
"grad_norm": 0.26953125,
"learning_rate": 1.34701258853352e-06,
"loss": 0.07917786836624145,
"step": 2890
},
{
"epoch": 0.4347866827837592,
"grad_norm": 0.2578125,
"learning_rate": 1.3422327858490792e-06,
"loss": 0.10537385940551758,
"step": 2900
},
{
"epoch": 0.43628594720715147,
"grad_norm": 0.1923828125,
"learning_rate": 1.337444110581212e-06,
"loss": 0.07042791247367859,
"step": 2910
},
{
"epoch": 0.4377852116305438,
"grad_norm": 0.2119140625,
"learning_rate": 1.3326466868791013e-06,
"loss": 0.0855652630329132,
"step": 2920
},
{
"epoch": 0.43928447605393606,
"grad_norm": 0.205078125,
"learning_rate": 1.3278406391187391e-06,
"loss": 0.09092465043067932,
"step": 2930
},
{
"epoch": 0.4407837404773283,
"grad_norm": 0.216796875,
"learning_rate": 1.3230260918997004e-06,
"loss": 0.10829230546951293,
"step": 2940
},
{
"epoch": 0.4422830049007206,
"grad_norm": 0.31640625,
"learning_rate": 1.3182031700419129e-06,
"loss": 0.09212432503700256,
"step": 2950
},
{
"epoch": 0.44378226932411285,
"grad_norm": 0.1708984375,
"learning_rate": 1.3133719985824237e-06,
"loss": 0.06796190738677979,
"step": 2960
},
{
"epoch": 0.4452815337475051,
"grad_norm": 0.2080078125,
"learning_rate": 1.3085327027721536e-06,
"loss": 0.08660737872123718,
"step": 2970
},
{
"epoch": 0.4467807981708974,
"grad_norm": 0.1943359375,
"learning_rate": 1.3036854080726525e-06,
"loss": 0.07199004888534546,
"step": 2980
},
{
"epoch": 0.44828006259428965,
"grad_norm": 0.197265625,
"learning_rate": 1.298830240152847e-06,
"loss": 0.11634057760238647,
"step": 2990
},
{
"epoch": 0.449779327017682,
"grad_norm": 0.240234375,
"learning_rate": 1.2939673248857805e-06,
"loss": 0.11802215576171875,
"step": 3000
},
{
"epoch": 0.45127859144107424,
"grad_norm": 0.21875,
"learning_rate": 1.2890967883453509e-06,
"loss": 0.10256350040435791,
"step": 3010
},
{
"epoch": 0.4527778558644665,
"grad_norm": 0.2470703125,
"learning_rate": 1.2842187568030431e-06,
"loss": 0.08822081089019776,
"step": 3020
},
{
"epoch": 0.45427712028785877,
"grad_norm": 0.205078125,
"learning_rate": 1.2793333567246526e-06,
"loss": 0.08067854046821595,
"step": 3030
},
{
"epoch": 0.45577638471125104,
"grad_norm": 0.2099609375,
"learning_rate": 1.2744407147670098e-06,
"loss": 0.09741014242172241,
"step": 3040
},
{
"epoch": 0.4572756491346433,
"grad_norm": 0.234375,
"learning_rate": 1.269540957774695e-06,
"loss": 0.07846143245697021,
"step": 3050
},
{
"epoch": 0.45877491355803557,
"grad_norm": 0.353515625,
"learning_rate": 1.2646342127767486e-06,
"loss": 0.10557938814163208,
"step": 3060
},
{
"epoch": 0.46027417798142783,
"grad_norm": 0.251953125,
"learning_rate": 1.2597206069833805e-06,
"loss": 0.0840741217136383,
"step": 3070
},
{
"epoch": 0.46177344240482016,
"grad_norm": 0.205078125,
"learning_rate": 1.2548002677826704e-06,
"loss": 0.09562651515007019,
"step": 3080
},
{
"epoch": 0.4632727068282124,
"grad_norm": 0.2236328125,
"learning_rate": 1.2498733227372648e-06,
"loss": 0.09925270080566406,
"step": 3090
},
{
"epoch": 0.4647719712516047,
"grad_norm": 0.2255859375,
"learning_rate": 1.2449398995810709e-06,
"loss": 0.10337086915969848,
"step": 3100
},
{
"epoch": 0.46627123567499695,
"grad_norm": 0.3671875,
"learning_rate": 1.2400001262159458e-06,
"loss": 0.07978419065475464,
"step": 3110
},
{
"epoch": 0.4677705000983892,
"grad_norm": 0.30859375,
"learning_rate": 1.2350541307083776e-06,
"loss": 0.07110666632652282,
"step": 3120
},
{
"epoch": 0.4692697645217815,
"grad_norm": 0.2197265625,
"learning_rate": 1.2301020412861675e-06,
"loss": 0.07428762912750245,
"step": 3130
},
{
"epoch": 0.47076902894517375,
"grad_norm": 0.36328125,
"learning_rate": 1.2251439863351068e-06,
"loss": 0.09102022051811218,
"step": 3140
},
{
"epoch": 0.47226829336856607,
"grad_norm": 0.302734375,
"learning_rate": 1.220180094395644e-06,
"loss": 0.08342552185058594,
"step": 3150
},
{
"epoch": 0.47376755779195834,
"grad_norm": 0.24609375,
"learning_rate": 1.2152104941595562e-06,
"loss": 0.12274667024612426,
"step": 3160
},
{
"epoch": 0.4752668222153506,
"grad_norm": 0.19921875,
"learning_rate": 1.2102353144666117e-06,
"loss": 0.09014168381690979,
"step": 3170
},
{
"epoch": 0.47676608663874287,
"grad_norm": 0.17578125,
"learning_rate": 1.205254684301229e-06,
"loss": 0.07782111167907715,
"step": 3180
},
{
"epoch": 0.47826535106213514,
"grad_norm": 0.2001953125,
"learning_rate": 1.2002687327891328e-06,
"loss": 0.07985667586326599,
"step": 3190
},
{
"epoch": 0.4797646154855274,
"grad_norm": 0.2578125,
"learning_rate": 1.1952775891940082e-06,
"loss": 0.09129717350006103,
"step": 3200
},
{
"epoch": 0.48126387990891967,
"grad_norm": 0.234375,
"learning_rate": 1.190281382914146e-06,
"loss": 0.1002733588218689,
"step": 3210
},
{
"epoch": 0.48276314433231193,
"grad_norm": 0.23046875,
"learning_rate": 1.185280243479092e-06,
"loss": 0.08630979657173157,
"step": 3220
},
{
"epoch": 0.48426240875570425,
"grad_norm": 0.1982421875,
"learning_rate": 1.1802743005462862e-06,
"loss": 0.08386391997337342,
"step": 3230
},
{
"epoch": 0.4857616731790965,
"grad_norm": 0.23828125,
"learning_rate": 1.1752636838977013e-06,
"loss": 0.08188863396644593,
"step": 3240
},
{
"epoch": 0.4872609376024888,
"grad_norm": 0.298828125,
"learning_rate": 1.1702485234364797e-06,
"loss": 0.10928175449371338,
"step": 3250
},
{
"epoch": 0.48876020202588105,
"grad_norm": 0.1923828125,
"learning_rate": 1.165228949183565e-06,
"loss": 0.09540101885795593,
"step": 3260
},
{
"epoch": 0.4902594664492733,
"grad_norm": 0.2265625,
"learning_rate": 1.16020509127433e-06,
"loss": 0.092869633436203,
"step": 3270
},
{
"epoch": 0.4917587308726656,
"grad_norm": 0.259765625,
"learning_rate": 1.1551770799552039e-06,
"loss": 0.09745745658874512,
"step": 3280
},
{
"epoch": 0.49325799529605785,
"grad_norm": 0.19921875,
"learning_rate": 1.1501450455802968e-06,
"loss": 0.09029659032821655,
"step": 3290
},
{
"epoch": 0.49475725971945017,
"grad_norm": 0.228515625,
"learning_rate": 1.145109118608017e-06,
"loss": 0.09824432134628296,
"step": 3300
},
{
"epoch": 0.49625652414284244,
"grad_norm": 0.26171875,
"learning_rate": 1.1400694295976915e-06,
"loss": 0.08436204195022583,
"step": 3310
},
{
"epoch": 0.4977557885662347,
"grad_norm": 0.2158203125,
"learning_rate": 1.135026109206181e-06,
"loss": 0.10501574277877808,
"step": 3320
},
{
"epoch": 0.49925505298962697,
"grad_norm": 0.337890625,
"learning_rate": 1.1299792881844906e-06,
"loss": 0.09339694380760193,
"step": 3330
},
{
"epoch": 0.5007543174130192,
"grad_norm": 0.224609375,
"learning_rate": 1.1249290973743814e-06,
"loss": 0.07747515439987182,
"step": 3340
},
{
"epoch": 0.5022535818364116,
"grad_norm": 0.232421875,
"learning_rate": 1.1198756677049796e-06,
"loss": 0.09033283591270447,
"step": 3350
},
{
"epoch": 0.5037528462598038,
"grad_norm": 0.234375,
"learning_rate": 1.1148191301893795e-06,
"loss": 0.06604780554771424,
"step": 3360
},
{
"epoch": 0.5052521106831961,
"grad_norm": 0.220703125,
"learning_rate": 1.1097596159212475e-06,
"loss": 0.08669602274894714,
"step": 3370
},
{
"epoch": 0.5067513751065883,
"grad_norm": 0.255859375,
"learning_rate": 1.104697256071426e-06,
"loss": 0.11573494672775268,
"step": 3380
},
{
"epoch": 0.5082506395299806,
"grad_norm": 0.255859375,
"learning_rate": 1.0996321818845294e-06,
"loss": 0.09091781973838806,
"step": 3390
},
{
"epoch": 0.5097499039533728,
"grad_norm": 0.244140625,
"learning_rate": 1.0945645246755424e-06,
"loss": 0.0938392698764801,
"step": 3400
},
{
"epoch": 0.5112491683767652,
"grad_norm": 0.2158203125,
"learning_rate": 1.089494415826418e-06,
"loss": 0.08227325677871704,
"step": 3410
},
{
"epoch": 0.5127484328001575,
"grad_norm": 0.2138671875,
"learning_rate": 1.084421986782667e-06,
"loss": 0.07320802211761475,
"step": 3420
},
{
"epoch": 0.5142476972235497,
"grad_norm": 0.1953125,
"learning_rate": 1.079347369049954e-06,
"loss": 0.08411517143249511,
"step": 3430
},
{
"epoch": 0.515746961646942,
"grad_norm": 0.2451171875,
"learning_rate": 1.0742706941906873e-06,
"loss": 0.1013220191001892,
"step": 3440
},
{
"epoch": 0.5172462260703342,
"grad_norm": 0.2255859375,
"learning_rate": 1.0691920938206052e-06,
"loss": 0.08412815928459168,
"step": 3450
},
{
"epoch": 0.5187454904937265,
"grad_norm": 0.21484375,
"learning_rate": 1.0641116996053678e-06,
"loss": 0.08085081577301026,
"step": 3460
},
{
"epoch": 0.5202447549171187,
"grad_norm": 0.291015625,
"learning_rate": 1.0590296432571414e-06,
"loss": 0.08313990831375122,
"step": 3470
},
{
"epoch": 0.5217440193405111,
"grad_norm": 0.275390625,
"learning_rate": 1.0539460565311836e-06,
"loss": 0.0919266939163208,
"step": 3480
},
{
"epoch": 0.5232432837639034,
"grad_norm": 0.2470703125,
"learning_rate": 1.048861071222428e-06,
"loss": 0.09890375733375549,
"step": 3490
},
{
"epoch": 0.5247425481872956,
"grad_norm": 0.2451171875,
"learning_rate": 1.0437748191620678e-06,
"loss": 0.08521285653114319,
"step": 3500
},
{
"epoch": 0.5262418126106879,
"grad_norm": 0.2275390625,
"learning_rate": 1.0386874322141365e-06,
"loss": 0.08201659321784974,
"step": 3510
},
{
"epoch": 0.5277410770340801,
"grad_norm": 0.419921875,
"learning_rate": 1.0335990422720908e-06,
"loss": 0.08876433968544006,
"step": 3520
},
{
"epoch": 0.5292403414574725,
"grad_norm": 0.2099609375,
"learning_rate": 1.0285097812553916e-06,
"loss": 0.08933233618736267,
"step": 3530
},
{
"epoch": 0.5307396058808647,
"grad_norm": 0.240234375,
"learning_rate": 1.0234197811060808e-06,
"loss": 0.07142494320869446,
"step": 3540
},
{
"epoch": 0.532238870304257,
"grad_norm": 0.220703125,
"learning_rate": 1.0183291737853636e-06,
"loss": 0.07216275334358216,
"step": 3550
},
{
"epoch": 0.5337381347276492,
"grad_norm": 0.2353515625,
"learning_rate": 1.0132380912701884e-06,
"loss": 0.09240591526031494,
"step": 3560
},
{
"epoch": 0.5352373991510415,
"grad_norm": 0.1962890625,
"learning_rate": 1.0081466655498198e-06,
"loss": 0.08051929473876954,
"step": 3570
},
{
"epoch": 0.5367366635744338,
"grad_norm": 0.2451171875,
"learning_rate": 1.0030550286224228e-06,
"loss": 0.06649044156074524,
"step": 3580
},
{
"epoch": 0.538235927997826,
"grad_norm": 0.2158203125,
"learning_rate": 9.979633124916373e-07,
"loss": 0.09150764346122742,
"step": 3590
},
{
"epoch": 0.5397351924212184,
"grad_norm": 0.212890625,
"learning_rate": 9.928716491631568e-07,
"loss": 0.09035595655441284,
"step": 3600
},
{
"epoch": 0.5412344568446106,
"grad_norm": 0.1806640625,
"learning_rate": 9.877801706413051e-07,
"loss": 0.09294023513793945,
"step": 3610
},
{
"epoch": 0.5427337212680029,
"grad_norm": 0.2265625,
"learning_rate": 9.826890089256157e-07,
"loss": 0.1178174376487732,
"step": 3620
},
{
"epoch": 0.5442329856913951,
"grad_norm": 0.2490234375,
"learning_rate": 9.775982960074077e-07,
"loss": 0.10003062486648559,
"step": 3630
},
{
"epoch": 0.5457322501147874,
"grad_norm": 0.333984375,
"learning_rate": 9.725081638663661e-07,
"loss": 0.10663024187088013,
"step": 3640
},
{
"epoch": 0.5472315145381798,
"grad_norm": 0.2421875,
"learning_rate": 9.674187444671184e-07,
"loss": 0.09378329515457154,
"step": 3650
},
{
"epoch": 0.548730778961572,
"grad_norm": 0.244140625,
"learning_rate": 9.623301697558134e-07,
"loss": 0.0637846291065216,
"step": 3660
},
{
"epoch": 0.5502300433849643,
"grad_norm": 0.185546875,
"learning_rate": 9.572425716567015e-07,
"loss": 0.0605103075504303,
"step": 3670
},
{
"epoch": 0.5517293078083565,
"grad_norm": 0.2236328125,
"learning_rate": 9.521560820687135e-07,
"loss": 0.09556649327278137,
"step": 3680
},
{
"epoch": 0.5532285722317488,
"grad_norm": 0.24609375,
"learning_rate": 9.470708328620413e-07,
"loss": 0.09757782220840454,
"step": 3690
},
{
"epoch": 0.554727836655141,
"grad_norm": 0.197265625,
"learning_rate": 9.419869558747198e-07,
"loss": 0.09097603559494019,
"step": 3700
},
{
"epoch": 0.5562271010785333,
"grad_norm": 0.234375,
"learning_rate": 9.369045829092076e-07,
"loss": 0.089606112241745,
"step": 3710
},
{
"epoch": 0.5577263655019257,
"grad_norm": 0.2158203125,
"learning_rate": 9.318238457289711e-07,
"loss": 0.09462766051292419,
"step": 3720
},
{
"epoch": 0.5592256299253179,
"grad_norm": 0.1513671875,
"learning_rate": 9.267448760550683e-07,
"loss": 0.06713712811470032,
"step": 3730
},
{
"epoch": 0.5607248943487102,
"grad_norm": 0.2109375,
"learning_rate": 9.216678055627325e-07,
"loss": 0.08841444849967957,
"step": 3740
},
{
"epoch": 0.5622241587721024,
"grad_norm": 0.2373046875,
"learning_rate": 9.165927658779603e-07,
"loss": 0.07210164666175842,
"step": 3750
},
{
"epoch": 0.5637234231954947,
"grad_norm": 0.2373046875,
"learning_rate": 9.11519888574099e-07,
"loss": 0.09946097731590271,
"step": 3760
},
{
"epoch": 0.5652226876188869,
"grad_norm": 0.2373046875,
"learning_rate": 9.064493051684341e-07,
"loss": 0.07101974487304688,
"step": 3770
},
{
"epoch": 0.5667219520422793,
"grad_norm": 0.2236328125,
"learning_rate": 9.013811471187807e-07,
"loss": 0.10910413265228272,
"step": 3780
},
{
"epoch": 0.5682212164656715,
"grad_norm": 0.25,
"learning_rate": 8.963155458200753e-07,
"loss": 0.07558327913284302,
"step": 3790
},
{
"epoch": 0.5697204808890638,
"grad_norm": 0.2392578125,
"learning_rate": 8.912526326009686e-07,
"loss": 0.08378031253814697,
"step": 3800
},
{
"epoch": 0.5712197453124561,
"grad_norm": 0.291015625,
"learning_rate": 8.861925387204217e-07,
"loss": 0.0926354169845581,
"step": 3810
},
{
"epoch": 0.5727190097358483,
"grad_norm": 0.2421875,
"learning_rate": 8.811353953643031e-07,
"loss": 0.0765921413898468,
"step": 3820
},
{
"epoch": 0.5742182741592406,
"grad_norm": 0.197265625,
"learning_rate": 8.760813336419868e-07,
"loss": 0.09550715684890747,
"step": 3830
},
{
"epoch": 0.5757175385826329,
"grad_norm": 0.2109375,
"learning_rate": 8.710304845829533e-07,
"loss": 0.07235878109931945,
"step": 3840
},
{
"epoch": 0.5772168030060252,
"grad_norm": 0.1943359375,
"learning_rate": 8.65982979133394e-07,
"loss": 0.08240407705307007,
"step": 3850
},
{
"epoch": 0.5787160674294174,
"grad_norm": 0.212890625,
"learning_rate": 8.609389481528138e-07,
"loss": 0.0828467309474945,
"step": 3860
},
{
"epoch": 0.5802153318528097,
"grad_norm": 0.2080078125,
"learning_rate": 8.558985224106409e-07,
"loss": 0.06905397176742553,
"step": 3870
},
{
"epoch": 0.581714596276202,
"grad_norm": 0.1953125,
"learning_rate": 8.508618325828361e-07,
"loss": 0.08870742321014405,
"step": 3880
},
{
"epoch": 0.5832138606995942,
"grad_norm": 0.32421875,
"learning_rate": 8.458290092485034e-07,
"loss": 0.08924266099929809,
"step": 3890
},
{
"epoch": 0.5847131251229866,
"grad_norm": 0.265625,
"learning_rate": 8.408001828865064e-07,
"loss": 0.08538001179695129,
"step": 3900
},
{
"epoch": 0.5862123895463788,
"grad_norm": 0.21875,
"learning_rate": 8.357754838720846e-07,
"loss": 0.05365139842033386,
"step": 3910
},
{
"epoch": 0.5877116539697711,
"grad_norm": 0.197265625,
"learning_rate": 8.307550424734735e-07,
"loss": 0.07388515472412109,
"step": 3920
},
{
"epoch": 0.5892109183931633,
"grad_norm": 0.1875,
"learning_rate": 8.257389888485274e-07,
"loss": 0.09646939039230347,
"step": 3930
},
{
"epoch": 0.5907101828165556,
"grad_norm": 0.3046875,
"learning_rate": 8.207274530413457e-07,
"loss": 0.09254279732704163,
"step": 3940
},
{
"epoch": 0.592209447239948,
"grad_norm": 0.2109375,
"learning_rate": 8.157205649789001e-07,
"loss": 0.06844722628593444,
"step": 3950
},
{
"epoch": 0.5937087116633402,
"grad_norm": 0.2080078125,
"learning_rate": 8.107184544676671e-07,
"loss": 0.07432733774185181,
"step": 3960
},
{
"epoch": 0.5952079760867325,
"grad_norm": 0.271484375,
"learning_rate": 8.057212511902623e-07,
"loss": 0.08080208897590638,
"step": 3970
},
{
"epoch": 0.5967072405101247,
"grad_norm": 0.189453125,
"learning_rate": 8.007290847020783e-07,
"loss": 0.10689427852630615,
"step": 3980
},
{
"epoch": 0.598206504933517,
"grad_norm": 0.203125,
"learning_rate": 7.957420844279256e-07,
"loss": 0.0826223611831665,
"step": 3990
},
{
"epoch": 0.5997057693569092,
"grad_norm": 0.330078125,
"learning_rate": 7.907603796586793e-07,
"loss": 0.08745207786560058,
"step": 4000
},
{
"epoch": 0.6012050337803015,
"grad_norm": 0.205078125,
"learning_rate": 7.857840995479237e-07,
"loss": 0.06742951273918152,
"step": 4010
},
{
"epoch": 0.6027042982036939,
"grad_norm": 0.296875,
"learning_rate": 7.808133731086063e-07,
"loss": 0.10504342317581176,
"step": 4020
},
{
"epoch": 0.6042035626270861,
"grad_norm": 0.34765625,
"learning_rate": 7.758483292096928e-07,
"loss": 0.10398197174072266,
"step": 4030
},
{
"epoch": 0.6057028270504784,
"grad_norm": 0.28515625,
"learning_rate": 7.708890965728249e-07,
"loss": 0.11235659122467041,
"step": 4040
},
{
"epoch": 0.6072020914738706,
"grad_norm": 0.28515625,
"learning_rate": 7.659358037689845e-07,
"loss": 0.10213931798934936,
"step": 4050
},
{
"epoch": 0.6087013558972629,
"grad_norm": 0.2314453125,
"learning_rate": 7.609885792151602e-07,
"loss": 0.09277363419532776,
"step": 4060
},
{
"epoch": 0.6102006203206551,
"grad_norm": 0.279296875,
"learning_rate": 7.560475511710174e-07,
"loss": 0.08845908641815185,
"step": 4070
},
{
"epoch": 0.6116998847440475,
"grad_norm": 0.2275390625,
"learning_rate": 7.511128477355728e-07,
"loss": 0.06152995824813843,
"step": 4080
},
{
"epoch": 0.6131991491674397,
"grad_norm": 0.1982421875,
"learning_rate": 7.461845968438753e-07,
"loss": 0.0993484079837799,
"step": 4090
},
{
"epoch": 0.614698413590832,
"grad_norm": 0.232421875,
"learning_rate": 7.412629262636861e-07,
"loss": 0.08685197830200195,
"step": 4100
},
{
"epoch": 0.6161976780142243,
"grad_norm": 0.203125,
"learning_rate": 7.363479635921693e-07,
"loss": 0.10489131212234497,
"step": 4110
},
{
"epoch": 0.6176969424376165,
"grad_norm": 0.2265625,
"learning_rate": 7.314398362525827e-07,
"loss": 0.0976183295249939,
"step": 4120
},
{
"epoch": 0.6191962068610088,
"grad_norm": 0.318359375,
"learning_rate": 7.265386714909732e-07,
"loss": 0.10362049341201782,
"step": 4130
},
{
"epoch": 0.6206954712844011,
"grad_norm": 0.21875,
"learning_rate": 7.216445963728795e-07,
"loss": 0.09439095258712768,
"step": 4140
},
{
"epoch": 0.6221947357077934,
"grad_norm": 0.20703125,
"learning_rate": 7.167577377800372e-07,
"loss": 0.07266764044761657,
"step": 4150
},
{
"epoch": 0.6236940001311856,
"grad_norm": 0.2021484375,
"learning_rate": 7.118782224070886e-07,
"loss": 0.08935718536376953,
"step": 4160
},
{
"epoch": 0.6251932645545779,
"grad_norm": 0.27734375,
"learning_rate": 7.070061767582993e-07,
"loss": 0.09530102014541626,
"step": 4170
},
{
"epoch": 0.6266925289779702,
"grad_norm": 0.205078125,
"learning_rate": 7.021417271442786e-07,
"loss": 0.08460386395454407,
"step": 4180
},
{
"epoch": 0.6281917934013624,
"grad_norm": 0.25390625,
"learning_rate": 6.972849996787029e-07,
"loss": 0.09141365885734558,
"step": 4190
},
{
"epoch": 0.6296910578247548,
"grad_norm": 0.18359375,
"learning_rate": 6.924361202750484e-07,
"loss": 0.09532070755958558,
"step": 4200
},
{
"epoch": 0.631190322248147,
"grad_norm": 0.2158203125,
"learning_rate": 6.875952146433252e-07,
"loss": 0.09375123977661133,
"step": 4210
},
{
"epoch": 0.6326895866715393,
"grad_norm": 0.2158203125,
"learning_rate": 6.827624082868191e-07,
"loss": 0.07426313161849976,
"step": 4220
},
{
"epoch": 0.6341888510949315,
"grad_norm": 0.267578125,
"learning_rate": 6.779378264988369e-07,
"loss": 0.09327669143676758,
"step": 4230
},
{
"epoch": 0.6356881155183238,
"grad_norm": 0.3046875,
"learning_rate": 6.731215943594597e-07,
"loss": 0.08692552447319031,
"step": 4240
},
{
"epoch": 0.6371873799417161,
"grad_norm": 0.283203125,
"learning_rate": 6.683138367322982e-07,
"loss": 0.0770199477672577,
"step": 4250
},
{
"epoch": 0.6386866443651084,
"grad_norm": 0.220703125,
"learning_rate": 6.635146782612568e-07,
"loss": 0.07209202647209167,
"step": 4260
},
{
"epoch": 0.6401859087885007,
"grad_norm": 0.2060546875,
"learning_rate": 6.587242433673023e-07,
"loss": 0.07247981429100037,
"step": 4270
},
{
"epoch": 0.6416851732118929,
"grad_norm": 0.19921875,
"learning_rate": 6.539426562452364e-07,
"loss": 0.07441559433937073,
"step": 4280
},
{
"epoch": 0.6431844376352852,
"grad_norm": 0.2021484375,
"learning_rate": 6.491700408604781e-07,
"loss": 0.0830713927745819,
"step": 4290
},
{
"epoch": 0.6446837020586774,
"grad_norm": 0.1845703125,
"learning_rate": 6.444065209458494e-07,
"loss": 0.0942071557044983,
"step": 4300
},
{
"epoch": 0.6461829664820697,
"grad_norm": 0.259765625,
"learning_rate": 6.396522199983659e-07,
"loss": 0.08134819865226746,
"step": 4310
},
{
"epoch": 0.647682230905462,
"grad_norm": 0.236328125,
"learning_rate": 6.349072612760366e-07,
"loss": 0.10018385648727417,
"step": 4320
},
{
"epoch": 0.6491814953288543,
"grad_norm": 0.228515625,
"learning_rate": 6.301717677946678e-07,
"loss": 0.09734719395637512,
"step": 4330
},
{
"epoch": 0.6506807597522466,
"grad_norm": 0.2431640625,
"learning_rate": 6.254458623246745e-07,
"loss": 0.0996459424495697,
"step": 4340
},
{
"epoch": 0.6521800241756388,
"grad_norm": 0.2236328125,
"learning_rate": 6.207296673878957e-07,
"loss": 0.070529043674469,
"step": 4350
},
{
"epoch": 0.6536792885990311,
"grad_norm": 0.20703125,
"learning_rate": 6.160233052544206e-07,
"loss": 0.07517372369766236,
"step": 4360
},
{
"epoch": 0.6551785530224233,
"grad_norm": 0.25390625,
"learning_rate": 6.113268979394162e-07,
"loss": 0.08323991298675537,
"step": 4370
},
{
"epoch": 0.6566778174458157,
"grad_norm": 0.2294921875,
"learning_rate": 6.066405671999657e-07,
"loss": 0.09829720854759216,
"step": 4380
},
{
"epoch": 0.6581770818692079,
"grad_norm": 0.30859375,
"learning_rate": 6.019644345319108e-07,
"loss": 0.06705747246742248,
"step": 4390
},
{
"epoch": 0.6596763462926002,
"grad_norm": 0.326171875,
"learning_rate": 5.972986211667032e-07,
"loss": 0.08918554186820984,
"step": 4400
},
{
"epoch": 0.6611756107159925,
"grad_norm": 0.193359375,
"learning_rate": 5.92643248068259e-07,
"loss": 0.0527131199836731,
"step": 4410
},
{
"epoch": 0.6626748751393847,
"grad_norm": 0.2109375,
"learning_rate": 5.87998435929826e-07,
"loss": 0.061626529693603514,
"step": 4420
},
{
"epoch": 0.664174139562777,
"grad_norm": 0.2314453125,
"learning_rate": 5.83364305170852e-07,
"loss": 0.10371142625808716,
"step": 4430
},
{
"epoch": 0.6656734039861693,
"grad_norm": 0.2236328125,
"learning_rate": 5.787409759338644e-07,
"loss": 0.08246560096740722,
"step": 4440
},
{
"epoch": 0.6671726684095616,
"grad_norm": 0.2099609375,
"learning_rate": 5.741285680813544e-07,
"loss": 0.07695434689521789,
"step": 4450
},
{
"epoch": 0.6686719328329538,
"grad_norm": 0.1982421875,
"learning_rate": 5.695272011926701e-07,
"loss": 0.06416907906532288,
"step": 4460
},
{
"epoch": 0.6701711972563461,
"grad_norm": 0.310546875,
"learning_rate": 5.649369945609169e-07,
"loss": 0.05495827198028565,
"step": 4470
},
{
"epoch": 0.6716704616797384,
"grad_norm": 0.224609375,
"learning_rate": 5.603580671898629e-07,
"loss": 0.07965745329856873,
"step": 4480
},
{
"epoch": 0.6731697261031306,
"grad_norm": 0.322265625,
"learning_rate": 5.557905377908558e-07,
"loss": 0.10348300933837891,
"step": 4490
},
{
"epoch": 0.674668990526523,
"grad_norm": 0.337890625,
"learning_rate": 5.512345247797437e-07,
"loss": 0.11305124759674072,
"step": 4500
},
{
"epoch": 0.6761682549499152,
"grad_norm": 0.2119140625,
"learning_rate": 5.466901462738057e-07,
"loss": 0.06318964958190917,
"step": 4510
},
{
"epoch": 0.6776675193733075,
"grad_norm": 0.23046875,
"learning_rate": 5.421575200886899e-07,
"loss": 0.10519200563430786,
"step": 4520
},
{
"epoch": 0.6791667837966997,
"grad_norm": 0.1787109375,
"learning_rate": 5.376367637353586e-07,
"loss": 0.08189275860786438,
"step": 4530
},
{
"epoch": 0.680666048220092,
"grad_norm": 0.498046875,
"learning_rate": 5.331279944170417e-07,
"loss": 0.09210953116416931,
"step": 4540
},
{
"epoch": 0.6821653126434843,
"grad_norm": 0.189453125,
"learning_rate": 5.286313290261982e-07,
"loss": 0.07461657524108886,
"step": 4550
},
{
"epoch": 0.6836645770668766,
"grad_norm": 0.30859375,
"learning_rate": 5.24146884141486e-07,
"loss": 0.09393454194068909,
"step": 4560
},
{
"epoch": 0.6851638414902689,
"grad_norm": 0.22265625,
"learning_rate": 5.19674776024739e-07,
"loss": 0.08053632378578186,
"step": 4570
},
{
"epoch": 0.6866631059136611,
"grad_norm": 0.294921875,
"learning_rate": 5.152151206179538e-07,
"loss": 0.07931421399116516,
"step": 4580
},
{
"epoch": 0.6881623703370534,
"grad_norm": 0.236328125,
"learning_rate": 5.107680335402824e-07,
"loss": 0.09329952597618103,
"step": 4590
},
{
"epoch": 0.6896616347604456,
"grad_norm": 0.240234375,
"learning_rate": 5.063336300850362e-07,
"loss": 0.07256720066070557,
"step": 4600
},
{
"epoch": 0.6911608991838379,
"grad_norm": 0.255859375,
"learning_rate": 5.019120252166966e-07,
"loss": 0.07386515140533448,
"step": 4610
},
{
"epoch": 0.6926601636072302,
"grad_norm": 0.1904296875,
"learning_rate": 4.975033335679332e-07,
"loss": 0.0855524480342865,
"step": 4620
},
{
"epoch": 0.6941594280306225,
"grad_norm": 0.220703125,
"learning_rate": 4.931076694366337e-07,
"loss": 0.08902753591537475,
"step": 4630
},
{
"epoch": 0.6956586924540148,
"grad_norm": 0.2236328125,
"learning_rate": 4.887251467829398e-07,
"loss": 0.09814743995666504,
"step": 4640
},
{
"epoch": 0.697157956877407,
"grad_norm": 0.294921875,
"learning_rate": 4.843558792262924e-07,
"loss": 0.09769907593727112,
"step": 4650
},
{
"epoch": 0.6986572213007993,
"grad_norm": 0.294921875,
"learning_rate": 4.799999800424867e-07,
"loss": 0.12376710176467895,
"step": 4660
},
{
"epoch": 0.7001564857241915,
"grad_norm": 0.2158203125,
"learning_rate": 4.7565756216073505e-07,
"loss": 0.07605620622634887,
"step": 4670
},
{
"epoch": 0.7016557501475839,
"grad_norm": 0.296875,
"learning_rate": 4.713287381607389e-07,
"loss": 0.09146468043327331,
"step": 4680
},
{
"epoch": 0.7031550145709761,
"grad_norm": 0.2001953125,
"learning_rate": 4.670136202697706e-07,
"loss": 0.11566205024719238,
"step": 4690
},
{
"epoch": 0.7046542789943684,
"grad_norm": 0.2099609375,
"learning_rate": 4.6271232035976395e-07,
"loss": 0.07541021108627319,
"step": 4700
},
{
"epoch": 0.7061535434177607,
"grad_norm": 0.2255859375,
"learning_rate": 4.5842494994441315e-07,
"loss": 0.10867191553115844,
"step": 4710
},
{
"epoch": 0.7076528078411529,
"grad_norm": 0.298828125,
"learning_rate": 4.541516201762824e-07,
"loss": 0.08358562588691712,
"step": 4720
},
{
"epoch": 0.7091520722645452,
"grad_norm": 0.2158203125,
"learning_rate": 4.4989244184392405e-07,
"loss": 0.10019409656524658,
"step": 4730
},
{
"epoch": 0.7106513366879375,
"grad_norm": 0.2353515625,
"learning_rate": 4.456475253690061e-07,
"loss": 0.08848651647567748,
"step": 4740
},
{
"epoch": 0.7121506011113298,
"grad_norm": 0.201171875,
"learning_rate": 4.414169808034496e-07,
"loss": 0.07086822390556335,
"step": 4750
},
{
"epoch": 0.713649865534722,
"grad_norm": 0.255859375,
"learning_rate": 4.3720091782657574e-07,
"loss": 0.1078036069869995,
"step": 4760
},
{
"epoch": 0.7151491299581143,
"grad_norm": 0.2314453125,
"learning_rate": 4.32999445742262e-07,
"loss": 0.09499780535697937,
"step": 4770
},
{
"epoch": 0.7166483943815066,
"grad_norm": 0.2431640625,
"learning_rate": 4.2881267347610837e-07,
"loss": 0.08308950662612916,
"step": 4780
},
{
"epoch": 0.7181476588048988,
"grad_norm": 0.2890625,
"learning_rate": 4.2464070957261375e-07,
"loss": 0.08044061660766602,
"step": 4790
},
{
"epoch": 0.7196469232282912,
"grad_norm": 0.2001953125,
"learning_rate": 4.204836621923618e-07,
"loss": 0.06061916947364807,
"step": 4800
},
{
"epoch": 0.7211461876516834,
"grad_norm": 0.2490234375,
"learning_rate": 4.1634163910921606e-07,
"loss": 0.10452162027359009,
"step": 4810
},
{
"epoch": 0.7226454520750757,
"grad_norm": 0.2158203125,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.0969232976436615,
"step": 4820
},
{
"epoch": 0.7241447164984679,
"grad_norm": 0.1728515625,
"learning_rate": 4.081030949793471e-07,
"loss": 0.07360079884529114,
"step": 4830
},
{
"epoch": 0.7256439809218602,
"grad_norm": 0.1943359375,
"learning_rate": 4.0400678752165807e-07,
"loss": 0.08355346322059631,
"step": 4840
},
{
"epoch": 0.7271432453452524,
"grad_norm": 0.279296875,
"learning_rate": 3.9992593153360563e-07,
"loss": 0.07457499504089356,
"step": 4850
},
{
"epoch": 0.7286425097686448,
"grad_norm": 0.314453125,
"learning_rate": 3.9586063281374796e-07,
"loss": 0.0845346987247467,
"step": 4860
},
{
"epoch": 0.7301417741920371,
"grad_norm": 0.2275390625,
"learning_rate": 3.9181099675731154e-07,
"loss": 0.07429866194725036,
"step": 4870
},
{
"epoch": 0.7316410386154293,
"grad_norm": 0.2255859375,
"learning_rate": 3.8777712835345966e-07,
"loss": 0.05976992845535278,
"step": 4880
},
{
"epoch": 0.7331403030388216,
"grad_norm": 0.1884765625,
"learning_rate": 3.837591321825696e-07,
"loss": 0.07514649033546447,
"step": 4890
},
{
"epoch": 0.7346395674622138,
"grad_norm": 0.22265625,
"learning_rate": 3.7975711241352224e-07,
"loss": 0.0838453233242035,
"step": 4900
},
{
"epoch": 0.7361388318856061,
"grad_norm": 0.28125,
"learning_rate": 3.757711728010007e-07,
"loss": 0.08041094541549683,
"step": 4910
},
{
"epoch": 0.7376380963089983,
"grad_norm": 0.271484375,
"learning_rate": 3.7180141668280065e-07,
"loss": 0.0707211971282959,
"step": 4920
},
{
"epoch": 0.7391373607323907,
"grad_norm": 0.2109375,
"learning_rate": 3.678479469771516e-07,
"loss": 0.09502058625221252,
"step": 4930
},
{
"epoch": 0.740636625155783,
"grad_norm": 0.25390625,
"learning_rate": 3.639108661800482e-07,
"loss": 0.09508728384971618,
"step": 4940
},
{
"epoch": 0.7421358895791752,
"grad_norm": 0.26953125,
"learning_rate": 3.59990276362593e-07,
"loss": 0.07535126805305481,
"step": 4950
},
{
"epoch": 0.7436351540025675,
"grad_norm": 0.271484375,
"learning_rate": 3.5608627916835077e-07,
"loss": 0.07866016626358033,
"step": 4960
},
{
"epoch": 0.7451344184259597,
"grad_norm": 0.177734375,
"learning_rate": 3.521989758107122e-07,
"loss": 0.10100013017654419,
"step": 4970
},
{
"epoch": 0.746633682849352,
"grad_norm": 0.365234375,
"learning_rate": 3.4832846707027144e-07,
"loss": 0.08256787061691284,
"step": 4980
},
{
"epoch": 0.7481329472727443,
"grad_norm": 0.185546875,
"learning_rate": 3.444748532922116e-07,
"loss": 0.08142110109329223,
"step": 4990
},
{
"epoch": 0.7496322116961366,
"grad_norm": 0.1806640625,
"learning_rate": 3.4063823438370477e-07,
"loss": 0.09730502367019653,
"step": 5000
},
{
"epoch": 0.7511314761195289,
"grad_norm": 0.2578125,
"learning_rate": 3.3681870981132076e-07,
"loss": 0.051060861349105834,
"step": 5010
},
{
"epoch": 0.7526307405429211,
"grad_norm": 0.29296875,
"learning_rate": 3.330163785984491e-07,
"loss": 0.07702358365058899,
"step": 5020
},
{
"epoch": 0.7541300049663134,
"grad_norm": 0.25,
"learning_rate": 3.292313393227313e-07,
"loss": 0.07249666452407837,
"step": 5030
},
{
"epoch": 0.7556292693897056,
"grad_norm": 0.2119140625,
"learning_rate": 3.254636901135055e-07,
"loss": 0.08777963519096374,
"step": 5040
},
{
"epoch": 0.757128533813098,
"grad_norm": 0.26171875,
"learning_rate": 3.2171352864926216e-07,
"loss": 0.09629991054534912,
"step": 5050
},
{
"epoch": 0.7586277982364902,
"grad_norm": 0.400390625,
"learning_rate": 3.179809521551119e-07,
"loss": 0.07828204035758972,
"step": 5060
},
{
"epoch": 0.7601270626598825,
"grad_norm": 0.2236328125,
"learning_rate": 3.142660574002648e-07,
"loss": 0.06039868593215943,
"step": 5070
},
{
"epoch": 0.7616263270832748,
"grad_norm": 0.26171875,
"learning_rate": 3.1056894069552154e-07,
"loss": 0.06850762367248535,
"step": 5080
},
{
"epoch": 0.763125591506667,
"grad_norm": 0.25390625,
"learning_rate": 3.0688969789077656e-07,
"loss": 0.07535871863365173,
"step": 5090
},
{
"epoch": 0.7646248559300594,
"grad_norm": 0.2275390625,
"learning_rate": 3.0322842437253303e-07,
"loss": 0.0845901370048523,
"step": 5100
},
{
"epoch": 0.7661241203534516,
"grad_norm": 0.267578125,
"learning_rate": 2.9958521506143006e-07,
"loss": 0.09275015592575073,
"step": 5110
},
{
"epoch": 0.7676233847768439,
"grad_norm": 0.24609375,
"learning_rate": 2.9596016440978175e-07,
"loss": 0.10449213981628418,
"step": 5120
},
{
"epoch": 0.7691226492002361,
"grad_norm": 0.2060546875,
"learning_rate": 2.923533663991282e-07,
"loss": 0.08837388157844543,
"step": 5130
},
{
"epoch": 0.7706219136236284,
"grad_norm": 0.330078125,
"learning_rate": 2.8876491453779936e-07,
"loss": 0.09125276803970336,
"step": 5140
},
{
"epoch": 0.7721211780470206,
"grad_norm": 0.2734375,
"learning_rate": 2.851949018584906e-07,
"loss": 0.0870974063873291,
"step": 5150
},
{
"epoch": 0.773620442470413,
"grad_norm": 1.109375,
"learning_rate": 2.816434209158508e-07,
"loss": 0.11278444528579712,
"step": 5160
},
{
"epoch": 0.7751197068938053,
"grad_norm": 0.2431640625,
"learning_rate": 2.781105637840829e-07,
"loss": 0.11417597532272339,
"step": 5170
},
{
"epoch": 0.7766189713171975,
"grad_norm": 0.2080078125,
"learning_rate": 2.7459642205455657e-07,
"loss": 0.0695708453655243,
"step": 5180
},
{
"epoch": 0.7781182357405898,
"grad_norm": 0.294921875,
"learning_rate": 2.71101086833434e-07,
"loss": 0.07352896332740784,
"step": 5190
},
{
"epoch": 0.779617500163982,
"grad_norm": 0.265625,
"learning_rate": 2.6762464873930754e-07,
"loss": 0.09707750678062439,
"step": 5200
},
{
"epoch": 0.7811167645873743,
"grad_norm": 0.1865234375,
"learning_rate": 2.6416719790085084e-07,
"loss": 0.09525392651557922,
"step": 5210
},
{
"epoch": 0.7826160290107665,
"grad_norm": 0.291015625,
"learning_rate": 2.607288239544817e-07,
"loss": 0.10324461460113525,
"step": 5220
},
{
"epoch": 0.7841152934341589,
"grad_norm": 0.2490234375,
"learning_rate": 2.573096160420386e-07,
"loss": 0.056819206476211546,
"step": 5230
},
{
"epoch": 0.7856145578575512,
"grad_norm": 0.1875,
"learning_rate": 2.5390966280846925e-07,
"loss": 0.07321354150772094,
"step": 5240
},
{
"epoch": 0.7871138222809434,
"grad_norm": 0.2177734375,
"learning_rate": 2.505290523995329e-07,
"loss": 0.05529284477233887,
"step": 5250
},
{
"epoch": 0.7886130867043357,
"grad_norm": 0.28515625,
"learning_rate": 2.4716787245951465e-07,
"loss": 0.08749927282333374,
"step": 5260
},
{
"epoch": 0.7901123511277279,
"grad_norm": 0.251953125,
"learning_rate": 2.4382621012895367e-07,
"loss": 0.10226259231567383,
"step": 5270
},
{
"epoch": 0.7916116155511203,
"grad_norm": 0.369140625,
"learning_rate": 2.405041520423835e-07,
"loss": 0.08864956498146057,
"step": 5280
},
{
"epoch": 0.7931108799745125,
"grad_norm": 0.2197265625,
"learning_rate": 2.372017843260864e-07,
"loss": 0.10684455633163452,
"step": 5290
},
{
"epoch": 0.7946101443979048,
"grad_norm": 0.1884765625,
"learning_rate": 2.3391919259586057e-07,
"loss": 0.09059134125709534,
"step": 5300
},
{
"epoch": 0.7961094088212971,
"grad_norm": 0.2158203125,
"learning_rate": 2.3065646195479992e-07,
"loss": 0.07700026631355286,
"step": 5310
},
{
"epoch": 0.7976086732446893,
"grad_norm": 0.37890625,
"learning_rate": 2.2741367699108839e-07,
"loss": 0.08473354578018188,
"step": 5320
},
{
"epoch": 0.7991079376680816,
"grad_norm": 0.2265625,
"learning_rate": 2.2419092177580666e-07,
"loss": 0.07873227596282958,
"step": 5330
},
{
"epoch": 0.8006072020914738,
"grad_norm": 0.26953125,
"learning_rate": 2.209882798607523e-07,
"loss": 0.09732807874679565,
"step": 5340
},
{
"epoch": 0.8021064665148662,
"grad_norm": 0.26953125,
"learning_rate": 2.178058342762743e-07,
"loss": 0.10025830268859863,
"step": 5350
},
{
"epoch": 0.8036057309382584,
"grad_norm": 0.263671875,
"learning_rate": 2.1464366752911979e-07,
"loss": 0.09230310916900634,
"step": 5360
},
{
"epoch": 0.8051049953616507,
"grad_norm": 0.2353515625,
"learning_rate": 2.1150186160029525e-07,
"loss": 0.06340540051460267,
"step": 5370
},
{
"epoch": 0.8066042597850429,
"grad_norm": 0.26953125,
"learning_rate": 2.0838049794294132e-07,
"loss": 0.10046428442001343,
"step": 5380
},
{
"epoch": 0.8081035242084352,
"grad_norm": 0.220703125,
"learning_rate": 2.052796574802209e-07,
"loss": 0.06854251027107239,
"step": 5390
},
{
"epoch": 0.8096027886318276,
"grad_norm": 0.2216796875,
"learning_rate": 2.0219942060322114e-07,
"loss": 0.08301514387130737,
"step": 5400
},
{
"epoch": 0.8111020530552198,
"grad_norm": 0.2734375,
"learning_rate": 1.99139867168869e-07,
"loss": 0.06499930620193481,
"step": 5410
},
{
"epoch": 0.8126013174786121,
"grad_norm": 0.275390625,
"learning_rate": 1.9610107649786167e-07,
"loss": 0.08899691700935364,
"step": 5420
},
{
"epoch": 0.8141005819020043,
"grad_norm": 0.1923828125,
"learning_rate": 1.9308312737260934e-07,
"loss": 0.06367949843406677,
"step": 5430
},
{
"epoch": 0.8155998463253966,
"grad_norm": 0.2578125,
"learning_rate": 1.9008609803519304e-07,
"loss": 0.09109672904014587,
"step": 5440
},
{
"epoch": 0.8170991107487888,
"grad_norm": 0.2373046875,
"learning_rate": 1.871100661853363e-07,
"loss": 0.0652251660823822,
"step": 5450
},
{
"epoch": 0.8185983751721811,
"grad_norm": 0.263671875,
"learning_rate": 1.841551089783907e-07,
"loss": 0.10543818473815918,
"step": 5460
},
{
"epoch": 0.8200976395955735,
"grad_norm": 0.2333984375,
"learning_rate": 1.8122130302333517e-07,
"loss": 0.07551140189170838,
"step": 5470
},
{
"epoch": 0.8215969040189657,
"grad_norm": 0.255859375,
"learning_rate": 1.7830872438079048e-07,
"loss": 0.07271650433540344,
"step": 5480
},
{
"epoch": 0.823096168442358,
"grad_norm": 0.21484375,
"learning_rate": 1.7541744856104667e-07,
"loss": 0.07429500818252563,
"step": 5490
},
{
"epoch": 0.8245954328657502,
"grad_norm": 0.287109375,
"learning_rate": 1.7254755052210624e-07,
"loss": 0.06771766543388366,
"step": 5500
},
{
"epoch": 0.8260946972891425,
"grad_norm": 0.3046875,
"learning_rate": 1.6969910466773973e-07,
"loss": 0.11255881786346436,
"step": 5510
},
{
"epoch": 0.8275939617125347,
"grad_norm": 0.2080078125,
"learning_rate": 1.66872184845558e-07,
"loss": 0.07378043532371521,
"step": 5520
},
{
"epoch": 0.8290932261359271,
"grad_norm": 0.2236328125,
"learning_rate": 1.6406686434509644e-07,
"loss": 0.06890552639961242,
"step": 5530
},
{
"epoch": 0.8305924905593194,
"grad_norm": 0.2060546875,
"learning_rate": 1.6128321589591587e-07,
"loss": 0.08552584648132325,
"step": 5540
},
{
"epoch": 0.8320917549827116,
"grad_norm": 0.326171875,
"learning_rate": 1.5852131166571648e-07,
"loss": 0.08140406608581544,
"step": 5550
},
{
"epoch": 0.8335910194061039,
"grad_norm": 0.251953125,
"learning_rate": 1.55781223258467e-07,
"loss": 0.09987716674804688,
"step": 5560
},
{
"epoch": 0.8350902838294961,
"grad_norm": 0.1982421875,
"learning_rate": 1.5306302171254836e-07,
"loss": 0.0620901346206665,
"step": 5570
},
{
"epoch": 0.8365895482528884,
"grad_norm": 0.263671875,
"learning_rate": 1.503667774989119e-07,
"loss": 0.07742155194282532,
"step": 5580
},
{
"epoch": 0.8380888126762807,
"grad_norm": 0.27734375,
"learning_rate": 1.4769256051925228e-07,
"loss": 0.09683317542076111,
"step": 5590
},
{
"epoch": 0.839588077099673,
"grad_norm": 0.2177734375,
"learning_rate": 1.4504044010419513e-07,
"loss": 0.10250561237335205,
"step": 5600
},
{
"epoch": 0.8410873415230652,
"grad_norm": 0.2314453125,
"learning_rate": 1.4241048501150088e-07,
"loss": 0.0593035876750946,
"step": 5610
},
{
"epoch": 0.8425866059464575,
"grad_norm": 0.33203125,
"learning_rate": 1.3980276342427966e-07,
"loss": 0.07098089456558228,
"step": 5620
},
{
"epoch": 0.8440858703698498,
"grad_norm": 0.25,
"learning_rate": 1.3721734294922594e-07,
"loss": 0.08620147705078125,
"step": 5630
},
{
"epoch": 0.845585134793242,
"grad_norm": 0.2138671875,
"learning_rate": 1.346542906148649e-07,
"loss": 0.08298314213752747,
"step": 5640
},
{
"epoch": 0.8470843992166344,
"grad_norm": 0.34765625,
"learning_rate": 1.3211367286981458e-07,
"loss": 0.1136427640914917,
"step": 5650
},
{
"epoch": 0.8485836636400266,
"grad_norm": 0.25,
"learning_rate": 1.2959555558106282e-07,
"loss": 0.0708082675933838,
"step": 5660
},
{
"epoch": 0.8500829280634189,
"grad_norm": 0.25390625,
"learning_rate": 1.271000040322614e-07,
"loss": 0.09266042709350586,
"step": 5670
},
{
"epoch": 0.8515821924868111,
"grad_norm": 0.298828125,
"learning_rate": 1.2462708292203062e-07,
"loss": 0.09188313484191894,
"step": 5680
},
{
"epoch": 0.8530814569102034,
"grad_norm": 0.3046875,
"learning_rate": 1.2217685636228447e-07,
"loss": 0.11194919347763062,
"step": 5690
},
{
"epoch": 0.8545807213335957,
"grad_norm": 0.259765625,
"learning_rate": 1.1974938787656742e-07,
"loss": 0.0845366358757019,
"step": 5700
},
{
"epoch": 0.856079985756988,
"grad_norm": 0.28515625,
"learning_rate": 1.1734474039840737e-07,
"loss": 0.07923954129219055,
"step": 5710
},
{
"epoch": 0.8575792501803803,
"grad_norm": 0.306640625,
"learning_rate": 1.1496297626968465e-07,
"loss": 0.09228439927101136,
"step": 5720
},
{
"epoch": 0.8590785146037725,
"grad_norm": 0.2314453125,
"learning_rate": 1.1260415723901584e-07,
"loss": 0.08742096424102783,
"step": 5730
},
{
"epoch": 0.8605777790271648,
"grad_norm": 0.2353515625,
"learning_rate": 1.1026834446015177e-07,
"loss": 0.07722960710525513,
"step": 5740
},
{
"epoch": 0.862077043450557,
"grad_norm": 0.2060546875,
"learning_rate": 1.0795559849039315e-07,
"loss": 0.08857112526893615,
"step": 5750
},
{
"epoch": 0.8635763078739493,
"grad_norm": 0.205078125,
"learning_rate": 1.0566597928902043e-07,
"loss": 0.06474360227584838,
"step": 5760
},
{
"epoch": 0.8650755722973417,
"grad_norm": 0.29296875,
"learning_rate": 1.033995462157392e-07,
"loss": 0.09699549674987792,
"step": 5770
},
{
"epoch": 0.8665748367207339,
"grad_norm": 0.2451171875,
"learning_rate": 1.0115635802914101e-07,
"loss": 0.07245502471923829,
"step": 5780
},
{
"epoch": 0.8680741011441262,
"grad_norm": 0.26171875,
"learning_rate": 9.89364728851807e-08,
"loss": 0.07710716128349304,
"step": 5790
},
{
"epoch": 0.8695733655675184,
"grad_norm": 0.294921875,
"learning_rate": 9.673994833566746e-08,
"loss": 0.07985681295394897,
"step": 5800
},
{
"epoch": 0.8710726299909107,
"grad_norm": 0.212890625,
"learning_rate": 9.456684132677418e-08,
"loss": 0.07051183581352234,
"step": 5810
},
{
"epoch": 0.8725718944143029,
"grad_norm": 0.2392578125,
"learning_rate": 9.241720819756016e-08,
"loss": 0.09385765790939331,
"step": 5820
},
{
"epoch": 0.8740711588376953,
"grad_norm": 0.302734375,
"learning_rate": 9.029110467851076e-08,
"loss": 0.07226101160049439,
"step": 5830
},
{
"epoch": 0.8755704232610876,
"grad_norm": 0.224609375,
"learning_rate": 8.818858589009248e-08,
"loss": 0.07575808763504029,
"step": 5840
},
{
"epoch": 0.8770696876844798,
"grad_norm": 0.1962890625,
"learning_rate": 8.610970634132465e-08,
"loss": 0.07295922040939332,
"step": 5850
},
{
"epoch": 0.8785689521078721,
"grad_norm": 0.291015625,
"learning_rate": 8.405451992836442e-08,
"loss": 0.08540709614753723,
"step": 5860
},
{
"epoch": 0.8800682165312643,
"grad_norm": 0.240234375,
"learning_rate": 8.202307993311153e-08,
"loss": 0.08457719087600708,
"step": 5870
},
{
"epoch": 0.8815674809546566,
"grad_norm": 0.224609375,
"learning_rate": 8.001543902182594e-08,
"loss": 0.06852260828018189,
"step": 5880
},
{
"epoch": 0.8830667453780489,
"grad_norm": 0.208984375,
"learning_rate": 7.803164924376248e-08,
"loss": 0.0945811927318573,
"step": 5890
},
{
"epoch": 0.8845660098014412,
"grad_norm": 0.2734375,
"learning_rate": 7.607176202982112e-08,
"loss": 0.07205227017402649,
"step": 5900
},
{
"epoch": 0.8860652742248334,
"grad_norm": 0.25390625,
"learning_rate": 7.413582819121511e-08,
"loss": 0.08640796542167664,
"step": 5910
},
{
"epoch": 0.8875645386482257,
"grad_norm": 0.2060546875,
"learning_rate": 7.22238979181512e-08,
"loss": 0.0951160728931427,
"step": 5920
},
{
"epoch": 0.889063803071618,
"grad_norm": 0.21484375,
"learning_rate": 7.033602077853052e-08,
"loss": 0.07211223244667053,
"step": 5930
},
{
"epoch": 0.8905630674950102,
"grad_norm": 0.2373046875,
"learning_rate": 6.847224571666277e-08,
"loss": 0.07400254607200622,
"step": 5940
},
{
"epoch": 0.8920623319184026,
"grad_norm": 0.298828125,
"learning_rate": 6.663262105199718e-08,
"loss": 0.09436286687850952,
"step": 5950
},
{
"epoch": 0.8935615963417948,
"grad_norm": 0.255859375,
"learning_rate": 6.481719447786971e-08,
"loss": 0.07624666690826416,
"step": 5960
},
{
"epoch": 0.8950608607651871,
"grad_norm": 0.25,
"learning_rate": 6.302601306026755e-08,
"loss": 0.08409606218338013,
"step": 5970
},
{
"epoch": 0.8965601251885793,
"grad_norm": 0.2265625,
"learning_rate": 6.125912323660709e-08,
"loss": 0.07607480883598328,
"step": 5980
},
{
"epoch": 0.8980593896119716,
"grad_norm": 0.2412109375,
"learning_rate": 5.951657081453176e-08,
"loss": 0.08595433235168456,
"step": 5990
},
{
"epoch": 0.899558654035364,
"grad_norm": 0.181640625,
"learning_rate": 5.7798400970723634e-08,
"loss": 0.0745903193950653,
"step": 6000
},
{
"epoch": 0.9010579184587562,
"grad_norm": 0.2392578125,
"learning_rate": 5.610465824973232e-08,
"loss": 0.07999681830406188,
"step": 6010
},
{
"epoch": 0.9025571828821485,
"grad_norm": 0.205078125,
"learning_rate": 5.443538656281954e-08,
"loss": 0.08919501900672913,
"step": 6020
},
{
"epoch": 0.9040564473055407,
"grad_norm": 0.20703125,
"learning_rate": 5.279062918682253e-08,
"loss": 0.07325602769851684,
"step": 6030
},
{
"epoch": 0.905555711728933,
"grad_norm": 0.2236328125,
"learning_rate": 5.117042876302946e-08,
"loss": 0.07375933527946472,
"step": 6040
},
{
"epoch": 0.9070549761523252,
"grad_norm": 0.30859375,
"learning_rate": 4.9574827296075986e-08,
"loss": 0.09143089056015015,
"step": 6050
},
{
"epoch": 0.9085542405757175,
"grad_norm": 0.205078125,
"learning_rate": 4.800386615285534e-08,
"loss": 0.06721729636192322,
"step": 6060
},
{
"epoch": 0.9100535049991099,
"grad_norm": 0.2314453125,
"learning_rate": 4.645758606144623e-08,
"loss": 0.0724267840385437,
"step": 6070
},
{
"epoch": 0.9115527694225021,
"grad_norm": 0.263671875,
"learning_rate": 4.49360271100564e-08,
"loss": 0.09417140483856201,
"step": 6080
},
{
"epoch": 0.9130520338458944,
"grad_norm": 0.2138671875,
"learning_rate": 4.3439228745984493e-08,
"loss": 0.10223345756530762,
"step": 6090
},
{
"epoch": 0.9145512982692866,
"grad_norm": 0.259765625,
"learning_rate": 4.196722977459566e-08,
"loss": 0.08283578753471374,
"step": 6100
},
{
"epoch": 0.9160505626926789,
"grad_norm": 0.3359375,
"learning_rate": 4.0520068358317e-08,
"loss": 0.11019489765167237,
"step": 6110
},
{
"epoch": 0.9175498271160711,
"grad_norm": 0.21484375,
"learning_rate": 3.9097782015647286e-08,
"loss": 0.07297813296318054,
"step": 6120
},
{
"epoch": 0.9190490915394635,
"grad_norm": 0.2060546875,
"learning_rate": 3.7700407620184674e-08,
"loss": 0.07638216018676758,
"step": 6130
},
{
"epoch": 0.9205483559628557,
"grad_norm": 0.1962890625,
"learning_rate": 3.632798139967064e-08,
"loss": 0.09769478440284729,
"step": 6140
},
{
"epoch": 0.922047620386248,
"grad_norm": 0.345703125,
"learning_rate": 3.498053893505126e-08,
"loss": 0.07059162259101867,
"step": 6150
},
{
"epoch": 0.9235468848096403,
"grad_norm": 0.2080078125,
"learning_rate": 3.365811515955319e-08,
"loss": 0.10193029642105103,
"step": 6160
},
{
"epoch": 0.9250461492330325,
"grad_norm": 0.2099609375,
"learning_rate": 3.236074435777991e-08,
"loss": 0.08877017498016357,
"step": 6170
},
{
"epoch": 0.9265454136564248,
"grad_norm": 0.19921875,
"learning_rate": 3.1088460164821694e-08,
"loss": 0.07558783888816833,
"step": 6180
},
{
"epoch": 0.928044678079817,
"grad_norm": 0.26953125,
"learning_rate": 2.984129556538417e-08,
"loss": 0.10496606826782226,
"step": 6190
},
{
"epoch": 0.9295439425032094,
"grad_norm": 0.1826171875,
"learning_rate": 2.8619282892932472e-08,
"loss": 0.08706371784210205,
"step": 6200
},
{
"epoch": 0.9310432069266016,
"grad_norm": 0.24609375,
"learning_rate": 2.742245382885422e-08,
"loss": 0.07445533275604248,
"step": 6210
},
{
"epoch": 0.9325424713499939,
"grad_norm": 0.2392578125,
"learning_rate": 2.6250839401636636e-08,
"loss": 0.08374568819999695,
"step": 6220
},
{
"epoch": 0.9340417357733862,
"grad_norm": 0.220703125,
"learning_rate": 2.510446998606297e-08,
"loss": 0.08437891006469726,
"step": 6230
},
{
"epoch": 0.9355410001967784,
"grad_norm": 0.232421875,
"learning_rate": 2.3983375302425445e-08,
"loss": 0.06599584221839905,
"step": 6240
},
{
"epoch": 0.9370402646201708,
"grad_norm": 0.390625,
"learning_rate": 2.2887584415753558e-08,
"loss": 0.08677806854248046,
"step": 6250
},
{
"epoch": 0.938539529043563,
"grad_norm": 0.173828125,
"learning_rate": 2.1817125735061448e-08,
"loss": 0.057446730136871335,
"step": 6260
},
{
"epoch": 0.9400387934669553,
"grad_norm": 0.279296875,
"learning_rate": 2.0772027012611382e-08,
"loss": 0.07344555258750915,
"step": 6270
},
{
"epoch": 0.9415380578903475,
"grad_norm": 0.2333984375,
"learning_rate": 1.975231534319366e-08,
"loss": 0.061513519287109374,
"step": 6280
},
{
"epoch": 0.9430373223137398,
"grad_norm": 0.234375,
"learning_rate": 1.875801716342462e-08,
"loss": 0.08662024140357971,
"step": 6290
},
{
"epoch": 0.9445365867371321,
"grad_norm": 0.2890625,
"learning_rate": 1.7789158251061087e-08,
"loss": 0.08880329728126526,
"step": 6300
},
{
"epoch": 0.9460358511605244,
"grad_norm": 0.23046875,
"learning_rate": 1.684576372433222e-08,
"loss": 0.08403295874595643,
"step": 6310
},
{
"epoch": 0.9475351155839167,
"grad_norm": 0.2099609375,
"learning_rate": 1.5927858041288154e-08,
"loss": 0.07371333837509156,
"step": 6320
},
{
"epoch": 0.9490343800073089,
"grad_norm": 0.255859375,
"learning_rate": 1.503546499916608e-08,
"loss": 0.0930757999420166,
"step": 6330
},
{
"epoch": 0.9505336444307012,
"grad_norm": 0.2490234375,
"learning_rate": 1.4168607733773042e-08,
"loss": 0.09260554909706116,
"step": 6340
},
{
"epoch": 0.9520329088540934,
"grad_norm": 0.23046875,
"learning_rate": 1.3327308718886322e-08,
"loss": 0.06500183939933776,
"step": 6350
},
{
"epoch": 0.9535321732774857,
"grad_norm": 0.259765625,
"learning_rate": 1.2511589765670682e-08,
"loss": 0.12267719507217408,
"step": 6360
},
{
"epoch": 0.9550314377008781,
"grad_norm": 0.23046875,
"learning_rate": 1.1721472022113044e-08,
"loss": 0.15489401817321777,
"step": 6370
},
{
"epoch": 0.9565307021242703,
"grad_norm": 0.41015625,
"learning_rate": 1.0956975972474136e-08,
"loss": 0.08266881704330445,
"step": 6380
},
{
"epoch": 0.9580299665476626,
"grad_norm": 0.21875,
"learning_rate": 1.0218121436757266e-08,
"loss": 0.062265390157699586,
"step": 6390
},
{
"epoch": 0.9595292309710548,
"grad_norm": 0.3125,
"learning_rate": 9.504927570194831e-09,
"loss": 0.11146190166473388,
"step": 6400
},
{
"epoch": 0.9610284953944471,
"grad_norm": 0.271484375,
"learning_rate": 8.817412862751172e-09,
"loss": 0.11401185989379883,
"step": 6410
},
{
"epoch": 0.9625277598178393,
"grad_norm": 0.259765625,
"learning_rate": 8.155595138644055e-09,
"loss": 0.06964959502220154,
"step": 6420
},
{
"epoch": 0.9640270242412317,
"grad_norm": 0.248046875,
"learning_rate": 7.519491555881497e-09,
"loss": 0.08737698793411255,
"step": 6430
},
{
"epoch": 0.9655262886646239,
"grad_norm": 0.39453125,
"learning_rate": 6.909118605817776e-09,
"loss": 0.09992367029190063,
"step": 6440
},
{
"epoch": 0.9670255530880162,
"grad_norm": 0.458984375,
"learning_rate": 6.324492112725676e-09,
"loss": 0.10620630979537964,
"step": 6450
},
{
"epoch": 0.9685248175114085,
"grad_norm": 0.27734375,
"learning_rate": 5.765627233386028e-09,
"loss": 0.09715937972068786,
"step": 6460
},
{
"epoch": 0.9700240819348007,
"grad_norm": 0.18359375,
"learning_rate": 5.2325384566949126e-09,
"loss": 0.07616119980812072,
"step": 6470
},
{
"epoch": 0.971523346358193,
"grad_norm": 0.2041015625,
"learning_rate": 4.725239603287856e-09,
"loss": 0.08586298823356628,
"step": 6480
},
{
"epoch": 0.9730226107815853,
"grad_norm": 0.2158203125,
"learning_rate": 4.243743825181889e-09,
"loss": 0.10227413177490234,
"step": 6490
},
{
"epoch": 0.9745218752049776,
"grad_norm": 0.240234375,
"learning_rate": 3.788063605434267e-09,
"loss": 0.1260104298591614,
"step": 6500
}
],
"logging_steps": 10,
"max_steps": 6670,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.128917086797111e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}