Files
gpt2-poems-finetuned-v1/last-checkpoint/trainer_state.json
ModelHub XC ecd909d340 初始化项目,由ModelHub XC社区提供模型
Model: kriteekathapa/gpt2-poems-finetuned-v1
Source: Original Platform
2026-04-30 23:24:19 +08:00

2362 lines
60 KiB
JSON

{
"best_global_step": 7500,
"best_metric": 3.910543441772461,
"best_model_checkpoint": "./gpt2-poems-finetuned\\checkpoint-7500",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 7835,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006385186367627105,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 4.7595,
"step": 1
},
{
"epoch": 0.015962965919067762,
"grad_norm": 9.886136054992676,
"learning_rate": 6.122448979591837e-07,
"loss": 4.7981,
"step": 25
},
{
"epoch": 0.031925931838135524,
"grad_norm": 5.848156452178955,
"learning_rate": 1.25e-06,
"loss": 4.7085,
"step": 50
},
{
"epoch": 0.047888897757203286,
"grad_norm": 7.19024133682251,
"learning_rate": 1.8877551020408163e-06,
"loss": 4.5164,
"step": 75
},
{
"epoch": 0.06385186367627105,
"grad_norm": 4.210382461547852,
"learning_rate": 2.5255102040816328e-06,
"loss": 4.4614,
"step": 100
},
{
"epoch": 0.07981482959533881,
"grad_norm": 5.846686363220215,
"learning_rate": 3.1632653061224496e-06,
"loss": 4.3782,
"step": 125
},
{
"epoch": 0.09577779551440657,
"grad_norm": 4.44492769241333,
"learning_rate": 3.8010204081632656e-06,
"loss": 4.3687,
"step": 150
},
{
"epoch": 0.11174076143347433,
"grad_norm": 3.783334493637085,
"learning_rate": 4.438775510204082e-06,
"loss": 4.3127,
"step": 175
},
{
"epoch": 0.1277037273525421,
"grad_norm": 4.701773166656494,
"learning_rate": 5.0765306122448985e-06,
"loss": 4.3091,
"step": 200
},
{
"epoch": 0.14366669327160986,
"grad_norm": 5.687783241271973,
"learning_rate": 5.7142857142857145e-06,
"loss": 4.3117,
"step": 225
},
{
"epoch": 0.15962965919067762,
"grad_norm": 3.9771533012390137,
"learning_rate": 6.352040816326531e-06,
"loss": 4.2744,
"step": 250
},
{
"epoch": 0.17559262510974538,
"grad_norm": 4.070949077606201,
"learning_rate": 6.989795918367348e-06,
"loss": 4.3154,
"step": 275
},
{
"epoch": 0.19155559102881314,
"grad_norm": 4.269972801208496,
"learning_rate": 7.627551020408163e-06,
"loss": 4.2603,
"step": 300
},
{
"epoch": 0.2075185569478809,
"grad_norm": 3.923750400543213,
"learning_rate": 8.26530612244898e-06,
"loss": 4.2725,
"step": 325
},
{
"epoch": 0.22348152286694867,
"grad_norm": 3.631035327911377,
"learning_rate": 8.903061224489795e-06,
"loss": 4.2377,
"step": 350
},
{
"epoch": 0.23944448878601643,
"grad_norm": 3.9385716915130615,
"learning_rate": 9.540816326530612e-06,
"loss": 4.2813,
"step": 375
},
{
"epoch": 0.2554074547050842,
"grad_norm": 4.328210353851318,
"learning_rate": 1.0178571428571429e-05,
"loss": 4.273,
"step": 400
},
{
"epoch": 0.27137042062415195,
"grad_norm": 3.800662040710449,
"learning_rate": 1.0816326530612246e-05,
"loss": 4.2285,
"step": 425
},
{
"epoch": 0.2873333865432197,
"grad_norm": 4.066190719604492,
"learning_rate": 1.1454081632653063e-05,
"loss": 4.2081,
"step": 450
},
{
"epoch": 0.3032963524622875,
"grad_norm": 3.563178300857544,
"learning_rate": 1.2091836734693878e-05,
"loss": 4.2176,
"step": 475
},
{
"epoch": 0.31925931838135524,
"grad_norm": 3.9187610149383545,
"learning_rate": 1.2729591836734697e-05,
"loss": 4.2281,
"step": 500
},
{
"epoch": 0.31925931838135524,
"eval_loss": 4.076858043670654,
"eval_runtime": 58.2059,
"eval_samples_per_second": 53.809,
"eval_steps_per_second": 26.904,
"step": 500
},
{
"epoch": 0.335222284300423,
"grad_norm": 3.895627498626709,
"learning_rate": 1.3367346938775512e-05,
"loss": 4.2277,
"step": 525
},
{
"epoch": 0.35118525021949076,
"grad_norm": 4.739663600921631,
"learning_rate": 1.4005102040816327e-05,
"loss": 4.216,
"step": 550
},
{
"epoch": 0.3671482161385585,
"grad_norm": 4.143375396728516,
"learning_rate": 1.4642857142857144e-05,
"loss": 4.1842,
"step": 575
},
{
"epoch": 0.3831111820576263,
"grad_norm": 4.17396879196167,
"learning_rate": 1.528061224489796e-05,
"loss": 4.2185,
"step": 600
},
{
"epoch": 0.39907414797669405,
"grad_norm": 4.158074855804443,
"learning_rate": 1.5918367346938776e-05,
"loss": 4.1617,
"step": 625
},
{
"epoch": 0.4150371138957618,
"grad_norm": 3.739445924758911,
"learning_rate": 1.655612244897959e-05,
"loss": 4.2013,
"step": 650
},
{
"epoch": 0.4310000798148296,
"grad_norm": 3.669919729232788,
"learning_rate": 1.719387755102041e-05,
"loss": 4.1807,
"step": 675
},
{
"epoch": 0.44696304573389734,
"grad_norm": 3.9435887336730957,
"learning_rate": 1.7831632653061225e-05,
"loss": 4.168,
"step": 700
},
{
"epoch": 0.4629260116529651,
"grad_norm": 4.32051420211792,
"learning_rate": 1.8469387755102043e-05,
"loss": 4.1481,
"step": 725
},
{
"epoch": 0.47888897757203286,
"grad_norm": 3.661527633666992,
"learning_rate": 1.910714285714286e-05,
"loss": 4.1624,
"step": 750
},
{
"epoch": 0.4948519434911006,
"grad_norm": 3.5897233486175537,
"learning_rate": 1.9744897959183677e-05,
"loss": 4.1445,
"step": 775
},
{
"epoch": 0.5108149094101684,
"grad_norm": 4.591588973999023,
"learning_rate": 1.9999776668891292e-05,
"loss": 4.1601,
"step": 800
},
{
"epoch": 0.5267778753292361,
"grad_norm": 4.352611541748047,
"learning_rate": 1.9998411903795984e-05,
"loss": 4.1446,
"step": 825
},
{
"epoch": 0.5427408412483039,
"grad_norm": 3.704148292541504,
"learning_rate": 1.9995806615567444e-05,
"loss": 4.1216,
"step": 850
},
{
"epoch": 0.5587038071673717,
"grad_norm": 3.941983699798584,
"learning_rate": 1.999196112744904e-05,
"loss": 4.1799,
"step": 875
},
{
"epoch": 0.5746667730864394,
"grad_norm": 4.044922828674316,
"learning_rate": 1.9986875916558237e-05,
"loss": 4.11,
"step": 900
},
{
"epoch": 0.5906297390055072,
"grad_norm": 3.791494131088257,
"learning_rate": 1.9980551613827405e-05,
"loss": 4.1668,
"step": 925
},
{
"epoch": 0.606592704924575,
"grad_norm": 3.999377965927124,
"learning_rate": 1.9972989003925544e-05,
"loss": 4.1645,
"step": 950
},
{
"epoch": 0.6225556708436427,
"grad_norm": 3.6708574295043945,
"learning_rate": 1.996418902516092e-05,
"loss": 4.1543,
"step": 975
},
{
"epoch": 0.6385186367627105,
"grad_norm": 3.741013526916504,
"learning_rate": 1.995415276936465e-05,
"loss": 4.1077,
"step": 1000
},
{
"epoch": 0.6385186367627105,
"eval_loss": 4.022159099578857,
"eval_runtime": 57.4323,
"eval_samples_per_second": 54.534,
"eval_steps_per_second": 27.267,
"step": 1000
},
{
"epoch": 0.6544816026817782,
"grad_norm": 3.789487838745117,
"learning_rate": 1.9942881481755233e-05,
"loss": 4.1637,
"step": 1025
},
{
"epoch": 0.670444568600846,
"grad_norm": 3.9883713722229004,
"learning_rate": 1.9930376560784057e-05,
"loss": 4.1303,
"step": 1050
},
{
"epoch": 0.6864075345199138,
"grad_norm": 3.6000473499298096,
"learning_rate": 1.9916639557961895e-05,
"loss": 4.0924,
"step": 1075
},
{
"epoch": 0.7023705004389815,
"grad_norm": 3.537168264389038,
"learning_rate": 1.99016721776664e-05,
"loss": 4.1139,
"step": 1100
},
{
"epoch": 0.7183334663580493,
"grad_norm": 3.460998773574829,
"learning_rate": 1.9885476276930628e-05,
"loss": 4.0959,
"step": 1125
},
{
"epoch": 0.734296432277117,
"grad_norm": 3.44901180267334,
"learning_rate": 1.9868053865212658e-05,
"loss": 4.1265,
"step": 1150
},
{
"epoch": 0.7502593981961848,
"grad_norm": 3.4085631370544434,
"learning_rate": 1.9849407104146254e-05,
"loss": 4.1237,
"step": 1175
},
{
"epoch": 0.7662223641152526,
"grad_norm": 3.4791786670684814,
"learning_rate": 1.982953830727268e-05,
"loss": 4.0928,
"step": 1200
},
{
"epoch": 0.7821853300343203,
"grad_norm": 3.5405592918395996,
"learning_rate": 1.9808449939753635e-05,
"loss": 4.1219,
"step": 1225
},
{
"epoch": 0.7981482959533881,
"grad_norm": 3.8546345233917236,
"learning_rate": 1.9786144618065414e-05,
"loss": 4.112,
"step": 1250
},
{
"epoch": 0.8141112618724559,
"grad_norm": 3.500869035720825,
"learning_rate": 1.976262510967428e-05,
"loss": 4.1272,
"step": 1275
},
{
"epoch": 0.8300742277915236,
"grad_norm": 3.4084880352020264,
"learning_rate": 1.973789433269308e-05,
"loss": 4.1496,
"step": 1300
},
{
"epoch": 0.8460371937105914,
"grad_norm": 3.3561558723449707,
"learning_rate": 1.97119553555192e-05,
"loss": 4.0731,
"step": 1325
},
{
"epoch": 0.8620001596296591,
"grad_norm": 3.4307825565338135,
"learning_rate": 1.9684811396453857e-05,
"loss": 4.1499,
"step": 1350
},
{
"epoch": 0.8779631255487269,
"grad_norm": 3.2400906085968018,
"learning_rate": 1.9656465823302806e-05,
"loss": 4.1142,
"step": 1375
},
{
"epoch": 0.8939260914677947,
"grad_norm": 3.5277132987976074,
"learning_rate": 1.962692215295849e-05,
"loss": 4.138,
"step": 1400
},
{
"epoch": 0.9098890573868624,
"grad_norm": 2.917336940765381,
"learning_rate": 1.959618405096368e-05,
"loss": 4.1087,
"step": 1425
},
{
"epoch": 0.9258520233059302,
"grad_norm": 3.4995276927948,
"learning_rate": 1.956425533105669e-05,
"loss": 4.129,
"step": 1450
},
{
"epoch": 0.941814989224998,
"grad_norm": 3.203911304473877,
"learning_rate": 1.953113995469821e-05,
"loss": 4.0769,
"step": 1475
},
{
"epoch": 0.9577779551440657,
"grad_norm": 3.1379663944244385,
"learning_rate": 1.949684203057978e-05,
"loss": 4.0667,
"step": 1500
},
{
"epoch": 0.9577779551440657,
"eval_loss": 3.9897916316986084,
"eval_runtime": 57.0506,
"eval_samples_per_second": 54.899,
"eval_steps_per_second": 27.449,
"step": 1500
},
{
"epoch": 0.9737409210631335,
"grad_norm": 3.1570990085601807,
"learning_rate": 1.9461365814114032e-05,
"loss": 4.08,
"step": 1525
},
{
"epoch": 0.9897038869822012,
"grad_norm": 3.5567002296447754,
"learning_rate": 1.9424715706906703e-05,
"loss": 4.0888,
"step": 1550
},
{
"epoch": 1.0051081490941016,
"grad_norm": 2.8667120933532715,
"learning_rate": 1.938689625621052e-05,
"loss": 4.1116,
"step": 1575
},
{
"epoch": 1.0210711150131695,
"grad_norm": 2.864108085632324,
"learning_rate": 1.9347912154361022e-05,
"loss": 4.1135,
"step": 1600
},
{
"epoch": 1.0370340809322371,
"grad_norm": 3.1839237213134766,
"learning_rate": 1.9307768238194363e-05,
"loss": 4.0433,
"step": 1625
},
{
"epoch": 1.052997046851305,
"grad_norm": 3.019927740097046,
"learning_rate": 1.9266469488447198e-05,
"loss": 4.094,
"step": 1650
},
{
"epoch": 1.0689600127703727,
"grad_norm": 3.2604196071624756,
"learning_rate": 1.9224021029138714e-05,
"loss": 4.0495,
"step": 1675
},
{
"epoch": 1.0849229786894405,
"grad_norm": 3.479423761367798,
"learning_rate": 1.9180428126934877e-05,
"loss": 4.0804,
"step": 1700
},
{
"epoch": 1.1008859446085082,
"grad_norm": 3.250142812728882,
"learning_rate": 1.9135696190495002e-05,
"loss": 4.0409,
"step": 1725
},
{
"epoch": 1.116848910527576,
"grad_norm": 3.3738601207733154,
"learning_rate": 1.9089830769800673e-05,
"loss": 4.0571,
"step": 1750
},
{
"epoch": 1.1328118764466437,
"grad_norm": 3.4340453147888184,
"learning_rate": 1.904283755546716e-05,
"loss": 4.0253,
"step": 1775
},
{
"epoch": 1.1487748423657116,
"grad_norm": 3.3150882720947266,
"learning_rate": 1.8994722378037343e-05,
"loss": 4.0714,
"step": 1800
},
{
"epoch": 1.1647378082847792,
"grad_norm": 3.114799737930298,
"learning_rate": 1.8945491207258356e-05,
"loss": 4.07,
"step": 1825
},
{
"epoch": 1.1807007742038471,
"grad_norm": 2.940286636352539,
"learning_rate": 1.8895150151340855e-05,
"loss": 4.0395,
"step": 1850
},
{
"epoch": 1.1966637401229148,
"grad_norm": 3.1590492725372314,
"learning_rate": 1.88437054562012e-05,
"loss": 4.0437,
"step": 1875
},
{
"epoch": 1.2126267060419826,
"grad_norm": 3.4203438758850098,
"learning_rate": 1.879116350468648e-05,
"loss": 4.0774,
"step": 1900
},
{
"epoch": 1.2285896719610503,
"grad_norm": 3.203162908554077,
"learning_rate": 1.8737530815782615e-05,
"loss": 4.0435,
"step": 1925
},
{
"epoch": 1.2445526378801182,
"grad_norm": 2.8159914016723633,
"learning_rate": 1.8682814043805496e-05,
"loss": 4.0611,
"step": 1950
},
{
"epoch": 1.2605156037991858,
"grad_norm": 3.0390403270721436,
"learning_rate": 1.8627019977575397e-05,
"loss": 4.0507,
"step": 1975
},
{
"epoch": 1.2764785697182537,
"grad_norm": 2.961411476135254,
"learning_rate": 1.857015553957466e-05,
"loss": 4.0905,
"step": 2000
},
{
"epoch": 1.2764785697182537,
"eval_loss": 3.9738128185272217,
"eval_runtime": 56.7214,
"eval_samples_per_second": 55.217,
"eval_steps_per_second": 27.609,
"step": 2000
},
{
"epoch": 1.2924415356373213,
"grad_norm": 2.742133378982544,
"learning_rate": 1.851222778508881e-05,
"loss": 4.0503,
"step": 2025
},
{
"epoch": 1.3084045015563892,
"grad_norm": 2.9378607273101807,
"learning_rate": 1.8453243901331194e-05,
"loss": 4.0667,
"step": 2050
},
{
"epoch": 1.3243674674754569,
"grad_norm": 2.7720441818237305,
"learning_rate": 1.8393211206551256e-05,
"loss": 4.0483,
"step": 2075
},
{
"epoch": 1.3403304333945247,
"grad_norm": 2.855170726776123,
"learning_rate": 1.8332137149126525e-05,
"loss": 4.0447,
"step": 2100
},
{
"epoch": 1.3562933993135924,
"grad_norm": 3.0880863666534424,
"learning_rate": 1.827002930663851e-05,
"loss": 4.0056,
"step": 2125
},
{
"epoch": 1.3722563652326603,
"grad_norm": 3.391598701477051,
"learning_rate": 1.82068953849325e-05,
"loss": 4.0299,
"step": 2150
},
{
"epoch": 1.388219331151728,
"grad_norm": 2.805473566055298,
"learning_rate": 1.8142743217161517e-05,
"loss": 4.0247,
"step": 2175
},
{
"epoch": 1.4041822970707958,
"grad_norm": 3.1002068519592285,
"learning_rate": 1.807758076281442e-05,
"loss": 4.023,
"step": 2200
},
{
"epoch": 1.4201452629898634,
"grad_norm": 3.025351047515869,
"learning_rate": 1.8011416106728363e-05,
"loss": 4.04,
"step": 2225
},
{
"epoch": 1.4361082289089313,
"grad_norm": 2.6007015705108643,
"learning_rate": 1.7944257458085693e-05,
"loss": 4.0736,
"step": 2250
},
{
"epoch": 1.4520711948279992,
"grad_norm": 2.8499741554260254,
"learning_rate": 1.787611314939541e-05,
"loss": 4.0532,
"step": 2275
},
{
"epoch": 1.4680341607470668,
"grad_norm": 2.8128528594970703,
"learning_rate": 1.780699163545936e-05,
"loss": 4.0392,
"step": 2300
},
{
"epoch": 1.4839971266661345,
"grad_norm": 3.1223385334014893,
"learning_rate": 1.7736901492323195e-05,
"loss": 4.0352,
"step": 2325
},
{
"epoch": 1.4999600925852024,
"grad_norm": 3.0269346237182617,
"learning_rate": 1.7665851416212365e-05,
"loss": 4.0201,
"step": 2350
},
{
"epoch": 1.5159230585042702,
"grad_norm": 2.9801249504089355,
"learning_rate": 1.759385022245313e-05,
"loss": 4.0303,
"step": 2375
},
{
"epoch": 1.5318860244233379,
"grad_norm": 2.950418710708618,
"learning_rate": 1.7520906844378834e-05,
"loss": 3.9942,
"step": 2400
},
{
"epoch": 1.5478489903424055,
"grad_norm": 2.954284191131592,
"learning_rate": 1.7447030332221534e-05,
"loss": 4.0157,
"step": 2425
},
{
"epoch": 1.5638119562614734,
"grad_norm": 2.7167422771453857,
"learning_rate": 1.7372229851989115e-05,
"loss": 4.0339,
"step": 2450
},
{
"epoch": 1.5797749221805413,
"grad_norm": 2.749498128890991,
"learning_rate": 1.7296514684328043e-05,
"loss": 4.0505,
"step": 2475
},
{
"epoch": 1.595737888099609,
"grad_norm": 3.0270655155181885,
"learning_rate": 1.7219894223371897e-05,
"loss": 4.008,
"step": 2500
},
{
"epoch": 1.595737888099609,
"eval_loss": 3.956145763397217,
"eval_runtime": 56.6738,
"eval_samples_per_second": 55.264,
"eval_steps_per_second": 27.632,
"step": 2500
},
{
"epoch": 1.6117008540186766,
"grad_norm": 2.99558162689209,
"learning_rate": 1.7142377975575826e-05,
"loss": 4.0452,
"step": 2525
},
{
"epoch": 1.6276638199377444,
"grad_norm": 3.2427620887756348,
"learning_rate": 1.706397555853706e-05,
"loss": 3.9951,
"step": 2550
},
{
"epoch": 1.6436267858568123,
"grad_norm": 2.987531900405884,
"learning_rate": 1.698469669980162e-05,
"loss": 4.0321,
"step": 2575
},
{
"epoch": 1.65958975177588,
"grad_norm": 2.827646255493164,
"learning_rate": 1.690455123565743e-05,
"loss": 4.0141,
"step": 2600
},
{
"epoch": 1.6755527176949476,
"grad_norm": 2.6261708736419678,
"learning_rate": 1.68235491099139e-05,
"loss": 4.021,
"step": 2625
},
{
"epoch": 1.6915156836140155,
"grad_norm": 2.822793960571289,
"learning_rate": 1.6741700372668153e-05,
"loss": 4.0724,
"step": 2650
},
{
"epoch": 1.7074786495330834,
"grad_norm": 2.6139183044433594,
"learning_rate": 1.6659015179058132e-05,
"loss": 4.0169,
"step": 2675
},
{
"epoch": 1.723441615452151,
"grad_norm": 2.580967426300049,
"learning_rate": 1.657550378800259e-05,
"loss": 4.0331,
"step": 2700
},
{
"epoch": 1.7394045813712187,
"grad_norm": 2.6083126068115234,
"learning_rate": 1.6491176560928267e-05,
"loss": 3.9969,
"step": 2725
},
{
"epoch": 1.7553675472902865,
"grad_norm": 2.944607734680176,
"learning_rate": 1.640604396048434e-05,
"loss": 4.0018,
"step": 2750
},
{
"epoch": 1.7713305132093544,
"grad_norm": 2.804600715637207,
"learning_rate": 1.632011654924426e-05,
"loss": 4.0159,
"step": 2775
},
{
"epoch": 1.787293479128422,
"grad_norm": 2.5616648197174072,
"learning_rate": 1.6233404988395272e-05,
"loss": 4.0208,
"step": 2800
},
{
"epoch": 1.8032564450474897,
"grad_norm": 2.834343671798706,
"learning_rate": 1.6145920036415643e-05,
"loss": 4.0495,
"step": 2825
},
{
"epoch": 1.8192194109665576,
"grad_norm": 2.5377800464630127,
"learning_rate": 1.6057672547739833e-05,
"loss": 4.0262,
"step": 2850
},
{
"epoch": 1.8351823768856255,
"grad_norm": 2.576991558074951,
"learning_rate": 1.596867347141177e-05,
"loss": 4.0126,
"step": 2875
},
{
"epoch": 1.8511453428046931,
"grad_norm": 2.5933356285095215,
"learning_rate": 1.587893384972638e-05,
"loss": 4.0411,
"step": 2900
},
{
"epoch": 1.8671083087237608,
"grad_norm": 2.572754144668579,
"learning_rate": 1.5788464816859544e-05,
"loss": 4.042,
"step": 2925
},
{
"epoch": 1.8830712746428286,
"grad_norm": 2.584364175796509,
"learning_rate": 1.5697277597486663e-05,
"loss": 4.0151,
"step": 2950
},
{
"epoch": 1.8990342405618965,
"grad_norm": 2.7844433784484863,
"learning_rate": 1.560538350538998e-05,
"loss": 4.0261,
"step": 2975
},
{
"epoch": 1.9149972064809642,
"grad_norm": 2.6380529403686523,
"learning_rate": 1.551279394205486e-05,
"loss": 4.0071,
"step": 3000
},
{
"epoch": 1.9149972064809642,
"eval_loss": 3.944791078567505,
"eval_runtime": 56.7672,
"eval_samples_per_second": 55.173,
"eval_steps_per_second": 27.586,
"step": 3000
},
{
"epoch": 1.9309601724000318,
"grad_norm": 2.678741216659546,
"learning_rate": 1.5419520395255204e-05,
"loss": 3.9901,
"step": 3025
},
{
"epoch": 1.9469231383190997,
"grad_norm": 2.701401472091675,
"learning_rate": 1.5325574437628107e-05,
"loss": 4.0162,
"step": 3050
},
{
"epoch": 1.9628861042381676,
"grad_norm": 2.727498769760132,
"learning_rate": 1.5230967725238036e-05,
"loss": 3.9851,
"step": 3075
},
{
"epoch": 1.9788490701572352,
"grad_norm": 2.747472047805786,
"learning_rate": 1.5135711996130624e-05,
"loss": 4.0346,
"step": 3100
},
{
"epoch": 1.9948120360763029,
"grad_norm": 2.5219974517822266,
"learning_rate": 1.503981906887634e-05,
"loss": 3.993,
"step": 3125
},
{
"epoch": 2.0102162981882032,
"grad_norm": 2.9478681087493896,
"learning_rate": 1.4943300841104094e-05,
"loss": 4.024,
"step": 3150
},
{
"epoch": 2.026179264107271,
"grad_norm": 2.684865713119507,
"learning_rate": 1.4846169288025092e-05,
"loss": 4.0307,
"step": 3175
},
{
"epoch": 2.042142230026339,
"grad_norm": 2.6168887615203857,
"learning_rate": 1.4748436460947064e-05,
"loss": 3.953,
"step": 3200
},
{
"epoch": 2.058105195945407,
"grad_norm": 2.68648099899292,
"learning_rate": 1.4650114485779e-05,
"loss": 4.0059,
"step": 3225
},
{
"epoch": 2.0740681618644743,
"grad_norm": 3.017493963241577,
"learning_rate": 1.4551215561526692e-05,
"loss": 3.931,
"step": 3250
},
{
"epoch": 2.090031127783542,
"grad_norm": 2.6104273796081543,
"learning_rate": 1.4451751958779165e-05,
"loss": 3.9746,
"step": 3275
},
{
"epoch": 2.10599409370261,
"grad_norm": 2.6534171104431152,
"learning_rate": 1.435173601818625e-05,
"loss": 3.9614,
"step": 3300
},
{
"epoch": 2.121957059621678,
"grad_norm": 2.456097364425659,
"learning_rate": 1.4251180148927439e-05,
"loss": 3.9736,
"step": 3325
},
{
"epoch": 2.1379200255407453,
"grad_norm": 2.6180832386016846,
"learning_rate": 1.4150096827172269e-05,
"loss": 3.9916,
"step": 3350
},
{
"epoch": 2.153882991459813,
"grad_norm": 2.7971746921539307,
"learning_rate": 1.4048498594532369e-05,
"loss": 3.9433,
"step": 3375
},
{
"epoch": 2.169845957378881,
"grad_norm": 3.2284584045410156,
"learning_rate": 1.3946398056505407e-05,
"loss": 3.9567,
"step": 3400
},
{
"epoch": 2.185808923297949,
"grad_norm": 2.5488057136535645,
"learning_rate": 1.3843807880911082e-05,
"loss": 3.9326,
"step": 3425
},
{
"epoch": 2.2017718892170164,
"grad_norm": 2.8564059734344482,
"learning_rate": 1.3740740796319424e-05,
"loss": 3.9668,
"step": 3450
},
{
"epoch": 2.2177348551360843,
"grad_norm": 2.912123441696167,
"learning_rate": 1.3637209590471521e-05,
"loss": 3.98,
"step": 3475
},
{
"epoch": 2.233697821055152,
"grad_norm": 2.7654449939727783,
"learning_rate": 1.3533227108692916e-05,
"loss": 3.9696,
"step": 3500
},
{
"epoch": 2.233697821055152,
"eval_loss": 3.9376115798950195,
"eval_runtime": 56.7367,
"eval_samples_per_second": 55.202,
"eval_steps_per_second": 27.601,
"step": 3500
},
{
"epoch": 2.24966078697422,
"grad_norm": 2.5010123252868652,
"learning_rate": 1.3428806252299877e-05,
"loss": 3.9693,
"step": 3525
},
{
"epoch": 2.2656237528932874,
"grad_norm": 2.5059149265289307,
"learning_rate": 1.3323959976998689e-05,
"loss": 3.9357,
"step": 3550
},
{
"epoch": 2.2815867188123553,
"grad_norm": 2.5838544368743896,
"learning_rate": 1.3218701291278215e-05,
"loss": 3.9711,
"step": 3575
},
{
"epoch": 2.297549684731423,
"grad_norm": 2.6254265308380127,
"learning_rate": 1.3113043254795922e-05,
"loss": 3.9931,
"step": 3600
},
{
"epoch": 2.313512650650491,
"grad_norm": 2.6003894805908203,
"learning_rate": 1.300699897675752e-05,
"loss": 3.988,
"step": 3625
},
{
"epoch": 2.3294756165695585,
"grad_norm": 2.9631710052490234,
"learning_rate": 1.2900581614290495e-05,
"loss": 3.9856,
"step": 3650
},
{
"epoch": 2.3454385824886264,
"grad_norm": 2.8343353271484375,
"learning_rate": 1.2793804370811667e-05,
"loss": 3.9814,
"step": 3675
},
{
"epoch": 2.3614015484076942,
"grad_norm": 2.731628179550171,
"learning_rate": 1.2686680494389018e-05,
"loss": 3.9898,
"step": 3700
},
{
"epoch": 2.377364514326762,
"grad_norm": 2.481356382369995,
"learning_rate": 1.2579223276097986e-05,
"loss": 4.0092,
"step": 3725
},
{
"epoch": 2.3933274802458295,
"grad_norm": 2.6943631172180176,
"learning_rate": 1.2471446048372401e-05,
"loss": 3.9694,
"step": 3750
},
{
"epoch": 2.4092904461648974,
"grad_norm": 2.6562039852142334,
"learning_rate": 1.2363362183350309e-05,
"loss": 3.9859,
"step": 3775
},
{
"epoch": 2.4252534120839653,
"grad_norm": 2.579770565032959,
"learning_rate": 1.2254985091214867e-05,
"loss": 3.9835,
"step": 3800
},
{
"epoch": 2.441216378003033,
"grad_norm": 2.8043432235717773,
"learning_rate": 1.2146328218530503e-05,
"loss": 3.9558,
"step": 3825
},
{
"epoch": 2.4571793439221006,
"grad_norm": 3.0811917781829834,
"learning_rate": 1.2037405046574598e-05,
"loss": 4.0002,
"step": 3850
},
{
"epoch": 2.4731423098411685,
"grad_norm": 2.6078948974609375,
"learning_rate": 1.1928229089664802e-05,
"loss": 3.9368,
"step": 3875
},
{
"epoch": 2.4891052757602363,
"grad_norm": 2.4601407051086426,
"learning_rate": 1.1818813893482321e-05,
"loss": 3.9381,
"step": 3900
},
{
"epoch": 2.5050682416793038,
"grad_norm": 2.945693016052246,
"learning_rate": 1.1709173033391247e-05,
"loss": 3.9915,
"step": 3925
},
{
"epoch": 2.5210312075983716,
"grad_norm": 2.4278717041015625,
"learning_rate": 1.1599320112754258e-05,
"loss": 3.9684,
"step": 3950
},
{
"epoch": 2.5369941735174395,
"grad_norm": 2.7524194717407227,
"learning_rate": 1.1489268761244804e-05,
"loss": 3.9619,
"step": 3975
},
{
"epoch": 2.5529571394365074,
"grad_norm": 2.7142021656036377,
"learning_rate": 1.1379032633156062e-05,
"loss": 3.9984,
"step": 4000
},
{
"epoch": 2.5529571394365074,
"eval_loss": 3.928858757019043,
"eval_runtime": 56.5867,
"eval_samples_per_second": 55.349,
"eval_steps_per_second": 27.674,
"step": 4000
},
{
"epoch": 2.5689201053555752,
"grad_norm": 2.842582941055298,
"learning_rate": 1.1268625405706804e-05,
"loss": 3.9872,
"step": 4025
},
{
"epoch": 2.5848830712746427,
"grad_norm": 2.6133430004119873,
"learning_rate": 1.1158060777344448e-05,
"loss": 3.9276,
"step": 4050
},
{
"epoch": 2.6008460371937105,
"grad_norm": 2.502631664276123,
"learning_rate": 1.1047352466045458e-05,
"loss": 3.9486,
"step": 4075
},
{
"epoch": 2.6168090031127784,
"grad_norm": 2.6683413982391357,
"learning_rate": 1.0936514207613336e-05,
"loss": 3.9574,
"step": 4100
},
{
"epoch": 2.632771969031846,
"grad_norm": 2.6480071544647217,
"learning_rate": 1.0825559753974385e-05,
"loss": 3.9646,
"step": 4125
},
{
"epoch": 2.6487349349509137,
"grad_norm": 2.713996171951294,
"learning_rate": 1.0714502871471475e-05,
"loss": 3.9898,
"step": 4150
},
{
"epoch": 2.6646979008699816,
"grad_norm": 2.5960659980773926,
"learning_rate": 1.0603357339156044e-05,
"loss": 3.9893,
"step": 4175
},
{
"epoch": 2.6806608667890495,
"grad_norm": 2.748525381088257,
"learning_rate": 1.0492136947078474e-05,
"loss": 3.931,
"step": 4200
},
{
"epoch": 2.6966238327081173,
"grad_norm": 2.875739574432373,
"learning_rate": 1.038085549457717e-05,
"loss": 3.9918,
"step": 4225
},
{
"epoch": 2.7125867986271848,
"grad_norm": 2.7640604972839355,
"learning_rate": 1.0269526788566408e-05,
"loss": 3.9917,
"step": 4250
},
{
"epoch": 2.7285497645462526,
"grad_norm": 2.540008068084717,
"learning_rate": 1.0158164641823312e-05,
"loss": 3.9956,
"step": 4275
},
{
"epoch": 2.7445127304653205,
"grad_norm": 3.028472661972046,
"learning_rate": 1.004678287127406e-05,
"loss": 3.9598,
"step": 4300
},
{
"epoch": 2.760475696384388,
"grad_norm": 2.5849356651306152,
"learning_rate": 9.935395296279605e-06,
"loss": 3.905,
"step": 4325
},
{
"epoch": 2.776438662303456,
"grad_norm": 2.5396087169647217,
"learning_rate": 9.824015736921058e-06,
"loss": 3.9343,
"step": 4350
},
{
"epoch": 2.7924016282225237,
"grad_norm": 2.9541361331939697,
"learning_rate": 9.712658012285015e-06,
"loss": 3.9106,
"step": 4375
},
{
"epoch": 2.8083645941415916,
"grad_norm": 2.7522456645965576,
"learning_rate": 9.601335938749002e-06,
"loss": 3.9761,
"step": 4400
},
{
"epoch": 2.8243275600606594,
"grad_norm": 2.7509615421295166,
"learning_rate": 9.490063328267235e-06,
"loss": 3.9299,
"step": 4425
},
{
"epoch": 2.840290525979727,
"grad_norm": 2.6110477447509766,
"learning_rate": 9.378853986656951e-06,
"loss": 3.9856,
"step": 4450
},
{
"epoch": 2.8562534918987947,
"grad_norm": 2.560284376144409,
"learning_rate": 9.267721711885486e-06,
"loss": 4.0167,
"step": 4475
},
{
"epoch": 2.8722164578178626,
"grad_norm": 2.499309539794922,
"learning_rate": 9.15668029235835e-06,
"loss": 3.9279,
"step": 4500
},
{
"epoch": 2.8722164578178626,
"eval_loss": 3.921231746673584,
"eval_runtime": 56.6026,
"eval_samples_per_second": 55.333,
"eval_steps_per_second": 27.667,
"step": 4500
},
{
"epoch": 2.8881794237369305,
"grad_norm": 2.4418981075286865,
"learning_rate": 9.045743505208442e-06,
"loss": 3.9964,
"step": 4525
},
{
"epoch": 2.9041423896559984,
"grad_norm": 2.7410218715667725,
"learning_rate": 8.934925114586729e-06,
"loss": 3.9709,
"step": 4550
},
{
"epoch": 2.920105355575066,
"grad_norm": 2.3856561183929443,
"learning_rate": 8.824238869954462e-06,
"loss": 3.9915,
"step": 4575
},
{
"epoch": 2.9360683214941337,
"grad_norm": 2.4345028400421143,
"learning_rate": 8.713698504377294e-06,
"loss": 3.9775,
"step": 4600
},
{
"epoch": 2.9520312874132015,
"grad_norm": 2.6661508083343506,
"learning_rate": 8.603317732821355e-06,
"loss": 3.9766,
"step": 4625
},
{
"epoch": 2.967994253332269,
"grad_norm": 2.3309764862060547,
"learning_rate": 8.493110250451628e-06,
"loss": 3.9815,
"step": 4650
},
{
"epoch": 2.983957219251337,
"grad_norm": 2.508256196975708,
"learning_rate": 8.38308973093275e-06,
"loss": 4.0027,
"step": 4675
},
{
"epoch": 2.9999201851704047,
"grad_norm": 2.6874983310699463,
"learning_rate": 8.273269824732516e-06,
"loss": 3.9101,
"step": 4700
},
{
"epoch": 3.015324447282305,
"grad_norm": 2.7417962551116943,
"learning_rate": 8.163664157428205e-06,
"loss": 3.9155,
"step": 4725
},
{
"epoch": 3.031287413201373,
"grad_norm": 2.679556131362915,
"learning_rate": 8.054286328016055e-06,
"loss": 3.9281,
"step": 4750
},
{
"epoch": 3.0472503791204404,
"grad_norm": 2.6242339611053467,
"learning_rate": 7.945149907223985e-06,
"loss": 3.8998,
"step": 4775
},
{
"epoch": 3.0632133450395083,
"grad_norm": 2.893059015274048,
"learning_rate": 7.836268435827875e-06,
"loss": 3.9077,
"step": 4800
},
{
"epoch": 3.079176310958576,
"grad_norm": 2.602938652038574,
"learning_rate": 7.727655422971514e-06,
"loss": 3.936,
"step": 4825
},
{
"epoch": 3.095139276877644,
"grad_norm": 2.4572951793670654,
"learning_rate": 7.619324344490488e-06,
"loss": 3.9821,
"step": 4850
},
{
"epoch": 3.1111022427967114,
"grad_norm": 2.6016576290130615,
"learning_rate": 7.511288641240227e-06,
"loss": 3.967,
"step": 4875
},
{
"epoch": 3.1270652087157793,
"grad_norm": 2.4637320041656494,
"learning_rate": 7.4035617174283646e-06,
"loss": 3.8937,
"step": 4900
},
{
"epoch": 3.143028174634847,
"grad_norm": 2.449878215789795,
"learning_rate": 7.2961569389516305e-06,
"loss": 3.8913,
"step": 4925
},
{
"epoch": 3.158991140553915,
"grad_norm": 2.3982160091400146,
"learning_rate": 7.189087631737551e-06,
"loss": 3.9394,
"step": 4950
},
{
"epoch": 3.1749541064729825,
"grad_norm": 2.531035900115967,
"learning_rate": 7.082367080091037e-06,
"loss": 3.9346,
"step": 4975
},
{
"epoch": 3.1909170723920504,
"grad_norm": 2.7257885932922363,
"learning_rate": 6.976008525046211e-06,
"loss": 3.9569,
"step": 5000
},
{
"epoch": 3.1909170723920504,
"eval_loss": 3.918074369430542,
"eval_runtime": 56.7976,
"eval_samples_per_second": 55.143,
"eval_steps_per_second": 27.572,
"step": 5000
},
{
"epoch": 3.2068800383111182,
"grad_norm": 2.6535191535949707,
"learning_rate": 6.870025162723538e-06,
"loss": 3.9732,
"step": 5025
},
{
"epoch": 3.222843004230186,
"grad_norm": 2.4435272216796875,
"learning_rate": 6.764430142692564e-06,
"loss": 3.946,
"step": 5050
},
{
"epoch": 3.2388059701492535,
"grad_norm": 2.4859092235565186,
"learning_rate": 6.659236566340422e-06,
"loss": 3.9508,
"step": 5075
},
{
"epoch": 3.2547689360683214,
"grad_norm": 2.4849720001220703,
"learning_rate": 6.554457485246332e-06,
"loss": 3.9247,
"step": 5100
},
{
"epoch": 3.2707319019873893,
"grad_norm": 2.3787262439727783,
"learning_rate": 6.4501058995622315e-06,
"loss": 3.9676,
"step": 5125
},
{
"epoch": 3.286694867906457,
"grad_norm": 2.625030279159546,
"learning_rate": 6.346194756399855e-06,
"loss": 3.948,
"step": 5150
},
{
"epoch": 3.302657833825525,
"grad_norm": 2.892484426498413,
"learning_rate": 6.242736948224333e-06,
"loss": 3.9499,
"step": 5175
},
{
"epoch": 3.3186207997445925,
"grad_norm": 2.6197938919067383,
"learning_rate": 6.139745311254621e-06,
"loss": 3.8792,
"step": 5200
},
{
"epoch": 3.3345837656636603,
"grad_norm": 2.62258243560791,
"learning_rate": 6.037232623870869e-06,
"loss": 3.9258,
"step": 5225
},
{
"epoch": 3.350546731582728,
"grad_norm": 2.7659857273101807,
"learning_rate": 5.9352116050289795e-06,
"loss": 3.9113,
"step": 5250
},
{
"epoch": 3.3665096975017956,
"grad_norm": 2.5385048389434814,
"learning_rate": 5.833694912682553e-06,
"loss": 3.9307,
"step": 5275
},
{
"epoch": 3.3824726634208635,
"grad_norm": 2.4550514221191406,
"learning_rate": 5.732695142212392e-06,
"loss": 3.9651,
"step": 5300
},
{
"epoch": 3.3984356293399314,
"grad_norm": 2.566981792449951,
"learning_rate": 5.632224824863741e-06,
"loss": 3.9423,
"step": 5325
},
{
"epoch": 3.4143985952589992,
"grad_norm": 2.4866788387298584,
"learning_rate": 5.5322964261915395e-06,
"loss": 3.959,
"step": 5350
},
{
"epoch": 3.430361561178067,
"grad_norm": 2.6524391174316406,
"learning_rate": 5.432922344513785e-06,
"loss": 3.932,
"step": 5375
},
{
"epoch": 3.4463245270971345,
"grad_norm": 2.34717059135437,
"learning_rate": 5.33411490937324e-06,
"loss": 3.932,
"step": 5400
},
{
"epoch": 3.4622874930162024,
"grad_norm": 2.664670705795288,
"learning_rate": 5.2358863800076956e-06,
"loss": 3.9499,
"step": 5425
},
{
"epoch": 3.4782504589352703,
"grad_norm": 2.7321012020111084,
"learning_rate": 5.13824894382893e-06,
"loss": 3.9093,
"step": 5450
},
{
"epoch": 3.4942134248543377,
"grad_norm": 2.481905221939087,
"learning_rate": 5.041214714910599e-06,
"loss": 3.8889,
"step": 5475
},
{
"epoch": 3.5101763907734056,
"grad_norm": 2.6542272567749023,
"learning_rate": 4.94479573248522e-06,
"loss": 3.9598,
"step": 5500
},
{
"epoch": 3.5101763907734056,
"eval_loss": 3.9154069423675537,
"eval_runtime": 56.6262,
"eval_samples_per_second": 55.31,
"eval_steps_per_second": 27.655,
"step": 5500
},
{
"epoch": 3.5261393566924735,
"grad_norm": 2.672531843185425,
"learning_rate": 4.849003959450432e-06,
"loss": 3.9798,
"step": 5525
},
{
"epoch": 3.5421023226115413,
"grad_norm": 2.5921006202697754,
"learning_rate": 4.753851280884745e-06,
"loss": 3.9279,
"step": 5550
},
{
"epoch": 3.558065288530609,
"grad_norm": 2.5902962684631348,
"learning_rate": 4.659349502572923e-06,
"loss": 3.9163,
"step": 5575
},
{
"epoch": 3.5740282544496766,
"grad_norm": 2.5092830657958984,
"learning_rate": 4.565510349541227e-06,
"loss": 3.9075,
"step": 5600
},
{
"epoch": 3.5899912203687445,
"grad_norm": 2.8172504901885986,
"learning_rate": 4.472345464602664e-06,
"loss": 3.8996,
"step": 5625
},
{
"epoch": 3.6059541862878124,
"grad_norm": 2.6843042373657227,
"learning_rate": 4.379866406912429e-06,
"loss": 3.9726,
"step": 5650
},
{
"epoch": 3.62191715220688,
"grad_norm": 2.3186111450195312,
"learning_rate": 4.28808465053376e-06,
"loss": 3.9418,
"step": 5675
},
{
"epoch": 3.6378801181259477,
"grad_norm": 2.6534008979797363,
"learning_rate": 4.197011583014312e-06,
"loss": 3.9505,
"step": 5700
},
{
"epoch": 3.6538430840450156,
"grad_norm": 2.683316230773926,
"learning_rate": 4.106658503973273e-06,
"loss": 3.9261,
"step": 5725
},
{
"epoch": 3.6698060499640834,
"grad_norm": 2.754112958908081,
"learning_rate": 4.017036623699415e-06,
"loss": 3.8915,
"step": 5750
},
{
"epoch": 3.6857690158831513,
"grad_norm": 2.441436767578125,
"learning_rate": 3.9281570617602145e-06,
"loss": 3.9543,
"step": 5775
},
{
"epoch": 3.7017319818022187,
"grad_norm": 2.5779573917388916,
"learning_rate": 3.840030845622196e-06,
"loss": 3.921,
"step": 5800
},
{
"epoch": 3.7176949477212866,
"grad_norm": 2.3267993927001953,
"learning_rate": 3.752668909282762e-06,
"loss": 3.9401,
"step": 5825
},
{
"epoch": 3.7336579136403545,
"grad_norm": 3.016676902770996,
"learning_rate": 3.6660820919135774e-06,
"loss": 3.9456,
"step": 5850
},
{
"epoch": 3.749620879559422,
"grad_norm": 2.6449170112609863,
"learning_rate": 3.580281136515732e-06,
"loss": 3.9071,
"step": 5875
},
{
"epoch": 3.76558384547849,
"grad_norm": 2.576220750808716,
"learning_rate": 3.495276688586835e-06,
"loss": 3.9559,
"step": 5900
},
{
"epoch": 3.7815468113975577,
"grad_norm": 2.5338728427886963,
"learning_rate": 3.4110792948002093e-06,
"loss": 3.9402,
"step": 5925
},
{
"epoch": 3.7975097773166255,
"grad_norm": 2.4438517093658447,
"learning_rate": 3.327699401696339e-06,
"loss": 3.9362,
"step": 5950
},
{
"epoch": 3.8134727432356934,
"grad_norm": 2.9886441230773926,
"learning_rate": 3.245147354386753e-06,
"loss": 3.9201,
"step": 5975
},
{
"epoch": 3.829435709154761,
"grad_norm": 2.4011476039886475,
"learning_rate": 3.163433395270481e-06,
"loss": 3.9403,
"step": 6000
},
{
"epoch": 3.829435709154761,
"eval_loss": 3.9126319885253906,
"eval_runtime": 95.1462,
"eval_samples_per_second": 32.918,
"eval_steps_per_second": 16.459,
"step": 6000
},
{
"epoch": 3.8453986750738287,
"grad_norm": 2.6228878498077393,
"learning_rate": 3.082567662763264e-06,
"loss": 3.9585,
"step": 6025
},
{
"epoch": 3.8613616409928966,
"grad_norm": 2.369616985321045,
"learning_rate": 3.0025601900396408e-06,
"loss": 3.8973,
"step": 6050
},
{
"epoch": 3.877324606911964,
"grad_norm": 2.92931866645813,
"learning_rate": 2.923420903788151e-06,
"loss": 3.9259,
"step": 6075
},
{
"epoch": 3.893287572831032,
"grad_norm": 2.371095657348633,
"learning_rate": 2.8451596229796763e-06,
"loss": 3.9598,
"step": 6100
},
{
"epoch": 3.9092505387500998,
"grad_norm": 2.415902853012085,
"learning_rate": 2.767786057649183e-06,
"loss": 3.9118,
"step": 6125
},
{
"epoch": 3.9252135046691676,
"grad_norm": 2.60103440284729,
"learning_rate": 2.6913098076909994e-06,
"loss": 3.9444,
"step": 6150
},
{
"epoch": 3.9411764705882355,
"grad_norm": 2.6399271488189697,
"learning_rate": 2.615740361667728e-06,
"loss": 3.9276,
"step": 6175
},
{
"epoch": 3.957139436507303,
"grad_norm": 2.461277723312378,
"learning_rate": 2.541087095632965e-06,
"loss": 3.9576,
"step": 6200
},
{
"epoch": 3.973102402426371,
"grad_norm": 2.5564517974853516,
"learning_rate": 2.467359271968016e-06,
"loss": 3.9056,
"step": 6225
},
{
"epoch": 3.9890653683454387,
"grad_norm": 2.6077992916107178,
"learning_rate": 2.394566038232682e-06,
"loss": 3.9513,
"step": 6250
},
{
"epoch": 4.004469630457339,
"grad_norm": 2.6047403812408447,
"learning_rate": 2.3227164260303148e-06,
"loss": 3.9438,
"step": 6275
},
{
"epoch": 4.0204325963764065,
"grad_norm": 2.5610029697418213,
"learning_rate": 2.251819349887224e-06,
"loss": 3.9488,
"step": 6300
},
{
"epoch": 4.036395562295475,
"grad_norm": 2.5663247108459473,
"learning_rate": 2.181883606146662e-06,
"loss": 3.9083,
"step": 6325
},
{
"epoch": 4.052358528214542,
"grad_norm": 2.7286441326141357,
"learning_rate": 2.1129178718774222e-06,
"loss": 3.9205,
"step": 6350
},
{
"epoch": 4.06832149413361,
"grad_norm": 2.6049561500549316,
"learning_rate": 2.044930703797272e-06,
"loss": 3.9414,
"step": 6375
},
{
"epoch": 4.084284460052678,
"grad_norm": 2.407318353652954,
"learning_rate": 1.9779305372112943e-06,
"loss": 3.9332,
"step": 6400
},
{
"epoch": 4.100247425971745,
"grad_norm": 2.5431180000305176,
"learning_rate": 1.911925684965309e-06,
"loss": 3.9233,
"step": 6425
},
{
"epoch": 4.116210391890814,
"grad_norm": 2.905609607696533,
"learning_rate": 1.846924336414474e-06,
"loss": 3.9074,
"step": 6450
},
{
"epoch": 4.132173357809881,
"grad_norm": 2.6817057132720947,
"learning_rate": 1.782934556407223e-06,
"loss": 3.8939,
"step": 6475
},
{
"epoch": 4.148136323728949,
"grad_norm": 2.507965564727783,
"learning_rate": 1.7199642842846387e-06,
"loss": 3.908,
"step": 6500
},
{
"epoch": 4.148136323728949,
"eval_loss": 3.9117023944854736,
"eval_runtime": 87.5275,
"eval_samples_per_second": 35.783,
"eval_steps_per_second": 17.892,
"step": 6500
},
{
"epoch": 4.164099289648017,
"grad_norm": 2.4522206783294678,
"learning_rate": 1.6580213328954054e-06,
"loss": 3.9176,
"step": 6525
},
{
"epoch": 4.180062255567084,
"grad_norm": 2.804759979248047,
"learning_rate": 1.5971133876264445e-06,
"loss": 3.8951,
"step": 6550
},
{
"epoch": 4.196025221486152,
"grad_norm": 2.3992278575897217,
"learning_rate": 1.5372480054493921e-06,
"loss": 3.8637,
"step": 6575
},
{
"epoch": 4.21198818740522,
"grad_norm": 2.68453049659729,
"learning_rate": 1.478432613982973e-06,
"loss": 3.9023,
"step": 6600
},
{
"epoch": 4.2279511533242875,
"grad_norm": 2.6714088916778564,
"learning_rate": 1.4206745105714415e-06,
"loss": 3.9102,
"step": 6625
},
{
"epoch": 4.243914119243356,
"grad_norm": 2.663738965988159,
"learning_rate": 1.363980861379196e-06,
"loss": 3.939,
"step": 6650
},
{
"epoch": 4.259877085162423,
"grad_norm": 2.4409677982330322,
"learning_rate": 1.3083587005016563e-06,
"loss": 3.8866,
"step": 6675
},
{
"epoch": 4.275840051081491,
"grad_norm": 2.3599727153778076,
"learning_rate": 1.253814929092515e-06,
"loss": 3.9531,
"step": 6700
},
{
"epoch": 4.291803017000559,
"grad_norm": 2.31123948097229,
"learning_rate": 1.200356314507517e-06,
"loss": 3.9207,
"step": 6725
},
{
"epoch": 4.307765982919626,
"grad_norm": 2.5220460891723633,
"learning_rate": 1.147989489464807e-06,
"loss": 3.9423,
"step": 6750
},
{
"epoch": 4.323728948838694,
"grad_norm": 2.528290033340454,
"learning_rate": 1.096720951222e-06,
"loss": 3.9337,
"step": 6775
},
{
"epoch": 4.339691914757762,
"grad_norm": 2.48093843460083,
"learning_rate": 1.0465570607700526e-06,
"loss": 3.9179,
"step": 6800
},
{
"epoch": 4.35565488067683,
"grad_norm": 2.3196213245391846,
"learning_rate": 9.97504042044042e-07,
"loss": 3.867,
"step": 6825
},
{
"epoch": 4.371617846595898,
"grad_norm": 2.3795883655548096,
"learning_rate": 9.495679811509483e-07,
"loss": 3.9262,
"step": 6850
},
{
"epoch": 4.387580812514965,
"grad_norm": 2.6998538970947266,
"learning_rate": 9.027548256145402e-07,
"loss": 3.9615,
"step": 6875
},
{
"epoch": 4.403543778434033,
"grad_norm": 2.639824867248535,
"learning_rate": 8.570703836374561e-07,
"loss": 3.9086,
"step": 6900
},
{
"epoch": 4.419506744353101,
"grad_norm": 2.8238446712493896,
"learning_rate": 8.125203233805634e-07,
"loss": 3.9274,
"step": 6925
},
{
"epoch": 4.4354697102721685,
"grad_norm": 2.5089192390441895,
"learning_rate": 7.691101722597038e-07,
"loss": 3.9387,
"step": 6950
},
{
"epoch": 4.451432676191236,
"grad_norm": 2.6850743293762207,
"learning_rate": 7.268453162598899e-07,
"loss": 3.9135,
"step": 6975
},
{
"epoch": 4.467395642110304,
"grad_norm": 2.6673882007598877,
"learning_rate": 6.857309992670625e-07,
"loss": 3.9344,
"step": 7000
},
{
"epoch": 4.467395642110304,
"eval_loss": 3.9108645915985107,
"eval_runtime": 88.1557,
"eval_samples_per_second": 35.528,
"eval_steps_per_second": 17.764,
"step": 7000
},
{
"epoch": 4.483358608029372,
"grad_norm": 2.8991503715515137,
"learning_rate": 6.457723224174606e-07,
"loss": 3.9679,
"step": 7025
},
{
"epoch": 4.49932157394844,
"grad_norm": 2.431222915649414,
"learning_rate": 6.069742434647286e-07,
"loss": 3.8838,
"step": 7050
},
{
"epoch": 4.515284539867507,
"grad_norm": 2.5432920455932617,
"learning_rate": 5.693415761647825e-07,
"loss": 3.899,
"step": 7075
},
{
"epoch": 4.531247505786575,
"grad_norm": 2.439650535583496,
"learning_rate": 5.328789896785635e-07,
"loss": 3.916,
"step": 7100
},
{
"epoch": 4.547210471705643,
"grad_norm": 2.2136306762695312,
"learning_rate": 4.97591007992726e-07,
"loss": 3.8504,
"step": 7125
},
{
"epoch": 4.563173437624711,
"grad_norm": 2.457735061645508,
"learning_rate": 4.6348200935834586e-07,
"loss": 3.9182,
"step": 7150
},
{
"epoch": 4.579136403543778,
"grad_norm": 2.562631368637085,
"learning_rate": 4.305562257476792e-07,
"loss": 3.9066,
"step": 7175
},
{
"epoch": 4.595099369462846,
"grad_norm": 2.6716058254241943,
"learning_rate": 3.988177423291195e-07,
"loss": 3.9478,
"step": 7200
},
{
"epoch": 4.611062335381914,
"grad_norm": 2.4732768535614014,
"learning_rate": 3.6827049696032233e-07,
"loss": 3.8824,
"step": 7225
},
{
"epoch": 4.627025301300982,
"grad_norm": 2.6977033615112305,
"learning_rate": 3.3891827969964373e-07,
"loss": 3.9434,
"step": 7250
},
{
"epoch": 4.6429882672200495,
"grad_norm": 2.5639913082122803,
"learning_rate": 3.107647323358842e-07,
"loss": 3.9115,
"step": 7275
},
{
"epoch": 4.658951233139117,
"grad_norm": 2.6874756813049316,
"learning_rate": 2.8381334793645466e-07,
"loss": 3.9292,
"step": 7300
},
{
"epoch": 4.674914199058185,
"grad_norm": 2.7557287216186523,
"learning_rate": 2.5806747041398403e-07,
"loss": 3.9431,
"step": 7325
},
{
"epoch": 4.690877164977253,
"grad_norm": 2.393786907196045,
"learning_rate": 2.3353029411142926e-07,
"loss": 3.8994,
"step": 7350
},
{
"epoch": 4.706840130896321,
"grad_norm": 2.4166479110717773,
"learning_rate": 2.1020486340574964e-07,
"loss": 3.9305,
"step": 7375
},
{
"epoch": 4.7228030968153885,
"grad_norm": 2.5869390964508057,
"learning_rate": 1.8809407233018272e-07,
"loss": 3.8979,
"step": 7400
},
{
"epoch": 4.738766062734456,
"grad_norm": 2.8491010665893555,
"learning_rate": 1.672006642151802e-07,
"loss": 3.9278,
"step": 7425
},
{
"epoch": 4.754729028653524,
"grad_norm": 2.427485704421997,
"learning_rate": 1.4752723134803137e-07,
"loss": 3.9174,
"step": 7450
},
{
"epoch": 4.770691994572592,
"grad_norm": 2.5240070819854736,
"learning_rate": 1.2907621465123587e-07,
"loss": 3.9097,
"step": 7475
},
{
"epoch": 4.786654960491659,
"grad_norm": 2.8449230194091797,
"learning_rate": 1.1184990337965384e-07,
"loss": 3.9645,
"step": 7500
},
{
"epoch": 4.786654960491659,
"eval_loss": 3.910543441772461,
"eval_runtime": 81.6041,
"eval_samples_per_second": 38.38,
"eval_steps_per_second": 19.19,
"step": 7500
},
{
"epoch": 4.802617926410727,
"grad_norm": 2.9600746631622314,
"learning_rate": 9.585043483647194e-08,
"loss": 3.9351,
"step": 7525
},
{
"epoch": 4.818580892329795,
"grad_norm": 2.489225149154663,
"learning_rate": 8.107979410802769e-08,
"loss": 3.9168,
"step": 7550
},
{
"epoch": 4.834543858248862,
"grad_norm": 2.658569574356079,
"learning_rate": 6.753981381751096e-08,
"loss": 3.9121,
"step": 7575
},
{
"epoch": 4.8505068241679306,
"grad_norm": 2.549792766571045,
"learning_rate": 5.523217389758695e-08,
"loss": 3.9393,
"step": 7600
},
{
"epoch": 4.866469790086998,
"grad_norm": 2.651370048522949,
"learning_rate": 4.4158401381966255e-08,
"loss": 3.9882,
"step": 7625
},
{
"epoch": 4.882432756006066,
"grad_norm": 2.443312644958496,
"learning_rate": 3.4319870215945297e-08,
"loss": 3.9325,
"step": 7650
},
{
"epoch": 4.898395721925134,
"grad_norm": 2.498542070388794,
"learning_rate": 2.571780108592936e-08,
"loss": 3.8985,
"step": 7675
},
{
"epoch": 4.914358687844201,
"grad_norm": 2.87577748298645,
"learning_rate": 1.8353261267988198e-08,
"loss": 3.9177,
"step": 7700
},
{
"epoch": 4.9303216537632695,
"grad_norm": 2.4187259674072266,
"learning_rate": 1.2227164495431932e-08,
"loss": 3.8866,
"step": 7725
},
{
"epoch": 4.946284619682337,
"grad_norm": 2.483729839324951,
"learning_rate": 7.3402708454450855e-09,
"loss": 3.9082,
"step": 7750
},
{
"epoch": 4.962247585601405,
"grad_norm": 2.356332778930664,
"learning_rate": 3.6931866447798004e-09,
"loss": 3.8904,
"step": 7775
},
{
"epoch": 4.978210551520473,
"grad_norm": 2.570075273513794,
"learning_rate": 1.2863643945282278e-09,
"loss": 3.9291,
"step": 7800
},
{
"epoch": 4.99417351743954,
"grad_norm": 2.4640989303588867,
"learning_rate": 1.201027139852151e-10,
"loss": 3.9131,
"step": 7825
}
],
"logging_steps": 25,
"max_steps": 7835,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.001
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.636798611456e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}