Files
GPT-2_academic_style_tune/trainer_state.json
ModelHub XC f6b4a7a3fd 初始化项目,由ModelHub XC社区提供模型
Model: Joshua-Sun-CompSci/GPT-2_academic_style_tune
Source: Original Platform
2026-05-13 22:08:20 +08:00

875 lines
21 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.20870265914585,
"eval_steps": 500,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010072522159548751,
"grad_norm": 2.187725782394409,
"learning_rate": 4.975322320709106e-05,
"loss": 3.892,
"step": 100
},
{
"epoch": 0.020145044319097503,
"grad_norm": 1.7743529081344604,
"learning_rate": 4.950141015310234e-05,
"loss": 3.7714,
"step": 200
},
{
"epoch": 0.030217566478646252,
"grad_norm": 1.555106520652771,
"learning_rate": 4.924959709911362e-05,
"loss": 3.7284,
"step": 300
},
{
"epoch": 0.040290088638195005,
"grad_norm": 1.5166494846343994,
"learning_rate": 4.89977840451249e-05,
"loss": 3.691,
"step": 400
},
{
"epoch": 0.050362610797743755,
"grad_norm": 1.30672025680542,
"learning_rate": 4.874597099113618e-05,
"loss": 3.6623,
"step": 500
},
{
"epoch": 0.060435132957292505,
"grad_norm": 1.320106863975525,
"learning_rate": 4.8494157937147464e-05,
"loss": 3.6443,
"step": 600
},
{
"epoch": 0.07050765511684126,
"grad_norm": 1.256370186805725,
"learning_rate": 4.8242344883158744e-05,
"loss": 3.6066,
"step": 700
},
{
"epoch": 0.08058017727639001,
"grad_norm": 1.2294926643371582,
"learning_rate": 4.7990531829170024e-05,
"loss": 3.5984,
"step": 800
},
{
"epoch": 0.09065269943593876,
"grad_norm": 1.1439231634140015,
"learning_rate": 4.7738718775181304e-05,
"loss": 3.5885,
"step": 900
},
{
"epoch": 0.10072522159548751,
"grad_norm": 1.132494330406189,
"learning_rate": 4.7486905721192584e-05,
"loss": 3.5649,
"step": 1000
},
{
"epoch": 0.11079774375503626,
"grad_norm": 1.2308531999588013,
"learning_rate": 4.723509266720387e-05,
"loss": 3.5437,
"step": 1100
},
{
"epoch": 0.12087026591458501,
"grad_norm": 1.0526450872421265,
"learning_rate": 4.698327961321515e-05,
"loss": 3.5426,
"step": 1200
},
{
"epoch": 0.13094278807413376,
"grad_norm": 1.063685655593872,
"learning_rate": 4.673146655922643e-05,
"loss": 3.5359,
"step": 1300
},
{
"epoch": 0.14101531023368252,
"grad_norm": 1.0278949737548828,
"learning_rate": 4.647965350523771e-05,
"loss": 3.5286,
"step": 1400
},
{
"epoch": 0.15108783239323126,
"grad_norm": 1.0139474868774414,
"learning_rate": 4.622784045124899e-05,
"loss": 3.5071,
"step": 1500
},
{
"epoch": 0.16116035455278002,
"grad_norm": 1.0838497877120972,
"learning_rate": 4.597602739726028e-05,
"loss": 3.5132,
"step": 1600
},
{
"epoch": 0.17123287671232876,
"grad_norm": 0.9688405394554138,
"learning_rate": 4.572421434327156e-05,
"loss": 3.496,
"step": 1700
},
{
"epoch": 0.18130539887187752,
"grad_norm": 0.9470923542976379,
"learning_rate": 4.547240128928284e-05,
"loss": 3.4971,
"step": 1800
},
{
"epoch": 0.19137792103142626,
"grad_norm": 0.9688748121261597,
"learning_rate": 4.522058823529412e-05,
"loss": 3.4827,
"step": 1900
},
{
"epoch": 0.20145044319097502,
"grad_norm": 0.9359450936317444,
"learning_rate": 4.49687751813054e-05,
"loss": 3.4771,
"step": 2000
},
{
"epoch": 0.21152296535052378,
"grad_norm": 0.9809269309043884,
"learning_rate": 4.4716962127316684e-05,
"loss": 3.4685,
"step": 2100
},
{
"epoch": 0.22159548751007252,
"grad_norm": 0.9238491058349609,
"learning_rate": 4.4465149073327964e-05,
"loss": 3.4603,
"step": 2200
},
{
"epoch": 0.23166800966962128,
"grad_norm": 0.9273353815078735,
"learning_rate": 4.4213336019339244e-05,
"loss": 3.4639,
"step": 2300
},
{
"epoch": 0.24174053182917002,
"grad_norm": 0.9298893809318542,
"learning_rate": 4.3961522965350524e-05,
"loss": 3.4526,
"step": 2400
},
{
"epoch": 0.2518130539887188,
"grad_norm": 0.9529768824577332,
"learning_rate": 4.370970991136181e-05,
"loss": 3.4473,
"step": 2500
},
{
"epoch": 0.2618855761482675,
"grad_norm": 0.9361433982849121,
"learning_rate": 4.345789685737309e-05,
"loss": 3.4397,
"step": 2600
},
{
"epoch": 0.27195809830781625,
"grad_norm": 0.913864254951477,
"learning_rate": 4.320608380338437e-05,
"loss": 3.4421,
"step": 2700
},
{
"epoch": 0.28203062046736505,
"grad_norm": 0.9004995226860046,
"learning_rate": 4.295427074939565e-05,
"loss": 3.4216,
"step": 2800
},
{
"epoch": 0.2921031426269138,
"grad_norm": 0.9446254968643188,
"learning_rate": 4.270245769540693e-05,
"loss": 3.4334,
"step": 2900
},
{
"epoch": 0.3021756647864625,
"grad_norm": 0.902070939540863,
"learning_rate": 4.245064464141822e-05,
"loss": 3.4257,
"step": 3000
},
{
"epoch": 0.3122481869460113,
"grad_norm": 0.8563255071640015,
"learning_rate": 4.21988315874295e-05,
"loss": 3.4336,
"step": 3100
},
{
"epoch": 0.32232070910556004,
"grad_norm": 0.8490210175514221,
"learning_rate": 4.194701853344078e-05,
"loss": 3.4212,
"step": 3200
},
{
"epoch": 0.3323932312651088,
"grad_norm": 0.8508225679397583,
"learning_rate": 4.169520547945206e-05,
"loss": 3.4186,
"step": 3300
},
{
"epoch": 0.3424657534246575,
"grad_norm": 0.8672498464584351,
"learning_rate": 4.144339242546334e-05,
"loss": 3.4149,
"step": 3400
},
{
"epoch": 0.3525382755842063,
"grad_norm": 0.9109395146369934,
"learning_rate": 4.119157937147462e-05,
"loss": 3.4027,
"step": 3500
},
{
"epoch": 0.36261079774375504,
"grad_norm": 0.9174132347106934,
"learning_rate": 4.09397663174859e-05,
"loss": 3.4152,
"step": 3600
},
{
"epoch": 0.3726833199033038,
"grad_norm": 0.8484577536582947,
"learning_rate": 4.068795326349718e-05,
"loss": 3.3936,
"step": 3700
},
{
"epoch": 0.3827558420628525,
"grad_norm": 0.8233897089958191,
"learning_rate": 4.043614020950846e-05,
"loss": 3.3997,
"step": 3800
},
{
"epoch": 0.3928283642224013,
"grad_norm": 0.8456436991691589,
"learning_rate": 4.018432715551974e-05,
"loss": 3.3947,
"step": 3900
},
{
"epoch": 0.40290088638195004,
"grad_norm": 0.8862765431404114,
"learning_rate": 3.993251410153103e-05,
"loss": 3.3872,
"step": 4000
},
{
"epoch": 0.4129734085414988,
"grad_norm": 0.8666160106658936,
"learning_rate": 3.968070104754231e-05,
"loss": 3.3917,
"step": 4100
},
{
"epoch": 0.42304593070104757,
"grad_norm": 0.8640286922454834,
"learning_rate": 3.942888799355359e-05,
"loss": 3.3808,
"step": 4200
},
{
"epoch": 0.4331184528605963,
"grad_norm": 0.874873161315918,
"learning_rate": 3.917707493956487e-05,
"loss": 3.376,
"step": 4300
},
{
"epoch": 0.44319097502014504,
"grad_norm": 0.8452139496803284,
"learning_rate": 3.892526188557615e-05,
"loss": 3.3868,
"step": 4400
},
{
"epoch": 0.4532634971796938,
"grad_norm": 0.9129024147987366,
"learning_rate": 3.8673448831587436e-05,
"loss": 3.377,
"step": 4500
},
{
"epoch": 0.46333601933924257,
"grad_norm": 0.8627080321311951,
"learning_rate": 3.8421635777598716e-05,
"loss": 3.3755,
"step": 4600
},
{
"epoch": 0.4734085414987913,
"grad_norm": 0.8462901711463928,
"learning_rate": 3.8169822723609996e-05,
"loss": 3.3718,
"step": 4700
},
{
"epoch": 0.48348106365834004,
"grad_norm": 0.8301746249198914,
"learning_rate": 3.7918009669621276e-05,
"loss": 3.3731,
"step": 4800
},
{
"epoch": 0.4935535858178888,
"grad_norm": 0.8276001214981079,
"learning_rate": 3.7666196615632556e-05,
"loss": 3.3711,
"step": 4900
},
{
"epoch": 0.5036261079774376,
"grad_norm": 0.8247309327125549,
"learning_rate": 3.741438356164384e-05,
"loss": 3.3684,
"step": 5000
},
{
"epoch": 0.5136986301369864,
"grad_norm": 0.8287461400032043,
"learning_rate": 3.716257050765512e-05,
"loss": 3.3692,
"step": 5100
},
{
"epoch": 0.523771152296535,
"grad_norm": 0.8390397429466248,
"learning_rate": 3.69107574536664e-05,
"loss": 3.3606,
"step": 5200
},
{
"epoch": 0.5338436744560838,
"grad_norm": 0.814963161945343,
"learning_rate": 3.665894439967768e-05,
"loss": 3.3622,
"step": 5300
},
{
"epoch": 0.5439161966156325,
"grad_norm": 0.8671918511390686,
"learning_rate": 3.640713134568896e-05,
"loss": 3.3495,
"step": 5400
},
{
"epoch": 0.5539887187751813,
"grad_norm": 0.8633044958114624,
"learning_rate": 3.615531829170024e-05,
"loss": 3.3521,
"step": 5500
},
{
"epoch": 0.5640612409347301,
"grad_norm": 0.8254402279853821,
"learning_rate": 3.590350523771153e-05,
"loss": 3.3419,
"step": 5600
},
{
"epoch": 0.5741337630942788,
"grad_norm": 0.8266107439994812,
"learning_rate": 3.565169218372281e-05,
"loss": 3.3529,
"step": 5700
},
{
"epoch": 0.5842062852538276,
"grad_norm": 0.8199731111526489,
"learning_rate": 3.539987912973409e-05,
"loss": 3.3425,
"step": 5800
},
{
"epoch": 0.5942788074133764,
"grad_norm": 0.8126055002212524,
"learning_rate": 3.514806607574537e-05,
"loss": 3.345,
"step": 5900
},
{
"epoch": 0.604351329572925,
"grad_norm": 0.8237761855125427,
"learning_rate": 3.489625302175665e-05,
"loss": 3.3411,
"step": 6000
},
{
"epoch": 0.6144238517324738,
"grad_norm": 0.8377647995948792,
"learning_rate": 3.4644439967767936e-05,
"loss": 3.3342,
"step": 6100
},
{
"epoch": 0.6244963738920226,
"grad_norm": 0.8051643371582031,
"learning_rate": 3.43951450443191e-05,
"loss": 3.3316,
"step": 6200
},
{
"epoch": 0.6345688960515713,
"grad_norm": 0.8488379716873169,
"learning_rate": 3.414333199033038e-05,
"loss": 3.3368,
"step": 6300
},
{
"epoch": 0.6446414182111201,
"grad_norm": 0.8671479225158691,
"learning_rate": 3.3891518936341657e-05,
"loss": 3.3372,
"step": 6400
},
{
"epoch": 0.6547139403706688,
"grad_norm": 0.8239823579788208,
"learning_rate": 3.363970588235294e-05,
"loss": 3.3266,
"step": 6500
},
{
"epoch": 0.6647864625302176,
"grad_norm": 0.8625733852386475,
"learning_rate": 3.338789282836422e-05,
"loss": 3.3313,
"step": 6600
},
{
"epoch": 0.6748589846897664,
"grad_norm": 0.8307435512542725,
"learning_rate": 3.31360797743755e-05,
"loss": 3.3334,
"step": 6700
},
{
"epoch": 0.684931506849315,
"grad_norm": 0.8079032301902771,
"learning_rate": 3.288426672038678e-05,
"loss": 3.3326,
"step": 6800
},
{
"epoch": 0.6950040290088638,
"grad_norm": 0.8174043893814087,
"learning_rate": 3.263245366639806e-05,
"loss": 3.3257,
"step": 6900
},
{
"epoch": 0.7050765511684126,
"grad_norm": 0.8356669545173645,
"learning_rate": 3.238064061240934e-05,
"loss": 3.3254,
"step": 7000
},
{
"epoch": 0.7151490733279613,
"grad_norm": 0.8560519814491272,
"learning_rate": 3.212882755842063e-05,
"loss": 3.3214,
"step": 7100
},
{
"epoch": 0.7252215954875101,
"grad_norm": 0.7916297912597656,
"learning_rate": 3.187701450443191e-05,
"loss": 3.322,
"step": 7200
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.8525456786155701,
"learning_rate": 3.162520145044319e-05,
"loss": 3.3126,
"step": 7300
},
{
"epoch": 0.7453666398066076,
"grad_norm": 0.7955446243286133,
"learning_rate": 3.137338839645447e-05,
"loss": 3.3141,
"step": 7400
},
{
"epoch": 0.7554391619661563,
"grad_norm": 0.85768061876297,
"learning_rate": 3.1121575342465756e-05,
"loss": 3.3084,
"step": 7500
},
{
"epoch": 0.765511684125705,
"grad_norm": 0.8224223256111145,
"learning_rate": 3.0869762288477036e-05,
"loss": 3.3137,
"step": 7600
},
{
"epoch": 0.7755842062852538,
"grad_norm": 0.8332231640815735,
"learning_rate": 3.0617949234488316e-05,
"loss": 3.3034,
"step": 7700
},
{
"epoch": 0.7856567284448026,
"grad_norm": 0.8170804381370544,
"learning_rate": 3.03661361804996e-05,
"loss": 3.3089,
"step": 7800
},
{
"epoch": 0.7957292506043513,
"grad_norm": 0.8121609091758728,
"learning_rate": 3.011432312651088e-05,
"loss": 3.3153,
"step": 7900
},
{
"epoch": 0.8058017727639001,
"grad_norm": 0.8763326406478882,
"learning_rate": 2.986251007252216e-05,
"loss": 3.2977,
"step": 8000
},
{
"epoch": 0.8158742949234489,
"grad_norm": 0.8313045501708984,
"learning_rate": 2.961069701853344e-05,
"loss": 3.3112,
"step": 8100
},
{
"epoch": 0.8259468170829976,
"grad_norm": 0.8360570669174194,
"learning_rate": 2.9358883964544726e-05,
"loss": 3.2993,
"step": 8200
},
{
"epoch": 0.8360193392425463,
"grad_norm": 0.8124191761016846,
"learning_rate": 2.9107070910556006e-05,
"loss": 3.3043,
"step": 8300
},
{
"epoch": 0.8460918614020951,
"grad_norm": 0.8028224110603333,
"learning_rate": 2.8855257856567286e-05,
"loss": 3.2932,
"step": 8400
},
{
"epoch": 0.8561643835616438,
"grad_norm": 0.8234061002731323,
"learning_rate": 2.8605962933118454e-05,
"loss": 3.2919,
"step": 8500
},
{
"epoch": 0.8662369057211926,
"grad_norm": 0.8246094584465027,
"learning_rate": 2.8354149879129737e-05,
"loss": 3.3036,
"step": 8600
},
{
"epoch": 0.8763094278807413,
"grad_norm": 0.7993488311767578,
"learning_rate": 2.8102336825141017e-05,
"loss": 3.301,
"step": 8700
},
{
"epoch": 0.8863819500402901,
"grad_norm": 0.8250619173049927,
"learning_rate": 2.7850523771152297e-05,
"loss": 3.3014,
"step": 8800
},
{
"epoch": 0.8964544721998389,
"grad_norm": 0.7909018993377686,
"learning_rate": 2.7598710717163577e-05,
"loss": 3.2997,
"step": 8900
},
{
"epoch": 0.9065269943593876,
"grad_norm": 0.8329810500144958,
"learning_rate": 2.734689766317486e-05,
"loss": 3.2909,
"step": 9000
},
{
"epoch": 0.9165995165189363,
"grad_norm": 0.7919082045555115,
"learning_rate": 2.709508460918614e-05,
"loss": 3.2974,
"step": 9100
},
{
"epoch": 0.9266720386784851,
"grad_norm": 0.8208735585212708,
"learning_rate": 2.6843271555197423e-05,
"loss": 3.2885,
"step": 9200
},
{
"epoch": 0.9367445608380338,
"grad_norm": 0.8451607823371887,
"learning_rate": 2.6591458501208703e-05,
"loss": 3.2895,
"step": 9300
},
{
"epoch": 0.9468170829975826,
"grad_norm": 0.7859387993812561,
"learning_rate": 2.6339645447219983e-05,
"loss": 3.2808,
"step": 9400
},
{
"epoch": 0.9568896051571314,
"grad_norm": 0.8202655911445618,
"learning_rate": 2.6087832393231267e-05,
"loss": 3.2822,
"step": 9500
},
{
"epoch": 0.9669621273166801,
"grad_norm": 0.8055428266525269,
"learning_rate": 2.5836019339242547e-05,
"loss": 3.2767,
"step": 9600
},
{
"epoch": 0.9770346494762289,
"grad_norm": 0.8379536867141724,
"learning_rate": 2.558420628525383e-05,
"loss": 3.2846,
"step": 9700
},
{
"epoch": 0.9871071716357775,
"grad_norm": 0.8224022388458252,
"learning_rate": 2.533239323126511e-05,
"loss": 3.2822,
"step": 9800
},
{
"epoch": 0.9971796937953263,
"grad_norm": 0.7945728898048401,
"learning_rate": 2.508058017727639e-05,
"loss": 3.2909,
"step": 9900
},
{
"epoch": 1.0072522159548751,
"grad_norm": 0.8296171426773071,
"learning_rate": 2.4828767123287673e-05,
"loss": 3.2785,
"step": 10000
},
{
"epoch": 1.017324738114424,
"grad_norm": 0.8091270327568054,
"learning_rate": 2.4576954069298953e-05,
"loss": 3.2586,
"step": 10100
},
{
"epoch": 1.0273972602739727,
"grad_norm": 0.8024172782897949,
"learning_rate": 2.4325141015310236e-05,
"loss": 3.2558,
"step": 10200
},
{
"epoch": 1.0374697824335213,
"grad_norm": 0.80719393491745,
"learning_rate": 2.4073327961321516e-05,
"loss": 3.2542,
"step": 10300
},
{
"epoch": 1.04754230459307,
"grad_norm": 0.7982654571533203,
"learning_rate": 2.38215149073328e-05,
"loss": 3.2499,
"step": 10400
},
{
"epoch": 1.0576148267526189,
"grad_norm": 0.8148714303970337,
"learning_rate": 2.356970185334408e-05,
"loss": 3.2567,
"step": 10500
},
{
"epoch": 1.0676873489121677,
"grad_norm": 0.8775522708892822,
"learning_rate": 2.331788879935536e-05,
"loss": 3.2449,
"step": 10600
},
{
"epoch": 1.0777598710717164,
"grad_norm": 0.8066820502281189,
"learning_rate": 2.3066075745366643e-05,
"loss": 3.255,
"step": 10700
},
{
"epoch": 1.087832393231265,
"grad_norm": 0.8724685311317444,
"learning_rate": 2.2814262691377923e-05,
"loss": 3.2485,
"step": 10800
},
{
"epoch": 1.0979049153908138,
"grad_norm": 0.8520035147666931,
"learning_rate": 2.2562449637389206e-05,
"loss": 3.2498,
"step": 10900
},
{
"epoch": 1.1079774375503626,
"grad_norm": 0.8030887246131897,
"learning_rate": 2.2310636583400486e-05,
"loss": 3.2495,
"step": 11000
},
{
"epoch": 1.1180499597099114,
"grad_norm": 0.7946292757987976,
"learning_rate": 2.2058823529411766e-05,
"loss": 3.2576,
"step": 11100
},
{
"epoch": 1.1281224818694602,
"grad_norm": 0.8343091607093811,
"learning_rate": 2.180701047542305e-05,
"loss": 3.2547,
"step": 11200
},
{
"epoch": 1.1381950040290088,
"grad_norm": 0.8491663932800293,
"learning_rate": 2.1557715551974214e-05,
"loss": 3.2472,
"step": 11300
},
{
"epoch": 1.1482675261885575,
"grad_norm": 0.8408398628234863,
"learning_rate": 2.1305902497985497e-05,
"loss": 3.256,
"step": 11400
},
{
"epoch": 1.1583400483481063,
"grad_norm": 0.8578426837921143,
"learning_rate": 2.1054089443996777e-05,
"loss": 3.2484,
"step": 11500
},
{
"epoch": 1.1684125705076551,
"grad_norm": 0.8466009497642517,
"learning_rate": 2.080227639000806e-05,
"loss": 3.2527,
"step": 11600
},
{
"epoch": 1.178485092667204,
"grad_norm": 0.8367530107498169,
"learning_rate": 2.055046333601934e-05,
"loss": 3.2487,
"step": 11700
},
{
"epoch": 1.1885576148267527,
"grad_norm": 0.8025128245353699,
"learning_rate": 2.029865028203062e-05,
"loss": 3.2544,
"step": 11800
},
{
"epoch": 1.1986301369863013,
"grad_norm": 0.8721017241477966,
"learning_rate": 2.0046837228041904e-05,
"loss": 3.248,
"step": 11900
},
{
"epoch": 1.20870265914585,
"grad_norm": 0.813346803188324,
"learning_rate": 1.9795024174053184e-05,
"loss": 3.2481,
"step": 12000
}
],
"logging_steps": 100,
"max_steps": 19856,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.0167808851968e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}