Files
polyalign-qwen2.5-1.5b-en-sft/trainer_state.json

681 lines
16 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 10000,
"global_step": 9132,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010950803515207929,
"grad_norm": 5.517458438873291,
"learning_rate": 1.0831509846827136e-06,
"loss": 2.5966,
"step": 100
},
{
"epoch": 0.021901607030415857,
"grad_norm": 5.273920059204102,
"learning_rate": 2.177242888402626e-06,
"loss": 2.3125,
"step": 200
},
{
"epoch": 0.03285241054562379,
"grad_norm": 5.7524261474609375,
"learning_rate": 3.2713347921225385e-06,
"loss": 2.1574,
"step": 300
},
{
"epoch": 0.043803214060831715,
"grad_norm": 4.415042400360107,
"learning_rate": 4.365426695842451e-06,
"loss": 1.9882,
"step": 400
},
{
"epoch": 0.05475401757603964,
"grad_norm": 3.4843761920928955,
"learning_rate": 5.459518599562363e-06,
"loss": 1.9221,
"step": 500
},
{
"epoch": 0.06570482109124758,
"grad_norm": 3.8609654903411865,
"learning_rate": 6.553610503282276e-06,
"loss": 1.9077,
"step": 600
},
{
"epoch": 0.0766556246064555,
"grad_norm": 3.310288429260254,
"learning_rate": 7.64770240700219e-06,
"loss": 1.8748,
"step": 700
},
{
"epoch": 0.08760642812166343,
"grad_norm": 3.843994140625,
"learning_rate": 8.741794310722102e-06,
"loss": 1.88,
"step": 800
},
{
"epoch": 0.09855723163687136,
"grad_norm": 3.1463918685913086,
"learning_rate": 9.835886214442013e-06,
"loss": 1.8617,
"step": 900
},
{
"epoch": 0.10950803515207928,
"grad_norm": 3.763201951980591,
"learning_rate": 9.997360588415263e-06,
"loss": 1.8323,
"step": 1000
},
{
"epoch": 0.12045883866728721,
"grad_norm": 2.8540737628936768,
"learning_rate": 9.987501154068591e-06,
"loss": 1.8238,
"step": 1100
},
{
"epoch": 0.13140964218249515,
"grad_norm": 3.5842385292053223,
"learning_rate": 9.970353900512644e-06,
"loss": 1.8213,
"step": 1200
},
{
"epoch": 0.14236044569770306,
"grad_norm": 4.1146135330200195,
"learning_rate": 9.945943883598031e-06,
"loss": 1.8147,
"step": 1300
},
{
"epoch": 0.153311249212911,
"grad_norm": 3.3109846115112305,
"learning_rate": 9.914306771645357e-06,
"loss": 1.7824,
"step": 1400
},
{
"epoch": 0.16426205272811892,
"grad_norm": 3.7267284393310547,
"learning_rate": 9.875488793326074e-06,
"loss": 1.7896,
"step": 1500
},
{
"epoch": 0.17521285624332686,
"grad_norm": 4.14414644241333,
"learning_rate": 9.82954667011238e-06,
"loss": 1.7788,
"step": 1600
},
{
"epoch": 0.18616365975853477,
"grad_norm": 3.413673162460327,
"learning_rate": 9.776547533394874e-06,
"loss": 1.7894,
"step": 1700
},
{
"epoch": 0.1971144632737427,
"grad_norm": 3.6218016147613525,
"learning_rate": 9.716568826389045e-06,
"loss": 1.752,
"step": 1800
},
{
"epoch": 0.20806526678895063,
"grad_norm": 3.2655038833618164,
"learning_rate": 9.649698190973977e-06,
"loss": 1.7587,
"step": 1900
},
{
"epoch": 0.21901607030415857,
"grad_norm": 3.188708543777466,
"learning_rate": 9.576033339628578e-06,
"loss": 1.7648,
"step": 2000
},
{
"epoch": 0.2299668738193665,
"grad_norm": 3.2714788913726807,
"learning_rate": 9.495681912652486e-06,
"loss": 1.7507,
"step": 2100
},
{
"epoch": 0.24091767733457442,
"grad_norm": 3.6338818073272705,
"learning_rate": 9.408761320880292e-06,
"loss": 1.7628,
"step": 2200
},
{
"epoch": 0.25186848084978236,
"grad_norm": 3.466273546218872,
"learning_rate": 9.315398574118876e-06,
"loss": 1.7299,
"step": 2300
},
{
"epoch": 0.2628192843649903,
"grad_norm": 3.3957815170288086,
"learning_rate": 9.215730095558582e-06,
"loss": 1.7188,
"step": 2400
},
{
"epoch": 0.2737700878801982,
"grad_norm": 3.8373427391052246,
"learning_rate": 9.10990152242939e-06,
"loss": 1.7139,
"step": 2500
},
{
"epoch": 0.28472089139540613,
"grad_norm": 3.5663390159606934,
"learning_rate": 8.998067493193395e-06,
"loss": 1.711,
"step": 2600
},
{
"epoch": 0.29567169491061407,
"grad_norm": 3.543086290359497,
"learning_rate": 8.880391421584511e-06,
"loss": 1.7143,
"step": 2700
},
{
"epoch": 0.306622498425822,
"grad_norm": 3.4232897758483887,
"learning_rate": 8.757045257825642e-06,
"loss": 1.693,
"step": 2800
},
{
"epoch": 0.31757330194102995,
"grad_norm": 3.153503179550171,
"learning_rate": 8.628209237372148e-06,
"loss": 1.6915,
"step": 2900
},
{
"epoch": 0.32852410545623784,
"grad_norm": 3.6370930671691895,
"learning_rate": 8.494071617548831e-06,
"loss": 1.6932,
"step": 3000
},
{
"epoch": 0.3394749089714458,
"grad_norm": 3.5099680423736572,
"learning_rate": 8.354828402465215e-06,
"loss": 1.6522,
"step": 3100
},
{
"epoch": 0.3504257124866537,
"grad_norm": 4.088729381561279,
"learning_rate": 8.210683056611086e-06,
"loss": 1.6759,
"step": 3200
},
{
"epoch": 0.36137651600186166,
"grad_norm": 4.009633541107178,
"learning_rate": 8.06184620755083e-06,
"loss": 1.6584,
"step": 3300
},
{
"epoch": 0.37232731951706954,
"grad_norm": 3.3517208099365234,
"learning_rate": 7.90853533815094e-06,
"loss": 1.6598,
"step": 3400
},
{
"epoch": 0.3832781230322775,
"grad_norm": 3.5369486808776855,
"learning_rate": 7.750974468790462e-06,
"loss": 1.6602,
"step": 3500
},
{
"epoch": 0.3942289265474854,
"grad_norm": 3.1376800537109375,
"learning_rate": 7.589393830018696e-06,
"loss": 1.6466,
"step": 3600
},
{
"epoch": 0.40517973006269337,
"grad_norm": 3.253649950027466,
"learning_rate": 7.4240295261385205e-06,
"loss": 1.6247,
"step": 3700
},
{
"epoch": 0.41613053357790125,
"grad_norm": 3.432814359664917,
"learning_rate": 7.2551231902068775e-06,
"loss": 1.6204,
"step": 3800
},
{
"epoch": 0.4270813370931092,
"grad_norm": 3.8177409172058105,
"learning_rate": 7.082921630956545e-06,
"loss": 1.618,
"step": 3900
},
{
"epoch": 0.43803214060831713,
"grad_norm": 3.866558313369751,
"learning_rate": 6.9076764721551385e-06,
"loss": 1.6237,
"step": 4000
},
{
"epoch": 0.4489829441235251,
"grad_norm": 3.8972673416137695,
"learning_rate": 6.729643784928295e-06,
"loss": 1.6185,
"step": 4100
},
{
"epoch": 0.459933747638733,
"grad_norm": 3.088886260986328,
"learning_rate": 6.549083713584314e-06,
"loss": 1.6114,
"step": 4200
},
{
"epoch": 0.4708845511539409,
"grad_norm": 3.4185397624969482,
"learning_rate": 6.366260095486977e-06,
"loss": 1.6123,
"step": 4300
},
{
"epoch": 0.48183535466914884,
"grad_norm": 4.178684234619141,
"learning_rate": 6.181440075532042e-06,
"loss": 1.5628,
"step": 4400
},
{
"epoch": 0.4927861581843568,
"grad_norm": 3.807727813720703,
"learning_rate": 5.99489371579069e-06,
"loss": 1.5712,
"step": 4500
},
{
"epoch": 0.5037369616995647,
"grad_norm": 3.5558791160583496,
"learning_rate": 5.806893600890361e-06,
"loss": 1.5599,
"step": 4600
},
{
"epoch": 0.5146877652147727,
"grad_norm": 3.706101894378662,
"learning_rate": 5.617714439709588e-06,
"loss": 1.5595,
"step": 4700
},
{
"epoch": 0.5256385687299806,
"grad_norm": 3.324862003326416,
"learning_rate": 5.42763266396884e-06,
"loss": 1.5747,
"step": 4800
},
{
"epoch": 0.5365893722451885,
"grad_norm": 3.485616445541382,
"learning_rate": 5.236926024303909e-06,
"loss": 1.5547,
"step": 4900
},
{
"epoch": 0.5475401757603964,
"grad_norm": 3.410731077194214,
"learning_rate": 5.045873184412099e-06,
"loss": 1.5846,
"step": 5000
},
{
"epoch": 0.5584909792756043,
"grad_norm": 3.5248425006866455,
"learning_rate": 4.854753313864212e-06,
"loss": 1.5473,
"step": 5100
},
{
"epoch": 0.5694417827908123,
"grad_norm": 3.993953227996826,
"learning_rate": 4.663845680177349e-06,
"loss": 1.5513,
"step": 5200
},
{
"epoch": 0.5803925863060202,
"grad_norm": 3.5981061458587646,
"learning_rate": 4.473429240744606e-06,
"loss": 1.5596,
"step": 5300
},
{
"epoch": 0.5913433898212281,
"grad_norm": 4.107110977172852,
"learning_rate": 4.283782235217901e-06,
"loss": 1.5334,
"step": 5400
},
{
"epoch": 0.6022941933364361,
"grad_norm": 3.5353140830993652,
"learning_rate": 4.095181778939598e-06,
"loss": 1.5183,
"step": 5500
},
{
"epoch": 0.613244996851644,
"grad_norm": 3.8818717002868652,
"learning_rate": 3.90790345801699e-06,
"loss": 1.5402,
"step": 5600
},
{
"epoch": 0.624195800366852,
"grad_norm": 3.5173981189727783,
"learning_rate": 3.7222209266313026e-06,
"loss": 1.5132,
"step": 5700
},
{
"epoch": 0.6351466038820599,
"grad_norm": 4.288154602050781,
"learning_rate": 3.538405507169692e-06,
"loss": 1.5098,
"step": 5800
},
{
"epoch": 0.6460974073972677,
"grad_norm": 3.918605089187622,
"learning_rate": 3.356725793764477e-06,
"loss": 1.5147,
"step": 5900
},
{
"epoch": 0.6570482109124757,
"grad_norm": 4.286097049713135,
"learning_rate": 3.1774472598189503e-06,
"loss": 1.5384,
"step": 6000
},
{
"epoch": 0.6679990144276836,
"grad_norm": 3.301164150238037,
"learning_rate": 3.0008318700932426e-06,
"loss": 1.5411,
"step": 6100
},
{
"epoch": 0.6789498179428916,
"grad_norm": 3.758004903793335,
"learning_rate": 2.827137697917096e-06,
"loss": 1.5003,
"step": 6200
},
{
"epoch": 0.6899006214580995,
"grad_norm": 3.1831018924713135,
"learning_rate": 2.6566185480888276e-06,
"loss": 1.5175,
"step": 6300
},
{
"epoch": 0.7008514249733074,
"grad_norm": 3.6004598140716553,
"learning_rate": 2.4895235860115652e-06,
"loss": 1.4974,
"step": 6400
},
{
"epoch": 0.7118022284885154,
"grad_norm": 3.951835870742798,
"learning_rate": 2.326096973608648e-06,
"loss": 1.497,
"step": 6500
},
{
"epoch": 0.7227530320037233,
"grad_norm": 3.2882447242736816,
"learning_rate": 2.166577512550162e-06,
"loss": 1.502,
"step": 6600
},
{
"epoch": 0.7337038355189311,
"grad_norm": 4.077866077423096,
"learning_rate": 2.0111982953120073e-06,
"loss": 1.5173,
"step": 6700
},
{
"epoch": 0.7446546390341391,
"grad_norm": 4.122990608215332,
"learning_rate": 1.8601863645773128e-06,
"loss": 1.4877,
"step": 6800
},
{
"epoch": 0.755605442549347,
"grad_norm": 3.5521764755249023,
"learning_rate": 1.7137623814779036e-06,
"loss": 1.4705,
"step": 6900
},
{
"epoch": 0.766556246064555,
"grad_norm": 3.5683555603027344,
"learning_rate": 1.5721403031606048e-06,
"loss": 1.4747,
"step": 7000
},
{
"epoch": 0.7775070495797629,
"grad_norm": 3.852078914642334,
"learning_rate": 1.43552707014953e-06,
"loss": 1.5005,
"step": 7100
},
{
"epoch": 0.7884578530949709,
"grad_norm": 3.6315114498138428,
"learning_rate": 1.3041223039611489e-06,
"loss": 1.5038,
"step": 7200
},
{
"epoch": 0.7994086566101788,
"grad_norm": 3.6797256469726562,
"learning_rate": 1.1781180154140331e-06,
"loss": 1.5086,
"step": 7300
},
{
"epoch": 0.8103594601253867,
"grad_norm": 3.5173234939575195,
"learning_rate": 1.057698324059469e-06,
"loss": 1.4665,
"step": 7400
},
{
"epoch": 0.8213102636405947,
"grad_norm": 3.8179659843444824,
"learning_rate": 9.43039189142922e-07,
"loss": 1.4686,
"step": 7500
},
{
"epoch": 0.8322610671558025,
"grad_norm": 4.368917942047119,
"learning_rate": 8.343081524894763e-07,
"loss": 1.4933,
"step": 7600
},
{
"epoch": 0.8432118706710104,
"grad_norm": 4.125387191772461,
"learning_rate": 7.316640936889491e-07,
"loss": 1.4976,
"step": 7700
},
{
"epoch": 0.8541626741862184,
"grad_norm": 4.123210906982422,
"learning_rate": 6.352569979384027e-07,
"loss": 1.4663,
"step": 7800
},
{
"epoch": 0.8651134777014263,
"grad_norm": 3.9425387382507324,
"learning_rate": 5.452277368812936e-07,
"loss": 1.4635,
"step": 7900
},
{
"epoch": 0.8760642812166343,
"grad_norm": 3.996006727218628,
"learning_rate": 4.617078627635019e-07,
"loss": 1.4739,
"step": 8000
},
{
"epoch": 0.8870150847318422,
"grad_norm": 3.8166754245758057,
"learning_rate": 3.8481941620700127e-07,
"loss": 1.4525,
"step": 8100
},
{
"epoch": 0.8979658882470501,
"grad_norm": 4.163847923278809,
"learning_rate": 3.146747478820938e-07,
"loss": 1.4467,
"step": 8200
},
{
"epoch": 0.9089166917622581,
"grad_norm": 3.3138253688812256,
"learning_rate": 2.513763543387465e-07,
"loss": 1.4893,
"step": 8300
},
{
"epoch": 0.919867495277466,
"grad_norm": 3.702721118927002,
"learning_rate": 1.9501672823693584e-07,
"loss": 1.4246,
"step": 8400
},
{
"epoch": 0.9308182987926739,
"grad_norm": 3.539092779159546,
"learning_rate": 1.4567822319484614e-07,
"loss": 1.456,
"step": 8500
},
{
"epoch": 0.9417691023078818,
"grad_norm": 4.07131814956665,
"learning_rate": 1.0343293345239702e-07,
"loss": 1.4473,
"step": 8600
},
{
"epoch": 0.9527199058230897,
"grad_norm": 3.898056745529175,
"learning_rate": 6.834258852594866e-08,
"loss": 1.4813,
"step": 8700
},
{
"epoch": 0.9636707093382977,
"grad_norm": 3.525865316390991,
"learning_rate": 4.045846300811229e-08,
"loss": 1.4259,
"step": 8800
},
{
"epoch": 0.9746215128535056,
"grad_norm": 3.3846275806427,
"learning_rate": 1.9821301644462056e-08,
"loss": 1.4595,
"step": 8900
},
{
"epoch": 0.9855723163687136,
"grad_norm": 3.4979400634765625,
"learning_rate": 6.461259796644026e-09,
"loss": 1.4601,
"step": 9000
},
{
"epoch": 0.9965231198839215,
"grad_norm": 3.377657413482666,
"learning_rate": 3.978593788622753e-10,
"loss": 1.4738,
"step": 9100
},
{
"epoch": 1.0,
"step": 9132,
"total_flos": 1.591591112898773e+18,
"train_loss": 1.633884816748293,
"train_runtime": 12935.5693,
"train_samples_per_second": 45.179,
"train_steps_per_second": 0.706
}
],
"logging_steps": 100,
"max_steps": 9132,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.591591112898773e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}