{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 914, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03281378178835111, "grad_norm": 7.324607115951123, "learning_rate": 9.782608695652175e-07, "loss": 1.6507, "step": 10 }, { "epoch": 0.06562756357670221, "grad_norm": 2.6718330452258137, "learning_rate": 2.065217391304348e-06, "loss": 1.5293, "step": 20 }, { "epoch": 0.09844134536505332, "grad_norm": 2.0449760430297435, "learning_rate": 3.152173913043479e-06, "loss": 1.3882, "step": 30 }, { "epoch": 0.13125512715340443, "grad_norm": 1.7054134520698259, "learning_rate": 4.239130434782609e-06, "loss": 1.3212, "step": 40 }, { "epoch": 0.16406890894175555, "grad_norm": 1.5625383091861629, "learning_rate": 5.3260869565217395e-06, "loss": 1.2339, "step": 50 }, { "epoch": 0.19688269073010664, "grad_norm": 1.5467201317331245, "learning_rate": 6.41304347826087e-06, "loss": 1.2209, "step": 60 }, { "epoch": 0.22969647251845776, "grad_norm": 1.5441409779630197, "learning_rate": 7.500000000000001e-06, "loss": 1.1981, "step": 70 }, { "epoch": 0.26251025430680885, "grad_norm": 1.5810387088382716, "learning_rate": 8.586956521739131e-06, "loss": 1.1633, "step": 80 }, { "epoch": 0.29532403609515995, "grad_norm": 1.4632034044381061, "learning_rate": 9.673913043478262e-06, "loss": 1.1815, "step": 90 }, { "epoch": 0.3281378178835111, "grad_norm": 1.6769268374195203, "learning_rate": 9.998215114657564e-06, "loss": 1.1741, "step": 100 }, { "epoch": 0.3609515996718622, "grad_norm": 1.564606037108119, "learning_rate": 9.98947588668843e-06, "loss": 1.13, "step": 110 }, { "epoch": 0.3937653814602133, "grad_norm": 1.4659283460430306, "learning_rate": 9.973467196782484e-06, "loss": 1.1339, "step": 120 }, { "epoch": 0.4265791632485644, "grad_norm": 1.3900414131292986, "learning_rate": 9.950212368945013e-06, "loss": 1.1501, "step": 130 }, { "epoch": 0.4593929450369155, "grad_norm": 1.533573901481892, "learning_rate": 9.91974528450737e-06, "loss": 1.1374, "step": 140 }, { "epoch": 0.4922067268252666, "grad_norm": 1.4930324222767686, "learning_rate": 9.882110332763275e-06, "loss": 1.1316, "step": 150 }, { "epoch": 0.5250205086136177, "grad_norm": 1.4649309232116048, "learning_rate": 9.83736234629543e-06, "loss": 1.1199, "step": 160 }, { "epoch": 0.5578342904019689, "grad_norm": 1.4519140933698538, "learning_rate": 9.785566521086695e-06, "loss": 1.1163, "step": 170 }, { "epoch": 0.5906480721903199, "grad_norm": 1.4591675768161227, "learning_rate": 9.726798321532205e-06, "loss": 1.125, "step": 180 }, { "epoch": 0.623461853978671, "grad_norm": 1.3084115670512635, "learning_rate": 9.661143370490846e-06, "loss": 1.1385, "step": 190 }, { "epoch": 0.6562756357670222, "grad_norm": 1.428698415469623, "learning_rate": 9.588697324536254e-06, "loss": 1.0995, "step": 200 }, { "epoch": 0.6890894175553732, "grad_norm": 1.3195726786177668, "learning_rate": 9.509565734589105e-06, "loss": 1.105, "step": 210 }, { "epoch": 0.7219031993437244, "grad_norm": 1.4644538779508807, "learning_rate": 9.423863892133754e-06, "loss": 1.0949, "step": 220 }, { "epoch": 0.7547169811320755, "grad_norm": 1.4442654685795682, "learning_rate": 9.33171666124326e-06, "loss": 1.1097, "step": 230 }, { "epoch": 0.7875307629204266, "grad_norm": 1.4628666514015816, "learning_rate": 9.233258296657547e-06, "loss": 1.0915, "step": 240 }, { "epoch": 0.8203445447087777, "grad_norm": 1.4584300507718668, "learning_rate": 9.128632248179761e-06, "loss": 1.0952, "step": 250 }, { "epoch": 0.8531583264971287, "grad_norm": 1.3907705989649481, "learning_rate": 9.017990951675764e-06, "loss": 1.1072, "step": 260 }, { "epoch": 0.8859721082854799, "grad_norm": 1.44611189547518, "learning_rate": 8.901495606981339e-06, "loss": 1.0908, "step": 270 }, { "epoch": 0.918785890073831, "grad_norm": 1.4283544277374125, "learning_rate": 8.779315943040629e-06, "loss": 1.0934, "step": 280 }, { "epoch": 0.9515996718621821, "grad_norm": 1.3679340439375278, "learning_rate": 8.65162997061802e-06, "loss": 1.0902, "step": 290 }, { "epoch": 0.9844134536505332, "grad_norm": 1.4053097778093493, "learning_rate": 8.518623722943747e-06, "loss": 1.0826, "step": 300 }, { "epoch": 1.0164068908941755, "grad_norm": 1.358051135209502, "learning_rate": 8.380490984671105e-06, "loss": 1.0004, "step": 310 }, { "epoch": 1.0492206726825266, "grad_norm": 1.4920803109368017, "learning_rate": 8.23743300954015e-06, "loss": 0.9529, "step": 320 }, { "epoch": 1.0820344544708778, "grad_norm": 1.391421362612517, "learning_rate": 8.089658227159239e-06, "loss": 0.9108, "step": 330 }, { "epoch": 1.114848236259229, "grad_norm": 1.6241278760522855, "learning_rate": 7.937381939331628e-06, "loss": 0.9279, "step": 340 }, { "epoch": 1.14766201804758, "grad_norm": 1.4776439598223472, "learning_rate": 7.780826006369586e-06, "loss": 0.9332, "step": 350 }, { "epoch": 1.1804757998359312, "grad_norm": 1.531082298396227, "learning_rate": 7.620218523852987e-06, "loss": 0.9503, "step": 360 }, { "epoch": 1.2132895816242821, "grad_norm": 1.4148212125540105, "learning_rate": 7.4557934903034035e-06, "loss": 0.9409, "step": 370 }, { "epoch": 1.2461033634126333, "grad_norm": 1.569873004540387, "learning_rate": 7.287790466257854e-06, "loss": 0.9228, "step": 380 }, { "epoch": 1.2789171452009844, "grad_norm": 1.50216408338208, "learning_rate": 7.116454225238909e-06, "loss": 0.9354, "step": 390 }, { "epoch": 1.3117309269893356, "grad_norm": 1.4646843748186438, "learning_rate": 6.942034397129702e-06, "loss": 0.9372, "step": 400 }, { "epoch": 1.3445447087776867, "grad_norm": 1.5763941611606018, "learning_rate": 6.764785104473411e-06, "loss": 0.9169, "step": 410 }, { "epoch": 1.3773584905660377, "grad_norm": 1.4076340592270304, "learning_rate": 6.584964592227135e-06, "loss": 0.9235, "step": 420 }, { "epoch": 1.4101722723543888, "grad_norm": 1.5649588493962718, "learning_rate": 6.402834851509564e-06, "loss": 0.926, "step": 430 }, { "epoch": 1.44298605414274, "grad_norm": 1.370995330259522, "learning_rate": 6.2186612378906545e-06, "loss": 0.9327, "step": 440 }, { "epoch": 1.475799835931091, "grad_norm": 1.4405579132350896, "learning_rate": 6.0327120847794415e-06, "loss": 0.9461, "step": 450 }, { "epoch": 1.5086136177194422, "grad_norm": 1.5139954304026377, "learning_rate": 5.845258312473252e-06, "loss": 0.9479, "step": 460 }, { "epoch": 1.5414273995077932, "grad_norm": 1.5381263145609851, "learning_rate": 5.656573033437932e-06, "loss": 0.9217, "step": 470 }, { "epoch": 1.5742411812961445, "grad_norm": 1.5176651174503644, "learning_rate": 5.466931154394171e-06, "loss": 0.9402, "step": 480 }, { "epoch": 1.6070549630844955, "grad_norm": 1.64200137668156, "learning_rate": 5.276608975789683e-06, "loss": 0.925, "step": 490 }, { "epoch": 1.6398687448728466, "grad_norm": 1.4813527495999301, "learning_rate": 5.085883789240764e-06, "loss": 0.9268, "step": 500 }, { "epoch": 1.6726825266611978, "grad_norm": 1.5116269108263887, "learning_rate": 4.8950334735297746e-06, "loss": 0.9095, "step": 510 }, { "epoch": 1.7054963084495487, "grad_norm": 1.4490396822436944, "learning_rate": 4.704336089747135e-06, "loss": 0.9341, "step": 520 }, { "epoch": 1.7383100902379, "grad_norm": 1.5628125198488163, "learning_rate": 4.514069476167716e-06, "loss": 0.932, "step": 530 }, { "epoch": 1.771123872026251, "grad_norm": 1.4687886059027409, "learning_rate": 4.324510843451851e-06, "loss": 0.9311, "step": 540 }, { "epoch": 1.8039376538146021, "grad_norm": 1.5227999474981233, "learning_rate": 4.135936370760759e-06, "loss": 0.9046, "step": 550 }, { "epoch": 1.8367514356029533, "grad_norm": 1.4388610554454382, "learning_rate": 3.9486208033748315e-06, "loss": 0.9378, "step": 560 }, { "epoch": 1.8695652173913042, "grad_norm": 1.3581055291483148, "learning_rate": 3.762837052401004e-06, "loss": 0.9235, "step": 570 }, { "epoch": 1.9023789991796556, "grad_norm": 1.5099266943158, "learning_rate": 3.5788557971524695e-06, "loss": 0.9444, "step": 580 }, { "epoch": 1.9351927809680065, "grad_norm": 1.5169803759126903, "learning_rate": 3.3969450907799966e-06, "loss": 0.9279, "step": 590 }, { "epoch": 1.9680065627563577, "grad_norm": 1.3979513991441848, "learning_rate": 3.217369969729476e-06, "loss": 0.9115, "step": 600 }, { "epoch": 2.0032813781788352, "grad_norm": 2.105088244846462, "learning_rate": 3.0403920675946826e-06, "loss": 0.8327, "step": 610 }, { "epoch": 2.036095159967186, "grad_norm": 1.6690335971877808, "learning_rate": 2.8662692339278387e-06, "loss": 0.7782, "step": 620 }, { "epoch": 2.0689089417555375, "grad_norm": 1.705302067748598, "learning_rate": 2.6952551585633947e-06, "loss": 0.7875, "step": 630 }, { "epoch": 2.1017227235438884, "grad_norm": 1.6066988675182772, "learning_rate": 2.52759900200232e-06, "loss": 0.7708, "step": 640 }, { "epoch": 2.1345365053322394, "grad_norm": 1.5203406830167119, "learning_rate": 2.3635450323954773e-06, "loss": 0.7927, "step": 650 }, { "epoch": 2.1673502871205907, "grad_norm": 1.626389642459301, "learning_rate": 2.2033322696549197e-06, "loss": 0.7885, "step": 660 }, { "epoch": 2.2001640689089417, "grad_norm": 1.7664606984797573, "learning_rate": 2.0471941372116793e-06, "loss": 0.7626, "step": 670 }, { "epoch": 2.232977850697293, "grad_norm": 1.5793686740276833, "learning_rate": 1.8953581219273987e-06, "loss": 0.7754, "step": 680 }, { "epoch": 2.265791632485644, "grad_norm": 1.6724958298486645, "learning_rate": 1.7480454426552773e-06, "loss": 0.7783, "step": 690 }, { "epoch": 2.298605414273995, "grad_norm": 1.5819324740235794, "learning_rate": 1.6054707279332865e-06, "loss": 0.7705, "step": 700 }, { "epoch": 2.3314191960623463, "grad_norm": 1.6323457128335996, "learning_rate": 1.4678417032791653e-06, "loss": 0.7699, "step": 710 }, { "epoch": 2.364232977850697, "grad_norm": 1.7857816578985155, "learning_rate": 1.335358888542862e-06, "loss": 0.7526, "step": 720 }, { "epoch": 2.3970467596390486, "grad_norm": 1.5935449756290072, "learning_rate": 1.20821530575733e-06, "loss": 0.7918, "step": 730 }, { "epoch": 2.4298605414273995, "grad_norm": 1.693473193818772, "learning_rate": 1.0865961979133245e-06, "loss": 0.7815, "step": 740 }, { "epoch": 2.462674323215751, "grad_norm": 1.6977380314021282, "learning_rate": 9.706787590679685e-07, "loss": 0.7731, "step": 750 }, { "epoch": 2.495488105004102, "grad_norm": 1.701374033891568, "learning_rate": 8.606318761802584e-07, "loss": 0.7666, "step": 760 }, { "epoch": 2.5283018867924527, "grad_norm": 1.8239897351347403, "learning_rate": 7.566158830496917e-07, "loss": 0.7657, "step": 770 }, { "epoch": 2.561115668580804, "grad_norm": 1.7594557186973525, "learning_rate": 6.587823267164911e-07, "loss": 0.7798, "step": 780 }, { "epoch": 2.593929450369155, "grad_norm": 1.6074400677010736, "learning_rate": 5.672737466637701e-07, "loss": 0.7816, "step": 790 }, { "epoch": 2.626743232157506, "grad_norm": 1.589497024974972, "learning_rate": 4.822234671433552e-07, "loss": 0.7837, "step": 800 }, { "epoch": 2.6595570139458573, "grad_norm": 1.5745731469005892, "learning_rate": 4.03755402927804e-07, "loss": 0.7747, "step": 810 }, { "epoch": 2.6923707957342082, "grad_norm": 1.6156048098137663, "learning_rate": 3.319838787716634e-07, "loss": 0.7793, "step": 820 }, { "epoch": 2.7251845775225596, "grad_norm": 1.6695478870964964, "learning_rate": 2.6701346284499e-07, "loss": 0.7542, "step": 830 }, { "epoch": 2.7579983593109105, "grad_norm": 1.5107440160015784, "learning_rate": 2.0893881438180275e-07, "loss": 0.7844, "step": 840 }, { "epoch": 2.790812141099262, "grad_norm": 1.6477028499246302, "learning_rate": 1.578445457654637e-07, "loss": 0.7643, "step": 850 }, { "epoch": 2.823625922887613, "grad_norm": 1.5854989560185238, "learning_rate": 1.1380509925189853e-07, "loss": 0.7673, "step": 860 }, { "epoch": 2.8564397046759638, "grad_norm": 1.6787763051906506, "learning_rate": 7.688463851028227e-08, "loss": 0.769, "step": 870 }, { "epoch": 2.889253486464315, "grad_norm": 1.581911801204859, "learning_rate": 4.713695513920147e-08, "loss": 0.7799, "step": 880 }, { "epoch": 2.922067268252666, "grad_norm": 1.697466745242019, "learning_rate": 2.4605390294497043e-08, "loss": 0.785, "step": 890 }, { "epoch": 2.954881050041017, "grad_norm": 1.6425019567953216, "learning_rate": 9.322771542978892e-09, "loss": 0.7753, "step": 900 }, { "epoch": 2.9876948318293683, "grad_norm": 1.5783466963735138, "learning_rate": 1.3113650340046413e-09, "loss": 0.7911, "step": 910 }, { "epoch": 3.0, "step": 914, "total_flos": 181229773520896.0, "train_loss": 0.266992773216715, "train_runtime": 3529.0775, "train_samples_per_second": 16.577, "train_steps_per_second": 0.259 } ], "logging_steps": 10, "max_steps": 915, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 181229773520896.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }