{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 50, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.048154093097913325, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 3.2127, "mean_token_accuracy": 0.46168680985768634, "num_tokens": 75485.0, "step": 5 }, { "epoch": 0.09630818619582665, "grad_norm": 64.61959180680725, "learning_rate": 7.142857142857143e-07, "loss": 3.0841, "mean_token_accuracy": 0.475005774696668, "num_tokens": 152218.0, "step": 10 }, { "epoch": 0.14446227929373998, "grad_norm": 17.215265821295326, "learning_rate": 1.904761904761905e-06, "loss": 2.495, "mean_token_accuracy": 0.5029486671090126, "num_tokens": 229001.0, "step": 15 }, { "epoch": 0.1926163723916533, "grad_norm": 11.563556030723554, "learning_rate": 3.0952380952380957e-06, "loss": 2.2543, "mean_token_accuracy": 0.5284626389543216, "num_tokens": 312645.0, "step": 20 }, { "epoch": 0.24077046548956663, "grad_norm": 10.852610543715219, "learning_rate": 4.2857142857142855e-06, "loss": 2.2246, "mean_token_accuracy": 0.5318902805447578, "num_tokens": 386208.0, "step": 25 }, { "epoch": 0.28892455858747995, "grad_norm": 9.119415789634052, "learning_rate": 4.998588939359435e-06, "loss": 2.1391, "mean_token_accuracy": 0.5403730009992918, "num_tokens": 459892.0, "step": 30 }, { "epoch": 0.33707865168539325, "grad_norm": 9.556488008181512, "learning_rate": 4.982732794633588e-06, "loss": 2.0366, "mean_token_accuracy": 0.5523023113608361, "num_tokens": 539227.0, "step": 35 }, { "epoch": 0.3852327447833066, "grad_norm": 10.950335586706586, "learning_rate": 4.949368867399567e-06, "loss": 1.9687, "mean_token_accuracy": 0.5595992157856623, "num_tokens": 623998.0, "step": 40 }, { "epoch": 0.4333868378812199, "grad_norm": 13.153480823091519, "learning_rate": 4.8987324340362445e-06, "loss": 1.9844, "mean_token_accuracy": 0.553166281680266, "num_tokens": 699068.0, "step": 45 }, { "epoch": 0.48154093097913325, "grad_norm": 13.142079983198274, "learning_rate": 4.83118057351089e-06, "loss": 1.9087, "mean_token_accuracy": 0.5681869551539421, "num_tokens": 778660.0, "step": 50 }, { "epoch": 0.48154093097913325, "eval_loss": 2.167543411254883, "eval_mean_token_accuracy": 0.5312999839901079, "eval_num_tokens": 778660.0, "eval_runtime": 21.1924, "eval_samples_per_second": 39.92, "eval_steps_per_second": 6.653, "step": 50 }, { "epoch": 0.5296950240770465, "grad_norm": 12.256186585359213, "learning_rate": 4.747189649322894e-06, "loss": 1.8243, "mean_token_accuracy": 0.5790426706274351, "num_tokens": 851058.0, "step": 55 }, { "epoch": 0.5778491171749599, "grad_norm": 12.978284928577644, "learning_rate": 4.647351950274548e-06, "loss": 1.7875, "mean_token_accuracy": 0.5851869150996208, "num_tokens": 924603.0, "step": 60 }, { "epoch": 0.6260032102728732, "grad_norm": 13.029623166002976, "learning_rate": 4.532371513757564e-06, "loss": 1.7335, "mean_token_accuracy": 0.5922025551398595, "num_tokens": 1003191.0, "step": 65 }, { "epoch": 0.6741573033707865, "grad_norm": 12.579451224581598, "learning_rate": 4.403059161008762e-06, "loss": 1.6306, "mean_token_accuracy": 0.6137229397892952, "num_tokens": 1084380.0, "step": 70 }, { "epoch": 0.7223113964686998, "grad_norm": 8.781697503753147, "learning_rate": 4.2603267793453925e-06, "loss": 1.6176, "mean_token_accuracy": 0.6137419521808625, "num_tokens": 1158912.0, "step": 75 }, { "epoch": 0.7704654895666132, "grad_norm": 10.353422421141829, "learning_rate": 4.105180891700746e-06, "loss": 1.6269, "mean_token_accuracy": 0.6146789371967316, "num_tokens": 1233420.0, "step": 80 }, { "epoch": 0.8186195826645265, "grad_norm": 11.13618005328121, "learning_rate": 3.938715558806525e-06, "loss": 1.5826, "mean_token_accuracy": 0.6222126225630442, "num_tokens": 1304754.0, "step": 85 }, { "epoch": 0.8667736757624398, "grad_norm": 9.752065879303624, "learning_rate": 3.7621046640744973e-06, "loss": 1.4856, "mean_token_accuracy": 0.63288747270902, "num_tokens": 1379628.0, "step": 90 }, { "epoch": 0.9149277688603531, "grad_norm": 11.231583007810258, "learning_rate": 3.5765936355830353e-06, "loss": 1.4762, "mean_token_accuracy": 0.6418172031641006, "num_tokens": 1457505.0, "step": 95 }, { "epoch": 0.9630818619582665, "grad_norm": 11.752764891234037, "learning_rate": 3.3834906635436355e-06, "loss": 1.4247, "mean_token_accuracy": 0.6503905127445857, "num_tokens": 1534494.0, "step": 100 }, { "epoch": 0.9630818619582665, "eval_loss": 2.2062435150146484, "eval_mean_token_accuracy": 0.5240506884899545, "eval_num_tokens": 1534494.0, "eval_runtime": 21.3286, "eval_samples_per_second": 39.665, "eval_steps_per_second": 6.611, "step": 100 }, { "epoch": 1.0096308186195826, "grad_norm": 14.993373196343875, "learning_rate": 3.184157475180208e-06, "loss": 1.3243, "mean_token_accuracy": 0.6802589081484696, "num_tokens": 1612839.0, "step": 105 }, { "epoch": 1.057784911717496, "grad_norm": 14.998634173948343, "learning_rate": 2.9799997320750506e-06, "loss": 0.8052, "mean_token_accuracy": 0.7905437111854553, "num_tokens": 1694998.0, "step": 110 }, { "epoch": 1.1059390048154094, "grad_norm": 10.81772163666591, "learning_rate": 2.7724571176976734e-06, "loss": 0.7258, "mean_token_accuracy": 0.8077059368292491, "num_tokens": 1772379.0, "step": 115 }, { "epoch": 1.1540930979133226, "grad_norm": 10.11949536342127, "learning_rate": 2.562993185017431e-06, "loss": 0.7364, "mean_token_accuracy": 0.8070536454518636, "num_tokens": 1849604.0, "step": 120 }, { "epoch": 1.202247191011236, "grad_norm": 13.125126385789981, "learning_rate": 2.3530850357927563e-06, "loss": 0.6751, "mean_token_accuracy": 0.8201006094614665, "num_tokens": 1928130.0, "step": 125 }, { "epoch": 1.2504012841091492, "grad_norm": 11.420765028503823, "learning_rate": 2.1442129043167877e-06, "loss": 0.661, "mean_token_accuracy": 0.823780287305514, "num_tokens": 1998942.0, "step": 130 }, { "epoch": 1.2985553772070626, "grad_norm": 11.889207346196013, "learning_rate": 1.937849719072931e-06, "loss": 0.6308, "mean_token_accuracy": 0.8344957828521729, "num_tokens": 2077600.0, "step": 135 }, { "epoch": 1.346709470304976, "grad_norm": 12.096248248433344, "learning_rate": 1.7354507159096649e-06, "loss": 0.6068, "mean_token_accuracy": 0.8379041075706481, "num_tokens": 2156924.0, "step": 140 }, { "epoch": 1.3948635634028892, "grad_norm": 11.293697729091125, "learning_rate": 1.5384431759806085e-06, "loss": 0.5968, "mean_token_accuracy": 0.8397696912288666, "num_tokens": 2231114.0, "step": 145 }, { "epoch": 1.4430176565008026, "grad_norm": 9.124895851710205, "learning_rate": 1.348216360816041e-06, "loss": 0.5947, "mean_token_accuracy": 0.8452269424994786, "num_tokens": 2309259.0, "step": 150 }, { "epoch": 1.4430176565008026, "eval_loss": 2.382331132888794, "eval_mean_token_accuracy": 0.5126862906395121, "eval_num_tokens": 2309259.0, "eval_runtime": 21.3723, "eval_samples_per_second": 39.584, "eval_steps_per_second": 6.597, "step": 150 }, { "epoch": 1.491171749598716, "grad_norm": 11.6588250049219, "learning_rate": 1.1661117155019295e-06, "loss": 0.5642, "mean_token_accuracy": 0.8507290641466777, "num_tokens": 2391120.0, "step": 155 }, { "epoch": 1.5393258426966292, "grad_norm": 10.660140323644573, "learning_rate": 9.934134090518593e-07, "loss": 0.5677, "mean_token_accuracy": 0.8485025237003962, "num_tokens": 2474474.0, "step": 160 }, { "epoch": 1.5874799357945424, "grad_norm": 10.626219804281234, "learning_rate": 8.313392786794833e-07, "loss": 0.5783, "mean_token_accuracy": 0.8521494368712107, "num_tokens": 2554920.0, "step": 165 }, { "epoch": 1.635634028892456, "grad_norm": 10.159951789201457, "learning_rate": 6.810322418308085e-07, "loss": 0.5231, "mean_token_accuracy": 0.8625788191954294, "num_tokens": 2631940.0, "step": 170 }, { "epoch": 1.6837881219903692, "grad_norm": 11.3836838523216, "learning_rate": 5.435522365371376e-07, "loss": 0.4797, "mean_token_accuracy": 0.8722179641326269, "num_tokens": 2704574.0, "step": 175 }, { "epoch": 1.7319422150882824, "grad_norm": 14.381826637802636, "learning_rate": 4.198687469238297e-07, "loss": 0.5057, "mean_token_accuracy": 0.8718133926391601, "num_tokens": 2779644.0, "step": 180 }, { "epoch": 1.7800963081861958, "grad_norm": 10.843940846678331, "learning_rate": 3.1085396658363884e-07, "loss": 0.4763, "mean_token_accuracy": 0.8735970060030619, "num_tokens": 2858883.0, "step": 185 }, { "epoch": 1.8282504012841092, "grad_norm": 11.83942523855346, "learning_rate": 2.1727664802529218e-07, "loss": 0.4585, "mean_token_accuracy": 0.8757681508858999, "num_tokens": 2935988.0, "step": 190 }, { "epoch": 1.8764044943820224, "grad_norm": 10.716830524055672, "learning_rate": 1.3979668156987424e-07, "loss": 0.4513, "mean_token_accuracy": 0.8814116775989532, "num_tokens": 3008300.0, "step": 195 }, { "epoch": 1.9245585874799358, "grad_norm": 11.700054619778108, "learning_rate": 7.896044192366587e-08, "loss": 0.465, "mean_token_accuracy": 0.8805319319168726, "num_tokens": 3082266.0, "step": 200 }, { "epoch": 1.9245585874799358, "eval_loss": 2.468674898147583, "eval_mean_token_accuracy": 0.5063806645413662, "eval_num_tokens": 3082266.0, "eval_runtime": 21.1843, "eval_samples_per_second": 39.935, "eval_steps_per_second": 6.656, "step": 200 }, { "epoch": 1.9727126805778492, "grad_norm": 10.325385037101356, "learning_rate": 3.51969352425624e-08, "loss": 0.4718, "mean_token_accuracy": 0.8750891566276551, "num_tokens": 3153878.0, "step": 205 }, { "epoch": 2.0, "mean_token_accuracy": 0.8762102390036863, "num_tokens": 3195825.0, "step": 208, "total_flos": 9184917397504.0, "train_loss": 1.2657821722901785, "train_runtime": 1593.4425, "train_samples_per_second": 9.381, "train_steps_per_second": 0.131 } ], "logging_steps": 5, "max_steps": 208, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9184917397504.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }