{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 25, "global_step": 295, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 16.34368133544922, "learning_rate": 0.0, "loss": 1.6295, "num_input_tokens_seen": 112224, "step": 1, "train_runtime": 24.172, "train_tokens_per_second": 4642.725 }, { "epoch": 0.03389830508474576, "grad_norm": 15.402971267700195, "learning_rate": 1.6666666666666667e-06, "loss": 1.6581, "num_input_tokens_seen": 221080, "step": 2, "train_runtime": 34.4875, "train_tokens_per_second": 6410.436 }, { "epoch": 0.05084745762711865, "grad_norm": 14.902987480163574, "learning_rate": 3.3333333333333333e-06, "loss": 1.6797, "num_input_tokens_seen": 326656, "step": 3, "train_runtime": 44.7565, "train_tokens_per_second": 7298.518 }, { "epoch": 0.06779661016949153, "grad_norm": 9.34762954711914, "learning_rate": 5e-06, "loss": 1.4077, "num_input_tokens_seen": 433832, "step": 4, "train_runtime": 55.089, "train_tokens_per_second": 7875.117 }, { "epoch": 0.0847457627118644, "grad_norm": 5.879909515380859, "learning_rate": 6.666666666666667e-06, "loss": 1.3219, "num_input_tokens_seen": 548184, "step": 5, "train_runtime": 65.5147, "train_tokens_per_second": 8367.344 }, { "epoch": 0.1016949152542373, "grad_norm": 2.9537651538848877, "learning_rate": 8.333333333333334e-06, "loss": 1.14, "num_input_tokens_seen": 655632, "step": 6, "train_runtime": 75.8973, "train_tokens_per_second": 8638.415 }, { "epoch": 0.11864406779661017, "grad_norm": 2.5423429012298584, "learning_rate": 1e-05, "loss": 1.0531, "num_input_tokens_seen": 759704, "step": 7, "train_runtime": 85.9306, "train_tokens_per_second": 8840.903 }, { "epoch": 0.13559322033898305, "grad_norm": 1.7696669101715088, "learning_rate": 1.1666666666666668e-05, "loss": 0.9491, "num_input_tokens_seen": 875624, "step": 8, "train_runtime": 96.0912, "train_tokens_per_second": 9112.426 }, { "epoch": 0.15254237288135594, "grad_norm": 1.8822790384292603, "learning_rate": 1.3333333333333333e-05, "loss": 0.9397, "num_input_tokens_seen": 989656, "step": 9, "train_runtime": 106.4674, "train_tokens_per_second": 9295.392 }, { "epoch": 0.1694915254237288, "grad_norm": 1.5840418338775635, "learning_rate": 1.5e-05, "loss": 0.8694, "num_input_tokens_seen": 1092152, "step": 10, "train_runtime": 116.3444, "train_tokens_per_second": 9387.229 }, { "epoch": 0.1864406779661017, "grad_norm": 1.7065584659576416, "learning_rate": 1.6666666666666667e-05, "loss": 0.8185, "num_input_tokens_seen": 1204856, "step": 11, "train_runtime": 126.69, "train_tokens_per_second": 9510.268 }, { "epoch": 0.2033898305084746, "grad_norm": 1.621471643447876, "learning_rate": 1.8333333333333333e-05, "loss": 0.7581, "num_input_tokens_seen": 1302968, "step": 12, "train_runtime": 136.9093, "train_tokens_per_second": 9517.016 }, { "epoch": 0.22033898305084745, "grad_norm": 1.5125755071640015, "learning_rate": 2e-05, "loss": 0.7413, "num_input_tokens_seen": 1405520, "step": 13, "train_runtime": 147.1109, "train_tokens_per_second": 9554.152 }, { "epoch": 0.23728813559322035, "grad_norm": 1.5941375494003296, "learning_rate": 2.1666666666666667e-05, "loss": 0.719, "num_input_tokens_seen": 1510384, "step": 14, "train_runtime": 157.3295, "train_tokens_per_second": 9600.131 }, { "epoch": 0.2542372881355932, "grad_norm": 1.9046858549118042, "learning_rate": 2.3333333333333336e-05, "loss": 0.7001, "num_input_tokens_seen": 1615456, "step": 15, "train_runtime": 167.5094, "train_tokens_per_second": 9643.97 }, { "epoch": 0.2711864406779661, "grad_norm": 1.1321921348571777, "learning_rate": 2.5e-05, "loss": 0.682, "num_input_tokens_seen": 1720464, "step": 16, "train_runtime": 177.392, "train_tokens_per_second": 9698.654 }, { "epoch": 0.288135593220339, "grad_norm": 1.1452239751815796, "learning_rate": 2.6666666666666667e-05, "loss": 0.6806, "num_input_tokens_seen": 1825984, "step": 17, "train_runtime": 187.6572, "train_tokens_per_second": 9730.423 }, { "epoch": 0.3050847457627119, "grad_norm": 1.1809570789337158, "learning_rate": 2.8333333333333335e-05, "loss": 0.6672, "num_input_tokens_seen": 1925424, "step": 18, "train_runtime": 197.7852, "train_tokens_per_second": 9734.926 }, { "epoch": 0.3220338983050847, "grad_norm": 1.0847970247268677, "learning_rate": 3e-05, "loss": 0.6324, "num_input_tokens_seen": 2024656, "step": 19, "train_runtime": 207.8433, "train_tokens_per_second": 9741.263 }, { "epoch": 0.3389830508474576, "grad_norm": 1.0256402492523193, "learning_rate": 3.1666666666666666e-05, "loss": 0.6849, "num_input_tokens_seen": 2123808, "step": 20, "train_runtime": 217.9334, "train_tokens_per_second": 9745.216 }, { "epoch": 0.3559322033898305, "grad_norm": 0.9755237102508545, "learning_rate": 3.3333333333333335e-05, "loss": 0.6998, "num_input_tokens_seen": 2236536, "step": 21, "train_runtime": 228.3115, "train_tokens_per_second": 9795.987 }, { "epoch": 0.3728813559322034, "grad_norm": 1.146010398864746, "learning_rate": 3.5e-05, "loss": 0.6178, "num_input_tokens_seen": 2345176, "step": 22, "train_runtime": 238.4088, "train_tokens_per_second": 9836.786 }, { "epoch": 0.3898305084745763, "grad_norm": 1.0620143413543701, "learning_rate": 3.6666666666666666e-05, "loss": 0.6417, "num_input_tokens_seen": 2450632, "step": 23, "train_runtime": 248.3988, "train_tokens_per_second": 9865.715 }, { "epoch": 0.4067796610169492, "grad_norm": 1.7118717432022095, "learning_rate": 3.8333333333333334e-05, "loss": 0.681, "num_input_tokens_seen": 2560968, "step": 24, "train_runtime": 258.705, "train_tokens_per_second": 9899.182 }, { "epoch": 0.423728813559322, "grad_norm": 1.1660561561584473, "learning_rate": 4e-05, "loss": 0.6877, "num_input_tokens_seen": 2673304, "step": 25, "train_runtime": 269.0249, "train_tokens_per_second": 9937.014 }, { "epoch": 0.423728813559322, "eval_accuracy": 0.8128813839238969, "eval_loss": 0.671046793460846, "eval_runtime": 4.3547, "eval_samples_per_second": 11.482, "eval_steps_per_second": 2.985, "num_input_tokens_seen": 2673304, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 0.9917629957199097, "learning_rate": 4.166666666666667e-05, "loss": 0.6362, "num_input_tokens_seen": 2769736, "step": 26, "train_runtime": 283.1953, "train_tokens_per_second": 9780.303 }, { "epoch": 0.4576271186440678, "grad_norm": 1.1648198366165161, "learning_rate": 4.3333333333333334e-05, "loss": 0.6535, "num_input_tokens_seen": 2875984, "step": 27, "train_runtime": 293.3704, "train_tokens_per_second": 9803.251 }, { "epoch": 0.4745762711864407, "grad_norm": 1.1746153831481934, "learning_rate": 4.5e-05, "loss": 0.7158, "num_input_tokens_seen": 2986112, "step": 28, "train_runtime": 303.5965, "train_tokens_per_second": 9835.791 }, { "epoch": 0.4915254237288136, "grad_norm": 1.2997013330459595, "learning_rate": 4.666666666666667e-05, "loss": 0.6744, "num_input_tokens_seen": 3093880, "step": 29, "train_runtime": 313.983, "train_tokens_per_second": 9853.654 }, { "epoch": 0.5084745762711864, "grad_norm": 1.8045681715011597, "learning_rate": 4.8333333333333334e-05, "loss": 0.6442, "num_input_tokens_seen": 3202720, "step": 30, "train_runtime": 324.0355, "train_tokens_per_second": 9883.855 }, { "epoch": 0.5254237288135594, "grad_norm": 0.9538795351982117, "learning_rate": 5e-05, "loss": 0.6758, "num_input_tokens_seen": 3311816, "step": 31, "train_runtime": 334.371, "train_tokens_per_second": 9904.616 }, { "epoch": 0.5423728813559322, "grad_norm": 1.0621432065963745, "learning_rate": 4.999824323801887e-05, "loss": 0.7101, "num_input_tokens_seen": 3415424, "step": 32, "train_runtime": 344.5683, "train_tokens_per_second": 9912.182 }, { "epoch": 0.559322033898305, "grad_norm": 0.9179059267044067, "learning_rate": 4.9992973198972505e-05, "loss": 0.6392, "num_input_tokens_seen": 3524136, "step": 33, "train_runtime": 354.8912, "train_tokens_per_second": 9930.186 }, { "epoch": 0.576271186440678, "grad_norm": 0.9196195006370544, "learning_rate": 4.998419062351724e-05, "loss": 0.5893, "num_input_tokens_seen": 3631528, "step": 34, "train_runtime": 365.0364, "train_tokens_per_second": 9948.4 }, { "epoch": 0.5932203389830508, "grad_norm": 1.0634722709655762, "learning_rate": 4.997189674596463e-05, "loss": 0.6018, "num_input_tokens_seen": 3740096, "step": 35, "train_runtime": 375.4073, "train_tokens_per_second": 9962.768 }, { "epoch": 0.6101694915254238, "grad_norm": 0.7719584703445435, "learning_rate": 4.995609329410804e-05, "loss": 0.6345, "num_input_tokens_seen": 3846680, "step": 36, "train_runtime": 385.614, "train_tokens_per_second": 9975.469 }, { "epoch": 0.6271186440677966, "grad_norm": 0.8835194706916809, "learning_rate": 4.993678248897972e-05, "loss": 0.6499, "num_input_tokens_seen": 3951992, "step": 37, "train_runtime": 395.8528, "train_tokens_per_second": 9983.489 }, { "epoch": 0.6440677966101694, "grad_norm": 0.8042682409286499, "learning_rate": 4.9913967044538734e-05, "loss": 0.6397, "num_input_tokens_seen": 4058880, "step": 38, "train_runtime": 406.182, "train_tokens_per_second": 9992.761 }, { "epoch": 0.6610169491525424, "grad_norm": 0.8412677049636841, "learning_rate": 4.9887650167289525e-05, "loss": 0.6596, "num_input_tokens_seen": 4152608, "step": 39, "train_runtime": 416.0998, "train_tokens_per_second": 9979.836 }, { "epoch": 0.6779661016949152, "grad_norm": 0.8093813061714172, "learning_rate": 4.985783555583123e-05, "loss": 0.5761, "num_input_tokens_seen": 4261232, "step": 40, "train_runtime": 426.049, "train_tokens_per_second": 10001.741 }, { "epoch": 0.6949152542372882, "grad_norm": 0.7076805233955383, "learning_rate": 4.982452740033793e-05, "loss": 0.65, "num_input_tokens_seen": 4368744, "step": 41, "train_runtime": 436.3634, "train_tokens_per_second": 10011.711 }, { "epoch": 0.711864406779661, "grad_norm": 0.8573846220970154, "learning_rate": 4.978773038196972e-05, "loss": 0.6319, "num_input_tokens_seen": 4470520, "step": 42, "train_runtime": 446.562, "train_tokens_per_second": 10010.972 }, { "epoch": 0.7288135593220338, "grad_norm": 0.8405332565307617, "learning_rate": 4.974744967221483e-05, "loss": 0.6236, "num_input_tokens_seen": 4575488, "step": 43, "train_runtime": 456.8737, "train_tokens_per_second": 10014.777 }, { "epoch": 0.7457627118644068, "grad_norm": 0.816683292388916, "learning_rate": 4.9703690932162824e-05, "loss": 0.6215, "num_input_tokens_seen": 4680592, "step": 44, "train_runtime": 467.0937, "train_tokens_per_second": 10020.671 }, { "epoch": 0.7627118644067796, "grad_norm": 0.797042727470398, "learning_rate": 4.9656460311708963e-05, "loss": 0.6175, "num_input_tokens_seen": 4783320, "step": 45, "train_runtime": 477.1334, "train_tokens_per_second": 10025.122 }, { "epoch": 0.7796610169491526, "grad_norm": 0.7709528803825378, "learning_rate": 4.960576444868992e-05, "loss": 0.5959, "num_input_tokens_seen": 4883848, "step": 46, "train_runtime": 487.3769, "train_tokens_per_second": 10020.681 }, { "epoch": 0.7966101694915254, "grad_norm": 0.7377520799636841, "learning_rate": 4.955161046795088e-05, "loss": 0.6428, "num_input_tokens_seen": 4995680, "step": 47, "train_runtime": 497.7583, "train_tokens_per_second": 10036.358 }, { "epoch": 0.8135593220338984, "grad_norm": 0.7387161254882812, "learning_rate": 4.9494005980344194e-05, "loss": 0.6275, "num_input_tokens_seen": 5106208, "step": 48, "train_runtime": 507.8631, "train_tokens_per_second": 10054.3 }, { "epoch": 0.8305084745762712, "grad_norm": 0.6989988088607788, "learning_rate": 4.943295908165977e-05, "loss": 0.5977, "num_input_tokens_seen": 5215272, "step": 49, "train_runtime": 518.169, "train_tokens_per_second": 10064.81 }, { "epoch": 0.847457627118644, "grad_norm": 0.6940891146659851, "learning_rate": 4.936847835148725e-05, "loss": 0.5901, "num_input_tokens_seen": 5320152, "step": 50, "train_runtime": 528.4296, "train_tokens_per_second": 10067.853 }, { "epoch": 0.847457627118644, "eval_accuracy": 0.8260805511105289, "eval_loss": 0.5966207981109619, "eval_runtime": 4.3377, "eval_samples_per_second": 11.527, "eval_steps_per_second": 2.997, "num_input_tokens_seen": 5320152, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 0.7691423892974854, "learning_rate": 4.930057285201027e-05, "loss": 0.6284, "num_input_tokens_seen": 5424336, "step": 51, "train_runtime": 543.1554, "train_tokens_per_second": 9986.711 }, { "epoch": 0.8813559322033898, "grad_norm": 0.7801838517189026, "learning_rate": 4.9229252126732814e-05, "loss": 0.6046, "num_input_tokens_seen": 5536400, "step": 52, "train_runtime": 553.5015, "train_tokens_per_second": 10002.503 }, { "epoch": 0.8983050847457628, "grad_norm": 0.7027463316917419, "learning_rate": 4.9154526199137964e-05, "loss": 0.5333, "num_input_tokens_seen": 5647488, "step": 53, "train_runtime": 563.8958, "train_tokens_per_second": 10015.127 }, { "epoch": 0.9152542372881356, "grad_norm": 0.6528388261795044, "learning_rate": 4.9076405571279207e-05, "loss": 0.6442, "num_input_tokens_seen": 5751248, "step": 54, "train_runtime": 574.1388, "train_tokens_per_second": 10017.173 }, { "epoch": 0.9322033898305084, "grad_norm": 0.8217389583587646, "learning_rate": 4.8994901222304465e-05, "loss": 0.663, "num_input_tokens_seen": 5858104, "step": 55, "train_runtime": 584.4942, "train_tokens_per_second": 10022.518 }, { "epoch": 0.9491525423728814, "grad_norm": 0.6466934084892273, "learning_rate": 4.891002460691306e-05, "loss": 0.5856, "num_input_tokens_seen": 5958920, "step": 56, "train_runtime": 594.703, "train_tokens_per_second": 10019.993 }, { "epoch": 0.9661016949152542, "grad_norm": 0.6264472603797913, "learning_rate": 4.882178765374589e-05, "loss": 0.5298, "num_input_tokens_seen": 6067168, "step": 57, "train_runtime": 605.0659, "train_tokens_per_second": 10027.285 }, { "epoch": 0.9830508474576272, "grad_norm": 0.6835854053497314, "learning_rate": 4.87302027637089e-05, "loss": 0.6364, "num_input_tokens_seen": 6164248, "step": 58, "train_runtime": 615.1097, "train_tokens_per_second": 10021.38 }, { "epoch": 1.0, "grad_norm": 0.715363085269928, "learning_rate": 4.863528280823033e-05, "loss": 0.5508, "num_input_tokens_seen": 6269656, "step": 59, "train_runtime": 625.2947, "train_tokens_per_second": 10026.722 }, { "epoch": 1.0169491525423728, "grad_norm": 0.6624979972839355, "learning_rate": 4.853704112745172e-05, "loss": 0.5221, "num_input_tokens_seen": 6377200, "step": 60, "train_runtime": 635.7108, "train_tokens_per_second": 10031.606 }, { "epoch": 1.0338983050847457, "grad_norm": 0.7150872349739075, "learning_rate": 4.8435491528353026e-05, "loss": 0.5232, "num_input_tokens_seen": 6478784, "step": 61, "train_runtime": 645.6922, "train_tokens_per_second": 10033.858 }, { "epoch": 1.0508474576271187, "grad_norm": 0.643916666507721, "learning_rate": 4.833064828281225e-05, "loss": 0.4441, "num_input_tokens_seen": 6594704, "step": 62, "train_runtime": 656.0298, "train_tokens_per_second": 10052.445 }, { "epoch": 1.0677966101694916, "grad_norm": 0.6635810732841492, "learning_rate": 4.822252612559961e-05, "loss": 0.5256, "num_input_tokens_seen": 6694384, "step": 63, "train_runtime": 665.4726, "train_tokens_per_second": 10059.593 }, { "epoch": 1.0847457627118644, "grad_norm": 0.7547369003295898, "learning_rate": 4.811114025230672e-05, "loss": 0.518, "num_input_tokens_seen": 6795736, "step": 64, "train_runtime": 675.6098, "train_tokens_per_second": 10058.67 }, { "epoch": 1.1016949152542372, "grad_norm": 0.6073519587516785, "learning_rate": 4.799650631721096e-05, "loss": 0.5539, "num_input_tokens_seen": 6903368, "step": 65, "train_runtime": 685.8813, "train_tokens_per_second": 10064.96 }, { "epoch": 1.11864406779661, "grad_norm": 0.6640107035636902, "learning_rate": 4.787864043107546e-05, "loss": 0.4718, "num_input_tokens_seen": 7002776, "step": 66, "train_runtime": 695.8898, "train_tokens_per_second": 10063.052 }, { "epoch": 1.1355932203389831, "grad_norm": 0.6934003829956055, "learning_rate": 4.775755915888483e-05, "loss": 0.4699, "num_input_tokens_seen": 7095696, "step": 67, "train_runtime": 705.6961, "train_tokens_per_second": 10054.89 }, { "epoch": 1.152542372881356, "grad_norm": 0.7117975950241089, "learning_rate": 4.763327951751711e-05, "loss": 0.5381, "num_input_tokens_seen": 7206656, "step": 68, "train_runtime": 716.9173, "train_tokens_per_second": 10052.283 }, { "epoch": 1.1694915254237288, "grad_norm": 0.7451320886611938, "learning_rate": 4.750581897335222e-05, "loss": 0.4825, "num_input_tokens_seen": 7324712, "step": 69, "train_runtime": 728.3753, "train_tokens_per_second": 10056.234 }, { "epoch": 1.1864406779661016, "grad_norm": 0.6758464574813843, "learning_rate": 4.737519543981721e-05, "loss": 0.5406, "num_input_tokens_seen": 7436408, "step": 70, "train_runtime": 739.8095, "train_tokens_per_second": 10051.788 }, { "epoch": 1.2033898305084745, "grad_norm": 0.6071574091911316, "learning_rate": 4.724142727486869e-05, "loss": 0.5419, "num_input_tokens_seen": 7537992, "step": 71, "train_runtime": 750.8169, "train_tokens_per_second": 10039.721 }, { "epoch": 1.2203389830508475, "grad_norm": 0.5895605087280273, "learning_rate": 4.7104533278412763e-05, "loss": 0.5035, "num_input_tokens_seen": 7651968, "step": 72, "train_runtime": 762.2806, "train_tokens_per_second": 10038.256 }, { "epoch": 1.2372881355932204, "grad_norm": 0.6282557845115662, "learning_rate": 4.696453268966291e-05, "loss": 0.529, "num_input_tokens_seen": 7748848, "step": 73, "train_runtime": 773.5666, "train_tokens_per_second": 10017.04 }, { "epoch": 1.2542372881355932, "grad_norm": 0.6695516705513, "learning_rate": 4.6821445184436066e-05, "loss": 0.4829, "num_input_tokens_seen": 7845760, "step": 74, "train_runtime": 783.5687, "train_tokens_per_second": 10012.856 }, { "epoch": 1.271186440677966, "grad_norm": 0.5903871059417725, "learning_rate": 4.667529087238736e-05, "loss": 0.4792, "num_input_tokens_seen": 7948872, "step": 75, "train_runtime": 793.8129, "train_tokens_per_second": 10013.534 }, { "epoch": 1.271186440677966, "eval_accuracy": 0.8345456753981751, "eval_loss": 0.5687937140464783, "eval_runtime": 4.3453, "eval_samples_per_second": 11.507, "eval_steps_per_second": 2.992, "num_input_tokens_seen": 7948872, "step": 75 }, { "epoch": 1.288135593220339, "grad_norm": 0.5066044330596924, "learning_rate": 4.652609029418389e-05, "loss": 0.4475, "num_input_tokens_seen": 8053096, "step": 76, "train_runtime": 808.4485, "train_tokens_per_second": 9961.174 }, { "epoch": 1.305084745762712, "grad_norm": 0.7508769035339355, "learning_rate": 4.6373864418617935e-05, "loss": 0.4814, "num_input_tokens_seen": 8165544, "step": 77, "train_runtime": 818.7742, "train_tokens_per_second": 9972.888 }, { "epoch": 1.3220338983050848, "grad_norm": 0.6583501100540161, "learning_rate": 4.6218634639659954e-05, "loss": 0.551, "num_input_tokens_seen": 8280040, "step": 78, "train_runtime": 829.0981, "train_tokens_per_second": 9986.803 }, { "epoch": 1.3389830508474576, "grad_norm": 0.5735164284706116, "learning_rate": 4.606042277345185e-05, "loss": 0.4689, "num_input_tokens_seen": 8380248, "step": 79, "train_runtime": 839.0856, "train_tokens_per_second": 9987.357 }, { "epoch": 1.3559322033898304, "grad_norm": 0.6262232065200806, "learning_rate": 4.5899251055240963e-05, "loss": 0.4489, "num_input_tokens_seen": 8488848, "step": 80, "train_runtime": 849.3643, "train_tokens_per_second": 9994.354 }, { "epoch": 1.3728813559322033, "grad_norm": 0.7844845652580261, "learning_rate": 4.573514213625505e-05, "loss": 0.593, "num_input_tokens_seen": 8593232, "step": 81, "train_runtime": 859.579, "train_tokens_per_second": 9997.024 }, { "epoch": 1.3898305084745763, "grad_norm": 0.6617870926856995, "learning_rate": 4.5568119080518864e-05, "loss": 0.5531, "num_input_tokens_seen": 8692096, "step": 82, "train_runtime": 869.7519, "train_tokens_per_second": 9993.765 }, { "epoch": 1.4067796610169492, "grad_norm": 0.621767520904541, "learning_rate": 4.539820536161278e-05, "loss": 0.4688, "num_input_tokens_seen": 8802024, "step": 83, "train_runtime": 880.1033, "train_tokens_per_second": 10001.126 }, { "epoch": 1.423728813559322, "grad_norm": 0.7459584474563599, "learning_rate": 4.522542485937369e-05, "loss": 0.4728, "num_input_tokens_seen": 8915296, "step": 84, "train_runtime": 890.2946, "train_tokens_per_second": 10013.871 }, { "epoch": 1.4406779661016949, "grad_norm": 0.711216390132904, "learning_rate": 4.504980185653899e-05, "loss": 0.5405, "num_input_tokens_seen": 9018176, "step": 85, "train_runtime": 900.7028, "train_tokens_per_second": 10012.377 }, { "epoch": 1.457627118644068, "grad_norm": 0.6414653062820435, "learning_rate": 4.4871361035333836e-05, "loss": 0.4356, "num_input_tokens_seen": 9127880, "step": 86, "train_runtime": 911.0595, "train_tokens_per_second": 10018.972 }, { "epoch": 1.4745762711864407, "grad_norm": 0.6491935849189758, "learning_rate": 4.469012747400227e-05, "loss": 0.4982, "num_input_tokens_seen": 9241680, "step": 87, "train_runtime": 921.4507, "train_tokens_per_second": 10029.49 }, { "epoch": 1.4915254237288136, "grad_norm": 0.6951228380203247, "learning_rate": 4.450612664328271e-05, "loss": 0.535, "num_input_tokens_seen": 9349080, "step": 88, "train_runtime": 931.7221, "train_tokens_per_second": 10034.194 }, { "epoch": 1.5084745762711864, "grad_norm": 0.5840334892272949, "learning_rate": 4.431938440282828e-05, "loss": 0.4983, "num_input_tokens_seen": 9460424, "step": 89, "train_runtime": 942.087, "train_tokens_per_second": 10041.986 }, { "epoch": 1.5254237288135593, "grad_norm": 0.5770622491836548, "learning_rate": 4.412992699757244e-05, "loss": 0.4654, "num_input_tokens_seen": 9577240, "step": 90, "train_runtime": 952.4898, "train_tokens_per_second": 10054.953 }, { "epoch": 1.542372881355932, "grad_norm": 0.673603355884552, "learning_rate": 4.3937781054040505e-05, "loss": 0.4369, "num_input_tokens_seen": 9680760, "step": 91, "train_runtime": 962.7675, "train_tokens_per_second": 10055.138 }, { "epoch": 1.559322033898305, "grad_norm": 0.6873490214347839, "learning_rate": 4.374297357660756e-05, "loss": 0.4837, "num_input_tokens_seen": 9791984, "step": 92, "train_runtime": 973.1429, "train_tokens_per_second": 10062.226 }, { "epoch": 1.576271186440678, "grad_norm": 0.5499905943870544, "learning_rate": 4.354553194370321e-05, "loss": 0.4735, "num_input_tokens_seen": 9895992, "step": 93, "train_runtime": 983.411, "train_tokens_per_second": 10062.926 }, { "epoch": 1.5932203389830508, "grad_norm": 0.553594172000885, "learning_rate": 4.334548390396377e-05, "loss": 0.4411, "num_input_tokens_seen": 10007400, "step": 94, "train_runtime": 993.6791, "train_tokens_per_second": 10071.058 }, { "epoch": 1.6101694915254239, "grad_norm": 0.5988419651985168, "learning_rate": 4.3142857572332504e-05, "loss": 0.4829, "num_input_tokens_seen": 10122232, "step": 95, "train_runtime": 1003.9748, "train_tokens_per_second": 10082.157 }, { "epoch": 1.6271186440677967, "grad_norm": 0.5911242961883545, "learning_rate": 4.293768142610828e-05, "loss": 0.4373, "num_input_tokens_seen": 10211000, "step": 96, "train_runtime": 1013.7445, "train_tokens_per_second": 10072.557 }, { "epoch": 1.6440677966101696, "grad_norm": 0.640306830406189, "learning_rate": 4.272998430094334e-05, "loss": 0.4391, "num_input_tokens_seen": 10323768, "step": 97, "train_runtime": 1024.1228, "train_tokens_per_second": 10080.596 }, { "epoch": 1.6610169491525424, "grad_norm": 0.6589894890785217, "learning_rate": 4.2519795386790716e-05, "loss": 0.5014, "num_input_tokens_seen": 10423272, "step": 98, "train_runtime": 1034.1041, "train_tokens_per_second": 10079.519 }, { "epoch": 1.6779661016949152, "grad_norm": 0.6693786978721619, "learning_rate": 4.23071442238019e-05, "loss": 0.5095, "num_input_tokens_seen": 10535696, "step": 99, "train_runtime": 1044.4991, "train_tokens_per_second": 10086.841 }, { "epoch": 1.694915254237288, "grad_norm": 0.597254753112793, "learning_rate": 4.209206069817513e-05, "loss": 0.4444, "num_input_tokens_seen": 10640880, "step": 100, "train_runtime": 1054.8328, "train_tokens_per_second": 10087.741 }, { "epoch": 1.694915254237288, "eval_accuracy": 0.8374839692392347, "eval_loss": 0.5552906394004822, "eval_runtime": 4.3531, "eval_samples_per_second": 11.486, "eval_steps_per_second": 2.986, "num_input_tokens_seen": 10640880, "step": 100 }, { "epoch": 1.711864406779661, "grad_norm": 0.7455243468284607, "learning_rate": 4.187457503795527e-05, "loss": 0.4959, "num_input_tokens_seen": 10745624, "step": 101, "train_runtime": 1069.4889, "train_tokens_per_second": 10047.439 }, { "epoch": 1.7288135593220337, "grad_norm": 0.5884683132171631, "learning_rate": 4.165471780878546e-05, "loss": 0.433, "num_input_tokens_seen": 10844744, "step": 102, "train_runtime": 1079.7793, "train_tokens_per_second": 10043.482 }, { "epoch": 1.7457627118644068, "grad_norm": 0.5727217793464661, "learning_rate": 4.1432519909611415e-05, "loss": 0.4856, "num_input_tokens_seen": 10952176, "step": 103, "train_runtime": 1090.1011, "train_tokens_per_second": 10046.936 }, { "epoch": 1.7627118644067796, "grad_norm": 0.5741541385650635, "learning_rate": 4.120801256833887e-05, "loss": 0.4413, "num_input_tokens_seen": 11066704, "step": 104, "train_runtime": 1100.4868, "train_tokens_per_second": 10056.19 }, { "epoch": 1.7796610169491527, "grad_norm": 0.6238065958023071, "learning_rate": 4.098122733744475e-05, "loss": 0.4558, "num_input_tokens_seen": 11167664, "step": 105, "train_runtime": 1110.4653, "train_tokens_per_second": 10056.743 }, { "epoch": 1.7966101694915255, "grad_norm": 0.59928959608078, "learning_rate": 4.075219608954278e-05, "loss": 0.5277, "num_input_tokens_seen": 11267192, "step": 106, "train_runtime": 1120.3081, "train_tokens_per_second": 10057.226 }, { "epoch": 1.8135593220338984, "grad_norm": 0.5390007495880127, "learning_rate": 4.052095101290406e-05, "loss": 0.4027, "num_input_tokens_seen": 11381440, "step": 107, "train_runtime": 1130.5809, "train_tokens_per_second": 10066.895 }, { "epoch": 1.8305084745762712, "grad_norm": 0.49983635544776917, "learning_rate": 4.02875246069333e-05, "loss": 0.459, "num_input_tokens_seen": 11478120, "step": 108, "train_runtime": 1140.6584, "train_tokens_per_second": 10062.715 }, { "epoch": 1.847457627118644, "grad_norm": 0.5167087316513062, "learning_rate": 4.005194967760135e-05, "loss": 0.4386, "num_input_tokens_seen": 11584096, "step": 109, "train_runtime": 1150.7061, "train_tokens_per_second": 10066.946 }, { "epoch": 1.8644067796610169, "grad_norm": 0.5469056963920593, "learning_rate": 3.981425933283456e-05, "loss": 0.3941, "num_input_tokens_seen": 11695448, "step": 110, "train_runtime": 1161.1546, "train_tokens_per_second": 10072.258 }, { "epoch": 1.8813559322033897, "grad_norm": 0.5128666758537292, "learning_rate": 3.95744869778618e-05, "loss": 0.4912, "num_input_tokens_seen": 11805544, "step": 111, "train_runtime": 1171.4829, "train_tokens_per_second": 10077.436 }, { "epoch": 1.8983050847457628, "grad_norm": 0.5504162311553955, "learning_rate": 3.933266631051968e-05, "loss": 0.4393, "num_input_tokens_seen": 11903888, "step": 112, "train_runtime": 1181.3806, "train_tokens_per_second": 10076.252 }, { "epoch": 1.9152542372881356, "grad_norm": 0.5731999278068542, "learning_rate": 3.9088831316516564e-05, "loss": 0.4292, "num_input_tokens_seen": 12004128, "step": 113, "train_runtime": 1191.1538, "train_tokens_per_second": 10077.732 }, { "epoch": 1.9322033898305084, "grad_norm": 0.5877408981323242, "learning_rate": 3.8843016264656215e-05, "loss": 0.4825, "num_input_tokens_seen": 12115840, "step": 114, "train_runtime": 1201.5735, "train_tokens_per_second": 10083.312 }, { "epoch": 1.9491525423728815, "grad_norm": 0.6028397679328918, "learning_rate": 3.8595255702021635e-05, "loss": 0.5266, "num_input_tokens_seen": 12232504, "step": 115, "train_runtime": 1211.9761, "train_tokens_per_second": 10093.024 }, { "epoch": 1.9661016949152543, "grad_norm": 0.6042894721031189, "learning_rate": 3.8345584449119776e-05, "loss": 0.4424, "num_input_tokens_seen": 12339872, "step": 116, "train_runtime": 1222.2981, "train_tokens_per_second": 10095.632 }, { "epoch": 1.9830508474576272, "grad_norm": 0.6503923535346985, "learning_rate": 3.809403759498782e-05, "loss": 0.4777, "num_input_tokens_seen": 12440032, "step": 117, "train_runtime": 1232.5124, "train_tokens_per_second": 10093.231 }, { "epoch": 2.0, "grad_norm": 0.5169121026992798, "learning_rate": 3.784065049226176e-05, "loss": 0.4401, "num_input_tokens_seen": 12542672, "step": 118, "train_runtime": 1242.4604, "train_tokens_per_second": 10095.028 }, { "epoch": 2.016949152542373, "grad_norm": 0.5643147230148315, "learning_rate": 3.758545875220788e-05, "loss": 0.3796, "num_input_tokens_seen": 12655008, "step": 119, "train_runtime": 1252.8834, "train_tokens_per_second": 10100.707 }, { "epoch": 2.0338983050847457, "grad_norm": 0.5204115509986877, "learning_rate": 3.732849823971793e-05, "loss": 0.3662, "num_input_tokens_seen": 12769960, "step": 120, "train_runtime": 1263.2752, "train_tokens_per_second": 10108.613 }, { "epoch": 2.0508474576271185, "grad_norm": 0.5998212695121765, "learning_rate": 3.706980506826863e-05, "loss": 0.3615, "num_input_tokens_seen": 12881984, "step": 121, "train_runtime": 1273.3162, "train_tokens_per_second": 10116.878 }, { "epoch": 2.0677966101694913, "grad_norm": 0.4841426908969879, "learning_rate": 3.6809415594846236e-05, "loss": 0.3831, "num_input_tokens_seen": 12981888, "step": 122, "train_runtime": 1283.5823, "train_tokens_per_second": 10113.795 }, { "epoch": 2.084745762711864, "grad_norm": 0.4841645359992981, "learning_rate": 3.6547366414836936e-05, "loss": 0.2879, "num_input_tokens_seen": 13082360, "step": 123, "train_runtime": 1293.8742, "train_tokens_per_second": 10110.998 }, { "epoch": 2.1016949152542375, "grad_norm": 0.5998939871788025, "learning_rate": 3.628369435688366e-05, "loss": 0.4776, "num_input_tokens_seen": 13195264, "step": 124, "train_runtime": 1304.2365, "train_tokens_per_second": 10117.233 }, { "epoch": 2.1186440677966103, "grad_norm": 0.6515780687332153, "learning_rate": 3.601843647771016e-05, "loss": 0.3788, "num_input_tokens_seen": 13298752, "step": 125, "train_runtime": 1314.4968, "train_tokens_per_second": 10116.991 }, { "epoch": 2.1186440677966103, "eval_accuracy": 0.8475513962799617, "eval_loss": 0.529534637928009, "eval_runtime": 4.3446, "eval_samples_per_second": 11.509, "eval_steps_per_second": 2.992, "num_input_tokens_seen": 13298752, "step": 125 }, { "epoch": 2.135593220338983, "grad_norm": 0.535269558429718, "learning_rate": 3.575163005691302e-05, "loss": 0.3697, "num_input_tokens_seen": 13394544, "step": 126, "train_runtime": 1328.823, "train_tokens_per_second": 10080.006 }, { "epoch": 2.152542372881356, "grad_norm": 0.639999270439148, "learning_rate": 3.548331259172234e-05, "loss": 0.3783, "num_input_tokens_seen": 13503584, "step": 127, "train_runtime": 1339.1132, "train_tokens_per_second": 10083.975 }, { "epoch": 2.169491525423729, "grad_norm": 0.6342602372169495, "learning_rate": 3.5213521791731875e-05, "loss": 0.3652, "num_input_tokens_seen": 13607464, "step": 128, "train_runtime": 1349.4055, "train_tokens_per_second": 10084.044 }, { "epoch": 2.1864406779661016, "grad_norm": 0.5488477945327759, "learning_rate": 3.4942295573599245e-05, "loss": 0.366, "num_input_tokens_seen": 13708112, "step": 129, "train_runtime": 1359.3979, "train_tokens_per_second": 10083.958 }, { "epoch": 2.2033898305084745, "grad_norm": 0.5964322090148926, "learning_rate": 3.46696720557171e-05, "loss": 0.3134, "num_input_tokens_seen": 13815872, "step": 130, "train_runtime": 1369.6909, "train_tokens_per_second": 10086.854 }, { "epoch": 2.2203389830508473, "grad_norm": 0.5261477828025818, "learning_rate": 3.4395689552855955e-05, "loss": 0.3162, "num_input_tokens_seen": 13920760, "step": 131, "train_runtime": 1379.5953, "train_tokens_per_second": 10090.467 }, { "epoch": 2.23728813559322, "grad_norm": 0.5315053462982178, "learning_rate": 3.412038657077939e-05, "loss": 0.3835, "num_input_tokens_seen": 14014280, "step": 132, "train_runtime": 1389.228, "train_tokens_per_second": 10087.818 }, { "epoch": 2.2542372881355934, "grad_norm": 0.518429696559906, "learning_rate": 3.3843801800832354e-05, "loss": 0.3628, "num_input_tokens_seen": 14109848, "step": 133, "train_runtime": 1399.3481, "train_tokens_per_second": 10083.158 }, { "epoch": 2.2711864406779663, "grad_norm": 0.5626394152641296, "learning_rate": 3.356597411450353e-05, "loss": 0.3635, "num_input_tokens_seen": 14217008, "step": 134, "train_runtime": 1409.3478, "train_tokens_per_second": 10087.65 }, { "epoch": 2.288135593220339, "grad_norm": 0.5665557384490967, "learning_rate": 3.328694255796226e-05, "loss": 0.3426, "num_input_tokens_seen": 14326608, "step": 135, "train_runtime": 1419.5891, "train_tokens_per_second": 10092.081 }, { "epoch": 2.305084745762712, "grad_norm": 0.5634236931800842, "learning_rate": 3.300674634657094e-05, "loss": 0.3817, "num_input_tokens_seen": 14425192, "step": 136, "train_runtime": 1429.8654, "train_tokens_per_second": 10088.496 }, { "epoch": 2.3220338983050848, "grad_norm": 0.5703310966491699, "learning_rate": 3.272542485937369e-05, "loss": 0.367, "num_input_tokens_seen": 14516048, "step": 137, "train_runtime": 1439.5089, "train_tokens_per_second": 10084.028 }, { "epoch": 2.3389830508474576, "grad_norm": 0.4954843819141388, "learning_rate": 3.244301763356195e-05, "loss": 0.4014, "num_input_tokens_seen": 14612784, "step": 138, "train_runtime": 1449.7059, "train_tokens_per_second": 10079.827 }, { "epoch": 2.3559322033898304, "grad_norm": 0.4663669168949127, "learning_rate": 3.215956435891793e-05, "loss": 0.3442, "num_input_tokens_seen": 14712832, "step": 139, "train_runtime": 1459.9018, "train_tokens_per_second": 10077.96 }, { "epoch": 2.3728813559322033, "grad_norm": 0.482755571603775, "learning_rate": 3.187510487223655e-05, "loss": 0.3084, "num_input_tokens_seen": 14826672, "step": 140, "train_runtime": 1470.2591, "train_tokens_per_second": 10084.394 }, { "epoch": 2.389830508474576, "grad_norm": 0.5014492273330688, "learning_rate": 3.158967915172669e-05, "loss": 0.3533, "num_input_tokens_seen": 14931848, "step": 141, "train_runtime": 1480.4925, "train_tokens_per_second": 10085.73 }, { "epoch": 2.406779661016949, "grad_norm": 0.5160117745399475, "learning_rate": 3.130332731139272e-05, "loss": 0.3522, "num_input_tokens_seen": 15033416, "step": 142, "train_runtime": 1490.7429, "train_tokens_per_second": 10084.513 }, { "epoch": 2.423728813559322, "grad_norm": 0.547079861164093, "learning_rate": 3.101608959539671e-05, "loss": 0.3409, "num_input_tokens_seen": 15144040, "step": 143, "train_runtime": 1501.0195, "train_tokens_per_second": 10089.17 }, { "epoch": 2.440677966101695, "grad_norm": 0.6849552989006042, "learning_rate": 3.072800637240261e-05, "loss": 0.399, "num_input_tokens_seen": 15253280, "step": 144, "train_runtime": 1511.2733, "train_tokens_per_second": 10092.999 }, { "epoch": 2.457627118644068, "grad_norm": 0.4753686487674713, "learning_rate": 3.0439118129902698e-05, "loss": 0.2888, "num_input_tokens_seen": 15361952, "step": 145, "train_runtime": 1521.6339, "train_tokens_per_second": 10095.695 }, { "epoch": 2.4745762711864407, "grad_norm": 0.49981778860092163, "learning_rate": 3.014946546852746e-05, "loss": 0.4014, "num_input_tokens_seen": 15457896, "step": 146, "train_runtime": 1531.8793, "train_tokens_per_second": 10090.806 }, { "epoch": 2.4915254237288136, "grad_norm": 0.5016078948974609, "learning_rate": 2.9859089096339566e-05, "loss": 0.3612, "num_input_tokens_seen": 15570464, "step": 147, "train_runtime": 1542.2135, "train_tokens_per_second": 10096.179 }, { "epoch": 2.5084745762711864, "grad_norm": 0.5225934982299805, "learning_rate": 2.9568029823112688e-05, "loss": 0.4234, "num_input_tokens_seen": 15681264, "step": 148, "train_runtime": 1552.5788, "train_tokens_per_second": 10100.141 }, { "epoch": 2.5254237288135593, "grad_norm": 0.5351992845535278, "learning_rate": 2.9276328554596055e-05, "loss": 0.4073, "num_input_tokens_seen": 15788384, "step": 149, "train_runtime": 1562.9111, "train_tokens_per_second": 10101.908 }, { "epoch": 2.542372881355932, "grad_norm": 0.6268981099128723, "learning_rate": 2.8984026286765542e-05, "loss": 0.435, "num_input_tokens_seen": 15891024, "step": 150, "train_runtime": 1573.2633, "train_tokens_per_second": 10100.677 }, { "epoch": 2.542372881355932, "eval_accuracy": 0.8443465952747787, "eval_loss": 0.5156561136245728, "eval_runtime": 4.3482, "eval_samples_per_second": 11.499, "eval_steps_per_second": 2.99, "num_input_tokens_seen": 15891024, "step": 150 }, { "epoch": 2.559322033898305, "grad_norm": 0.5867091417312622, "learning_rate": 2.8691164100062034e-05, "loss": 0.4432, "num_input_tokens_seen": 15998080, "step": 151, "train_runtime": 1588.0035, "train_tokens_per_second": 10074.335 }, { "epoch": 2.576271186440678, "grad_norm": 0.5568689703941345, "learning_rate": 2.8397783153617958e-05, "loss": 0.4135, "num_input_tokens_seen": 16111136, "step": 152, "train_runtime": 1598.1101, "train_tokens_per_second": 10081.368 }, { "epoch": 2.593220338983051, "grad_norm": 0.48513880372047424, "learning_rate": 2.8103924679472737e-05, "loss": 0.3563, "num_input_tokens_seen": 16210312, "step": 153, "train_runtime": 1608.3504, "train_tokens_per_second": 10078.844 }, { "epoch": 2.610169491525424, "grad_norm": 0.5153747200965881, "learning_rate": 2.7809629976777973e-05, "loss": 0.3564, "num_input_tokens_seen": 16315056, "step": 154, "train_runtime": 1618.3735, "train_tokens_per_second": 10081.144 }, { "epoch": 2.6271186440677967, "grad_norm": 0.5335708856582642, "learning_rate": 2.7514940405993272e-05, "loss": 0.3611, "num_input_tokens_seen": 16417080, "step": 155, "train_runtime": 1628.6088, "train_tokens_per_second": 10080.432 }, { "epoch": 2.6440677966101696, "grad_norm": 0.5242218971252441, "learning_rate": 2.7219897383073373e-05, "loss": 0.3847, "num_input_tokens_seen": 16532576, "step": 156, "train_runtime": 1638.9544, "train_tokens_per_second": 10087.27 }, { "epoch": 2.6610169491525424, "grad_norm": 0.6446425318717957, "learning_rate": 2.6924542373647505e-05, "loss": 0.3309, "num_input_tokens_seen": 16644840, "step": 157, "train_runtime": 1649.3099, "train_tokens_per_second": 10092.003 }, { "epoch": 2.6779661016949152, "grad_norm": 0.47864437103271484, "learning_rate": 2.6628916887191784e-05, "loss": 0.3207, "num_input_tokens_seen": 16745864, "step": 158, "train_runtime": 1659.0578, "train_tokens_per_second": 10093.599 }, { "epoch": 2.694915254237288, "grad_norm": 0.5996072292327881, "learning_rate": 2.633306247119544e-05, "loss": 0.3676, "num_input_tokens_seen": 16858920, "step": 159, "train_runtime": 1669.3621, "train_tokens_per_second": 10099.019 }, { "epoch": 2.711864406779661, "grad_norm": 0.7424564361572266, "learning_rate": 2.603702070532167e-05, "loss": 0.3098, "num_input_tokens_seen": 16968168, "step": 160, "train_runtime": 1679.5951, "train_tokens_per_second": 10102.535 }, { "epoch": 2.7288135593220337, "grad_norm": 0.4871710240840912, "learning_rate": 2.5740833195563996e-05, "loss": 0.3391, "num_input_tokens_seen": 17075304, "step": 161, "train_runtime": 1689.916, "train_tokens_per_second": 10104.232 }, { "epoch": 2.7457627118644066, "grad_norm": 0.5770386457443237, "learning_rate": 2.5444541568398937e-05, "loss": 0.3637, "num_input_tokens_seen": 17193960, "step": 162, "train_runtime": 1700.2957, "train_tokens_per_second": 10112.335 }, { "epoch": 2.7627118644067794, "grad_norm": 0.4808778464794159, "learning_rate": 2.5148187464935763e-05, "loss": 0.3388, "num_input_tokens_seen": 17304184, "step": 163, "train_runtime": 1710.5416, "train_tokens_per_second": 10116.202 }, { "epoch": 2.7796610169491527, "grad_norm": 0.596011757850647, "learning_rate": 2.485181253506424e-05, "loss": 0.361, "num_input_tokens_seen": 17408944, "step": 164, "train_runtime": 1720.6002, "train_tokens_per_second": 10117.948 }, { "epoch": 2.7966101694915255, "grad_norm": 0.5141506195068359, "learning_rate": 2.4555458431601065e-05, "loss": 0.3551, "num_input_tokens_seen": 17512736, "step": 165, "train_runtime": 1730.778, "train_tokens_per_second": 10118.418 }, { "epoch": 2.8135593220338984, "grad_norm": 0.49830204248428345, "learning_rate": 2.4259166804436006e-05, "loss": 0.386, "num_input_tokens_seen": 17617368, "step": 166, "train_runtime": 1740.8375, "train_tokens_per_second": 10120.053 }, { "epoch": 2.830508474576271, "grad_norm": 0.4794394373893738, "learning_rate": 2.3962979294678337e-05, "loss": 0.3624, "num_input_tokens_seen": 17723424, "step": 167, "train_runtime": 1751.1217, "train_tokens_per_second": 10121.184 }, { "epoch": 2.847457627118644, "grad_norm": 0.4836881160736084, "learning_rate": 2.3666937528804563e-05, "loss": 0.3517, "num_input_tokens_seen": 17840688, "step": 168, "train_runtime": 1761.4439, "train_tokens_per_second": 10128.445 }, { "epoch": 2.864406779661017, "grad_norm": 0.6007115840911865, "learning_rate": 2.337108311280822e-05, "loss": 0.345, "num_input_tokens_seen": 17958736, "step": 169, "train_runtime": 1771.8446, "train_tokens_per_second": 10135.616 }, { "epoch": 2.8813559322033897, "grad_norm": 0.46608850359916687, "learning_rate": 2.3075457626352504e-05, "loss": 0.3491, "num_input_tokens_seen": 18060792, "step": 170, "train_runtime": 1782.1555, "train_tokens_per_second": 10134.24 }, { "epoch": 2.898305084745763, "grad_norm": 0.486213743686676, "learning_rate": 2.2780102616926633e-05, "loss": 0.3555, "num_input_tokens_seen": 18161344, "step": 171, "train_runtime": 1792.0836, "train_tokens_per_second": 10134.206 }, { "epoch": 2.915254237288136, "grad_norm": 0.5085980892181396, "learning_rate": 2.2485059594006734e-05, "loss": 0.3597, "num_input_tokens_seen": 18267840, "step": 172, "train_runtime": 1802.1661, "train_tokens_per_second": 10136.602 }, { "epoch": 2.9322033898305087, "grad_norm": 0.5648550987243652, "learning_rate": 2.2190370023222033e-05, "loss": 0.3601, "num_input_tokens_seen": 18371632, "step": 173, "train_runtime": 1812.1336, "train_tokens_per_second": 10138.122 }, { "epoch": 2.9491525423728815, "grad_norm": 0.5039841532707214, "learning_rate": 2.189607532052727e-05, "loss": 0.3321, "num_input_tokens_seen": 18493104, "step": 174, "train_runtime": 1822.5279, "train_tokens_per_second": 10146.953 }, { "epoch": 2.9661016949152543, "grad_norm": 0.45758432149887085, "learning_rate": 2.1602216846382048e-05, "loss": 0.2966, "num_input_tokens_seen": 18607368, "step": 175, "train_runtime": 1832.8984, "train_tokens_per_second": 10151.882 }, { "epoch": 2.9661016949152543, "eval_accuracy": 0.849152949938464, "eval_loss": 0.4958828091621399, "eval_runtime": 4.3492, "eval_samples_per_second": 11.496, "eval_steps_per_second": 2.989, "num_input_tokens_seen": 18607368, "step": 175 }, { "epoch": 2.983050847457627, "grad_norm": 0.4918224811553955, "learning_rate": 2.1308835899937972e-05, "loss": 0.3531, "num_input_tokens_seen": 18709368, "step": 176, "train_runtime": 1847.2674, "train_tokens_per_second": 10128.132 }, { "epoch": 3.0, "grad_norm": 0.5761289000511169, "learning_rate": 2.1015973713234464e-05, "loss": 0.3393, "num_input_tokens_seen": 18815328, "step": 177, "train_runtime": 1857.5601, "train_tokens_per_second": 10129.055 }, { "epoch": 3.016949152542373, "grad_norm": 0.43620994687080383, "learning_rate": 2.0723671445403954e-05, "loss": 0.2607, "num_input_tokens_seen": 18917216, "step": 178, "train_runtime": 1867.504, "train_tokens_per_second": 10129.679 }, { "epoch": 3.0338983050847457, "grad_norm": 0.4892238676548004, "learning_rate": 2.0431970176887315e-05, "loss": 0.271, "num_input_tokens_seen": 19027776, "step": 179, "train_runtime": 1877.6618, "train_tokens_per_second": 10133.761 }, { "epoch": 3.0508474576271185, "grad_norm": 0.47473910450935364, "learning_rate": 2.014091090366044e-05, "loss": 0.2439, "num_input_tokens_seen": 19135480, "step": 180, "train_runtime": 1888.0587, "train_tokens_per_second": 10135.003 }, { "epoch": 3.0677966101694913, "grad_norm": 0.47560861706733704, "learning_rate": 1.9850534531472546e-05, "loss": 0.2996, "num_input_tokens_seen": 19243480, "step": 181, "train_runtime": 1898.3716, "train_tokens_per_second": 10136.835 }, { "epoch": 3.084745762711864, "grad_norm": 0.47860297560691833, "learning_rate": 1.9560881870097308e-05, "loss": 0.2709, "num_input_tokens_seen": 19349232, "step": 182, "train_runtime": 1908.3819, "train_tokens_per_second": 10139.077 }, { "epoch": 3.1016949152542375, "grad_norm": 0.43783503770828247, "learning_rate": 1.9271993627597396e-05, "loss": 0.2506, "num_input_tokens_seen": 19468240, "step": 183, "train_runtime": 1918.7532, "train_tokens_per_second": 10146.297 }, { "epoch": 3.1186440677966103, "grad_norm": 0.4044387936592102, "learning_rate": 1.8983910404603296e-05, "loss": 0.1991, "num_input_tokens_seen": 19580696, "step": 184, "train_runtime": 1929.134, "train_tokens_per_second": 10149.993 }, { "epoch": 3.135593220338983, "grad_norm": 0.5283013582229614, "learning_rate": 1.8696672688607293e-05, "loss": 0.2483, "num_input_tokens_seen": 19685800, "step": 185, "train_runtime": 1939.0969, "train_tokens_per_second": 10152.046 }, { "epoch": 3.152542372881356, "grad_norm": 0.5997490286827087, "learning_rate": 1.8410320848273315e-05, "loss": 0.2796, "num_input_tokens_seen": 19787360, "step": 186, "train_runtime": 1949.3992, "train_tokens_per_second": 10150.491 }, { "epoch": 3.169491525423729, "grad_norm": 0.5210835933685303, "learning_rate": 1.8124895127763458e-05, "loss": 0.2604, "num_input_tokens_seen": 19887912, "step": 187, "train_runtime": 1959.6817, "train_tokens_per_second": 10148.542 }, { "epoch": 3.1864406779661016, "grad_norm": 0.46026745438575745, "learning_rate": 1.7840435641082072e-05, "loss": 0.2759, "num_input_tokens_seen": 19978168, "step": 188, "train_runtime": 1969.631, "train_tokens_per_second": 10143.102 }, { "epoch": 3.2033898305084745, "grad_norm": 0.4410321116447449, "learning_rate": 1.7556982366438053e-05, "loss": 0.2912, "num_input_tokens_seen": 20090288, "step": 189, "train_runtime": 1980.0875, "train_tokens_per_second": 10146.162 }, { "epoch": 3.2203389830508473, "grad_norm": 0.4692417085170746, "learning_rate": 1.7274575140626318e-05, "loss": 0.2852, "num_input_tokens_seen": 20188176, "step": 190, "train_runtime": 1990.314, "train_tokens_per_second": 10143.211 }, { "epoch": 3.23728813559322, "grad_norm": 0.5449389815330505, "learning_rate": 1.6993253653429063e-05, "loss": 0.2625, "num_input_tokens_seen": 20294944, "step": 191, "train_runtime": 2000.5661, "train_tokens_per_second": 10144.6 }, { "epoch": 3.2542372881355934, "grad_norm": 0.524983823299408, "learning_rate": 1.6713057442037743e-05, "loss": 0.2443, "num_input_tokens_seen": 20393248, "step": 192, "train_runtime": 2010.7028, "train_tokens_per_second": 10142.348 }, { "epoch": 3.2711864406779663, "grad_norm": 0.3914013206958771, "learning_rate": 1.6434025885496467e-05, "loss": 0.2252, "num_input_tokens_seen": 20495136, "step": 193, "train_runtime": 2021.0229, "train_tokens_per_second": 10140.972 }, { "epoch": 3.288135593220339, "grad_norm": 0.6633609533309937, "learning_rate": 1.6156198199167655e-05, "loss": 0.3712, "num_input_tokens_seen": 20590656, "step": 194, "train_runtime": 2031.1673, "train_tokens_per_second": 10137.351 }, { "epoch": 3.305084745762712, "grad_norm": 0.4628910422325134, "learning_rate": 1.5879613429220626e-05, "loss": 0.2243, "num_input_tokens_seen": 20701792, "step": 195, "train_runtime": 2041.5305, "train_tokens_per_second": 10140.33 }, { "epoch": 3.3220338983050848, "grad_norm": 0.5391764044761658, "learning_rate": 1.560431044714405e-05, "loss": 0.2873, "num_input_tokens_seen": 20806728, "step": 196, "train_runtime": 2051.8377, "train_tokens_per_second": 10140.533 }, { "epoch": 3.3389830508474576, "grad_norm": 0.46970924735069275, "learning_rate": 1.5330327944282913e-05, "loss": 0.259, "num_input_tokens_seen": 20909128, "step": 197, "train_runtime": 2062.1357, "train_tokens_per_second": 10139.55 }, { "epoch": 3.3559322033898304, "grad_norm": 0.47797513008117676, "learning_rate": 1.5057704426400767e-05, "loss": 0.2636, "num_input_tokens_seen": 21021888, "step": 198, "train_runtime": 2072.3947, "train_tokens_per_second": 10143.767 }, { "epoch": 3.3728813559322033, "grad_norm": 0.485984206199646, "learning_rate": 1.4786478208268134e-05, "loss": 0.2666, "num_input_tokens_seen": 21127504, "step": 199, "train_runtime": 2082.6099, "train_tokens_per_second": 10144.725 }, { "epoch": 3.389830508474576, "grad_norm": 0.5332046747207642, "learning_rate": 1.4516687408277669e-05, "loss": 0.2524, "num_input_tokens_seen": 21230584, "step": 200, "train_runtime": 2092.7929, "train_tokens_per_second": 10144.618 }, { "epoch": 3.389830508474576, "eval_accuracy": 0.8512740325455173, "eval_loss": 0.4950821101665497, "eval_runtime": 4.3457, "eval_samples_per_second": 11.506, "eval_steps_per_second": 2.991, "num_input_tokens_seen": 21230584, "step": 200 }, { "epoch": 3.406779661016949, "grad_norm": 0.5226387977600098, "learning_rate": 1.4248369943086998e-05, "loss": 0.2496, "num_input_tokens_seen": 21344472, "step": 201, "train_runtime": 2107.5277, "train_tokens_per_second": 10127.73 }, { "epoch": 3.423728813559322, "grad_norm": 0.5772292613983154, "learning_rate": 1.3981563522289848e-05, "loss": 0.3348, "num_input_tokens_seen": 21449200, "step": 202, "train_runtime": 2117.6581, "train_tokens_per_second": 10128.736 }, { "epoch": 3.440677966101695, "grad_norm": 0.47435376048088074, "learning_rate": 1.3716305643116345e-05, "loss": 0.242, "num_input_tokens_seen": 21543072, "step": 203, "train_runtime": 2127.3103, "train_tokens_per_second": 10126.906 }, { "epoch": 3.457627118644068, "grad_norm": 0.5281843543052673, "learning_rate": 1.3452633585163072e-05, "loss": 0.2973, "num_input_tokens_seen": 21656624, "step": 204, "train_runtime": 2137.3742, "train_tokens_per_second": 10132.35 }, { "epoch": 3.4745762711864407, "grad_norm": 0.4969395697116852, "learning_rate": 1.3190584405153767e-05, "loss": 0.2397, "num_input_tokens_seen": 21771480, "step": 205, "train_runtime": 2147.7555, "train_tokens_per_second": 10136.852 }, { "epoch": 3.4915254237288136, "grad_norm": 0.43185102939605713, "learning_rate": 1.2930194931731382e-05, "loss": 0.2163, "num_input_tokens_seen": 21884760, "step": 206, "train_runtime": 2158.1144, "train_tokens_per_second": 10140.686 }, { "epoch": 3.5084745762711864, "grad_norm": 0.6581583023071289, "learning_rate": 1.2671501760282079e-05, "loss": 0.3422, "num_input_tokens_seen": 21991712, "step": 207, "train_runtime": 2168.313, "train_tokens_per_second": 10142.314 }, { "epoch": 3.5254237288135593, "grad_norm": 0.5069866180419922, "learning_rate": 1.2414541247792121e-05, "loss": 0.2829, "num_input_tokens_seen": 22104552, "step": 208, "train_runtime": 2178.6146, "train_tokens_per_second": 10146.151 }, { "epoch": 3.542372881355932, "grad_norm": 0.46896129846572876, "learning_rate": 1.2159349507738247e-05, "loss": 0.2411, "num_input_tokens_seen": 22209288, "step": 209, "train_runtime": 2188.8257, "train_tokens_per_second": 10146.668 }, { "epoch": 3.559322033898305, "grad_norm": 0.4443514943122864, "learning_rate": 1.1905962405012192e-05, "loss": 0.2872, "num_input_tokens_seen": 22307624, "step": 210, "train_runtime": 2198.6992, "train_tokens_per_second": 10145.828 }, { "epoch": 3.576271186440678, "grad_norm": 0.6187303066253662, "learning_rate": 1.1654415550880243e-05, "loss": 0.3551, "num_input_tokens_seen": 22407656, "step": 211, "train_runtime": 2208.9033, "train_tokens_per_second": 10144.245 }, { "epoch": 3.593220338983051, "grad_norm": 0.5176218152046204, "learning_rate": 1.1404744297978373e-05, "loss": 0.2102, "num_input_tokens_seen": 22516640, "step": 212, "train_runtime": 2219.0952, "train_tokens_per_second": 10146.766 }, { "epoch": 3.610169491525424, "grad_norm": 0.5846608281135559, "learning_rate": 1.1156983735343796e-05, "loss": 0.2977, "num_input_tokens_seen": 22620992, "step": 213, "train_runtime": 2229.4499, "train_tokens_per_second": 10146.445 }, { "epoch": 3.6271186440677967, "grad_norm": 0.5257160663604736, "learning_rate": 1.0911168683483449e-05, "loss": 0.2581, "num_input_tokens_seen": 22736624, "step": 214, "train_runtime": 2239.7969, "train_tokens_per_second": 10151.199 }, { "epoch": 3.6440677966101696, "grad_norm": 0.419572651386261, "learning_rate": 1.0667333689480322e-05, "loss": 0.2166, "num_input_tokens_seen": 22855144, "step": 215, "train_runtime": 2250.1683, "train_tokens_per_second": 10157.082 }, { "epoch": 3.6610169491525424, "grad_norm": 0.4591136574745178, "learning_rate": 1.0425513022138203e-05, "loss": 0.2322, "num_input_tokens_seen": 22965704, "step": 216, "train_runtime": 2260.5309, "train_tokens_per_second": 10159.429 }, { "epoch": 3.6779661016949152, "grad_norm": 0.407421737909317, "learning_rate": 1.0185740667165456e-05, "loss": 0.301, "num_input_tokens_seen": 23070056, "step": 217, "train_runtime": 2270.4807, "train_tokens_per_second": 10160.868 }, { "epoch": 3.694915254237288, "grad_norm": 0.42050784826278687, "learning_rate": 9.948050322398658e-06, "loss": 0.2224, "num_input_tokens_seen": 23180184, "step": 218, "train_runtime": 2280.8847, "train_tokens_per_second": 10162.804 }, { "epoch": 3.711864406779661, "grad_norm": 0.49007946252822876, "learning_rate": 9.712475393066705e-06, "loss": 0.3068, "num_input_tokens_seen": 23285216, "step": 219, "train_runtime": 2291.106, "train_tokens_per_second": 10163.308 }, { "epoch": 3.7288135593220337, "grad_norm": 0.41998228430747986, "learning_rate": 9.479048987095954e-06, "loss": 0.2098, "num_input_tokens_seen": 23393240, "step": 220, "train_runtime": 2301.46, "train_tokens_per_second": 10164.522 }, { "epoch": 3.7457627118644066, "grad_norm": 0.5238583087921143, "learning_rate": 9.247803910457226e-06, "loss": 0.2637, "num_input_tokens_seen": 23505224, "step": 221, "train_runtime": 2311.8168, "train_tokens_per_second": 10167.425 }, { "epoch": 3.7627118644067794, "grad_norm": 0.4628532826900482, "learning_rate": 9.018772662555252e-06, "loss": 0.2402, "num_input_tokens_seen": 23602096, "step": 222, "train_runtime": 2322.0686, "train_tokens_per_second": 10164.254 }, { "epoch": 3.7796610169491527, "grad_norm": 0.42983704805374146, "learning_rate": 8.791987431661137e-06, "loss": 0.232, "num_input_tokens_seen": 23697608, "step": 223, "train_runtime": 2331.884, "train_tokens_per_second": 10162.43 }, { "epoch": 3.7966101694915255, "grad_norm": 0.5031875967979431, "learning_rate": 8.567480090388586e-06, "loss": 0.298, "num_input_tokens_seen": 23795304, "step": 224, "train_runtime": 2342.077, "train_tokens_per_second": 10159.915 }, { "epoch": 3.8135593220338984, "grad_norm": 0.5014523863792419, "learning_rate": 8.34528219121455e-06, "loss": 0.2689, "num_input_tokens_seen": 23905280, "step": 225, "train_runtime": 2352.3335, "train_tokens_per_second": 10162.368 }, { "epoch": 3.8135593220338984, "eval_accuracy": 0.8556592515560445, "eval_loss": 0.4844910502433777, "eval_runtime": 4.3302, "eval_samples_per_second": 11.547, "eval_steps_per_second": 3.002, "num_input_tokens_seen": 23905280, "step": 225 }, { "epoch": 3.830508474576271, "grad_norm": 0.46191343665122986, "learning_rate": 8.125424962044742e-06, "loss": 0.2417, "num_input_tokens_seen": 24015504, "step": 226, "train_runtime": 2367.013, "train_tokens_per_second": 10145.911 }, { "epoch": 3.847457627118644, "grad_norm": 0.5780752897262573, "learning_rate": 7.907939301824884e-06, "loss": 0.2673, "num_input_tokens_seen": 24128928, "step": 227, "train_runtime": 2377.1018, "train_tokens_per_second": 10150.566 }, { "epoch": 3.864406779661017, "grad_norm": 0.5428578853607178, "learning_rate": 7.692855776198114e-06, "loss": 0.2541, "num_input_tokens_seen": 24232712, "step": 228, "train_runtime": 2387.4126, "train_tokens_per_second": 10150.199 }, { "epoch": 3.8813559322033897, "grad_norm": 0.48555266857147217, "learning_rate": 7.480204613209288e-06, "loss": 0.2341, "num_input_tokens_seen": 24337744, "step": 229, "train_runtime": 2397.6403, "train_tokens_per_second": 10150.707 }, { "epoch": 3.898305084745763, "grad_norm": 0.46837303042411804, "learning_rate": 7.2700156990566675e-06, "loss": 0.2309, "num_input_tokens_seen": 24446736, "step": 230, "train_runtime": 2408.0187, "train_tokens_per_second": 10152.22 }, { "epoch": 3.915254237288136, "grad_norm": 0.5462357997894287, "learning_rate": 7.062318573891716e-06, "loss": 0.2718, "num_input_tokens_seen": 24562728, "step": 231, "train_runtime": 2418.4129, "train_tokens_per_second": 10156.548 }, { "epoch": 3.9322033898305087, "grad_norm": 0.5173876285552979, "learning_rate": 6.85714242766749e-06, "loss": 0.2529, "num_input_tokens_seen": 24669264, "step": 232, "train_runtime": 2428.6573, "train_tokens_per_second": 10157.573 }, { "epoch": 3.9491525423728815, "grad_norm": 0.5115875601768494, "learning_rate": 6.654516096036231e-06, "loss": 0.2899, "num_input_tokens_seen": 24774256, "step": 233, "train_runtime": 2440.0145, "train_tokens_per_second": 10153.323 }, { "epoch": 3.9661016949152543, "grad_norm": 0.4781138002872467, "learning_rate": 6.4544680562968e-06, "loss": 0.3025, "num_input_tokens_seen": 24882752, "step": 234, "train_runtime": 2451.4184, "train_tokens_per_second": 10150.349 }, { "epoch": 3.983050847457627, "grad_norm": 0.46944358944892883, "learning_rate": 6.25702642339244e-06, "loss": 0.2283, "num_input_tokens_seen": 24984864, "step": 235, "train_runtime": 2462.74, "train_tokens_per_second": 10145.149 }, { "epoch": 4.0, "grad_norm": 0.5003960728645325, "learning_rate": 6.062218945959497e-06, "loss": 0.2404, "num_input_tokens_seen": 25087648, "step": 236, "train_runtime": 2474.0239, "train_tokens_per_second": 10140.423 }, { "epoch": 4.016949152542373, "grad_norm": 0.4178627133369446, "learning_rate": 5.87007300242757e-06, "loss": 0.1724, "num_input_tokens_seen": 25195544, "step": 237, "train_runtime": 2485.4081, "train_tokens_per_second": 10137.387 }, { "epoch": 4.033898305084746, "grad_norm": 0.45459651947021484, "learning_rate": 5.680615597171718e-06, "loss": 0.1724, "num_input_tokens_seen": 25298896, "step": 238, "train_runtime": 2496.7382, "train_tokens_per_second": 10132.779 }, { "epoch": 4.0508474576271185, "grad_norm": 0.44449377059936523, "learning_rate": 5.493873356717288e-06, "loss": 0.2116, "num_input_tokens_seen": 25400872, "step": 239, "train_runtime": 2508.1036, "train_tokens_per_second": 10127.521 }, { "epoch": 4.067796610169491, "grad_norm": 0.4421490728855133, "learning_rate": 5.309872525997736e-06, "loss": 0.1622, "num_input_tokens_seen": 25512440, "step": 240, "train_runtime": 2519.2603, "train_tokens_per_second": 10126.957 }, { "epoch": 4.084745762711864, "grad_norm": 0.40225303173065186, "learning_rate": 5.128638964666166e-06, "loss": 0.1675, "num_input_tokens_seen": 25630856, "step": 241, "train_runtime": 2530.7189, "train_tokens_per_second": 10127.895 }, { "epoch": 4.101694915254237, "grad_norm": 0.4766783118247986, "learning_rate": 4.950198143461013e-06, "loss": 0.177, "num_input_tokens_seen": 25735072, "step": 242, "train_runtime": 2542.0428, "train_tokens_per_second": 10123.776 }, { "epoch": 4.11864406779661, "grad_norm": 0.40857142210006714, "learning_rate": 4.7745751406263165e-06, "loss": 0.1777, "num_input_tokens_seen": 25828728, "step": 243, "train_runtime": 2552.7154, "train_tokens_per_second": 10118.139 }, { "epoch": 4.135593220338983, "grad_norm": 0.5124621987342834, "learning_rate": 4.601794638387219e-06, "loss": 0.2315, "num_input_tokens_seen": 25934056, "step": 244, "train_runtime": 2563.8172, "train_tokens_per_second": 10115.408 }, { "epoch": 4.1525423728813555, "grad_norm": 0.4303904175758362, "learning_rate": 4.43188091948113e-06, "loss": 0.1943, "num_input_tokens_seen": 26045936, "step": 245, "train_runtime": 2575.1723, "train_tokens_per_second": 10114.25 }, { "epoch": 4.169491525423728, "grad_norm": 0.5164250135421753, "learning_rate": 4.264857863744956e-06, "loss": 0.2207, "num_input_tokens_seen": 26149848, "step": 246, "train_runtime": 2586.5085, "train_tokens_per_second": 10110.096 }, { "epoch": 4.186440677966102, "grad_norm": 0.4310712516307831, "learning_rate": 4.1007489447590365e-06, "loss": 0.1831, "num_input_tokens_seen": 26254680, "step": 247, "train_runtime": 2597.5682, "train_tokens_per_second": 10107.407 }, { "epoch": 4.203389830508475, "grad_norm": 0.5040118098258972, "learning_rate": 3.939577226548152e-06, "loss": 0.2043, "num_input_tokens_seen": 26355904, "step": 248, "train_runtime": 2608.9171, "train_tokens_per_second": 10102.239 }, { "epoch": 4.220338983050848, "grad_norm": 0.3849621117115021, "learning_rate": 3.781365360340056e-06, "loss": 0.1638, "num_input_tokens_seen": 26449120, "step": 249, "train_runtime": 2620.1674, "train_tokens_per_second": 10094.439 }, { "epoch": 4.237288135593221, "grad_norm": 0.4804217517375946, "learning_rate": 3.6261355813820645e-06, "loss": 0.1457, "num_input_tokens_seen": 26551272, "step": 250, "train_runtime": 2631.5568, "train_tokens_per_second": 10089.568 }, { "epoch": 4.237288135593221, "eval_accuracy": 0.856542517040091, "eval_loss": 0.48583686351776123, "eval_runtime": 4.3487, "eval_samples_per_second": 11.498, "eval_steps_per_second": 2.989, "num_input_tokens_seen": 26551272, "step": 250 }, { "epoch": 4.254237288135593, "grad_norm": 0.4448375403881073, "learning_rate": 3.4739097058161114e-06, "loss": 0.213, "num_input_tokens_seen": 26656864, "step": 251, "train_runtime": 2647.2819, "train_tokens_per_second": 10069.522 }, { "epoch": 4.271186440677966, "grad_norm": 0.40976589918136597, "learning_rate": 3.324709127612649e-06, "loss": 0.1551, "num_input_tokens_seen": 26761368, "step": 252, "train_runtime": 2658.5975, "train_tokens_per_second": 10065.972 }, { "epoch": 4.288135593220339, "grad_norm": 0.5147706270217896, "learning_rate": 3.1785548155639444e-06, "loss": 0.2033, "num_input_tokens_seen": 26860000, "step": 253, "train_runtime": 2669.9079, "train_tokens_per_second": 10060.272 }, { "epoch": 4.305084745762712, "grad_norm": 0.4196415841579437, "learning_rate": 3.035467310337095e-06, "loss": 0.1509, "num_input_tokens_seen": 26971656, "step": 254, "train_runtime": 2681.2328, "train_tokens_per_second": 10059.423 }, { "epoch": 4.322033898305085, "grad_norm": 0.6118748784065247, "learning_rate": 2.895466721587245e-06, "loss": 0.1798, "num_input_tokens_seen": 27072048, "step": 255, "train_runtime": 2692.5604, "train_tokens_per_second": 10054.389 }, { "epoch": 4.338983050847458, "grad_norm": 0.49149730801582336, "learning_rate": 2.75857272513132e-06, "loss": 0.1817, "num_input_tokens_seen": 27181384, "step": 256, "train_runtime": 2704.0171, "train_tokens_per_second": 10052.224 }, { "epoch": 4.3559322033898304, "grad_norm": 0.5347187519073486, "learning_rate": 2.624804560182789e-06, "loss": 0.2302, "num_input_tokens_seen": 27283960, "step": 257, "train_runtime": 2715.419, "train_tokens_per_second": 10047.79 }, { "epoch": 4.372881355932203, "grad_norm": 0.4388182759284973, "learning_rate": 2.494181026647782e-06, "loss": 0.1795, "num_input_tokens_seen": 27387568, "step": 258, "train_runtime": 2726.8004, "train_tokens_per_second": 10043.848 }, { "epoch": 4.389830508474576, "grad_norm": 0.4659542441368103, "learning_rate": 2.3667204824828953e-06, "loss": 0.158, "num_input_tokens_seen": 27495760, "step": 259, "train_runtime": 2738.2116, "train_tokens_per_second": 10041.503 }, { "epoch": 4.406779661016949, "grad_norm": 0.3610907196998596, "learning_rate": 2.2424408411151704e-06, "loss": 0.1672, "num_input_tokens_seen": 27602880, "step": 260, "train_runtime": 2749.6201, "train_tokens_per_second": 10038.798 }, { "epoch": 4.423728813559322, "grad_norm": 0.46502378582954407, "learning_rate": 2.1213595689245386e-06, "loss": 0.2301, "num_input_tokens_seen": 27704640, "step": 261, "train_runtime": 2760.4128, "train_tokens_per_second": 10036.412 }, { "epoch": 4.440677966101695, "grad_norm": 0.3998453617095947, "learning_rate": 2.00349368278904e-06, "loss": 0.1928, "num_input_tokens_seen": 27812152, "step": 262, "train_runtime": 2771.7382, "train_tokens_per_second": 10034.191 }, { "epoch": 4.4576271186440675, "grad_norm": 0.4807196259498596, "learning_rate": 1.8888597476932834e-06, "loss": 0.218, "num_input_tokens_seen": 27916960, "step": 263, "train_runtime": 2783.07, "train_tokens_per_second": 10030.994 }, { "epoch": 4.47457627118644, "grad_norm": 0.4619472324848175, "learning_rate": 1.7774738744003927e-06, "loss": 0.1459, "num_input_tokens_seen": 28032376, "step": 264, "train_runtime": 2794.6692, "train_tokens_per_second": 10030.66 }, { "epoch": 4.491525423728813, "grad_norm": 0.5940751433372498, "learning_rate": 1.6693517171877533e-06, "loss": 0.1564, "num_input_tokens_seen": 28132512, "step": 265, "train_runtime": 2805.8149, "train_tokens_per_second": 10026.503 }, { "epoch": 4.508474576271187, "grad_norm": 0.41254979372024536, "learning_rate": 1.5645084716469777e-06, "loss": 0.134, "num_input_tokens_seen": 28234672, "step": 266, "train_runtime": 2816.9381, "train_tokens_per_second": 10023.178 }, { "epoch": 4.52542372881356, "grad_norm": 0.4586114287376404, "learning_rate": 1.4629588725482841e-06, "loss": 0.1599, "num_input_tokens_seen": 28338776, "step": 267, "train_runtime": 2828.0299, "train_tokens_per_second": 10020.678 }, { "epoch": 4.5423728813559325, "grad_norm": 0.4491695761680603, "learning_rate": 1.3647171917696684e-06, "loss": 0.1864, "num_input_tokens_seen": 28458896, "step": 268, "train_runtime": 2839.5002, "train_tokens_per_second": 10022.502 }, { "epoch": 4.559322033898305, "grad_norm": 0.5336939096450806, "learning_rate": 1.2697972362911064e-06, "loss": 0.2124, "num_input_tokens_seen": 28570072, "step": 269, "train_runtime": 2850.906, "train_tokens_per_second": 10021.401 }, { "epoch": 4.576271186440678, "grad_norm": 0.4559793770313263, "learning_rate": 1.1782123462541178e-06, "loss": 0.1485, "num_input_tokens_seen": 28674856, "step": 270, "train_runtime": 2862.1961, "train_tokens_per_second": 10018.48 }, { "epoch": 4.593220338983051, "grad_norm": 0.46341943740844727, "learning_rate": 1.0899753930869394e-06, "loss": 0.2151, "num_input_tokens_seen": 28779768, "step": 271, "train_runtime": 2873.4101, "train_tokens_per_second": 10015.893 }, { "epoch": 4.610169491525424, "grad_norm": 0.5154189467430115, "learning_rate": 1.00509877769554e-06, "loss": 0.2067, "num_input_tokens_seen": 28885568, "step": 272, "train_runtime": 2884.7694, "train_tokens_per_second": 10013.129 }, { "epoch": 4.627118644067797, "grad_norm": 0.38729819655418396, "learning_rate": 9.235944287207976e-07, "loss": 0.1291, "num_input_tokens_seen": 29000280, "step": 273, "train_runtime": 2896.1682, "train_tokens_per_second": 10013.327 }, { "epoch": 4.6440677966101696, "grad_norm": 0.43197062611579895, "learning_rate": 8.454738008620456e-07, "loss": 0.2032, "num_input_tokens_seen": 29100816, "step": 274, "train_runtime": 2907.2873, "train_tokens_per_second": 10009.611 }, { "epoch": 4.661016949152542, "grad_norm": 0.46766197681427, "learning_rate": 7.707478732671941e-07, "loss": 0.1878, "num_input_tokens_seen": 29196936, "step": 275, "train_runtime": 2918.59, "train_tokens_per_second": 10003.781 }, { "epoch": 4.661016949152542, "eval_accuracy": 0.8570156034618972, "eval_loss": 0.48511388897895813, "eval_runtime": 4.3472, "eval_samples_per_second": 11.502, "eval_steps_per_second": 2.99, "num_input_tokens_seen": 29196936, "step": 275 }, { "epoch": 4.677966101694915, "grad_norm": 0.44221359491348267, "learning_rate": 6.994271479897314e-07, "loss": 0.2355, "num_input_tokens_seen": 29310880, "step": 276, "train_runtime": 2934.3811, "train_tokens_per_second": 9988.777 }, { "epoch": 4.694915254237288, "grad_norm": 0.4733332395553589, "learning_rate": 6.315216485127506e-07, "loss": 0.1741, "num_input_tokens_seen": 29421104, "step": 277, "train_runtime": 2945.4888, "train_tokens_per_second": 9988.53 }, { "epoch": 4.711864406779661, "grad_norm": 0.49647605419158936, "learning_rate": 5.670409183402364e-07, "loss": 0.2569, "num_input_tokens_seen": 29524408, "step": 278, "train_runtime": 2956.8376, "train_tokens_per_second": 9985.13 }, { "epoch": 4.728813559322034, "grad_norm": 0.4930429756641388, "learning_rate": 5.059940196558088e-07, "loss": 0.2181, "num_input_tokens_seen": 29624776, "step": 279, "train_runtime": 2967.8288, "train_tokens_per_second": 9981.969 }, { "epoch": 4.745762711864407, "grad_norm": 0.5534479022026062, "learning_rate": 4.4838953204912326e-07, "loss": 0.2237, "num_input_tokens_seen": 29728640, "step": 280, "train_runtime": 2979.2153, "train_tokens_per_second": 9978.681 }, { "epoch": 4.762711864406779, "grad_norm": 0.4766373634338379, "learning_rate": 3.9423555131007925e-07, "loss": 0.2027, "num_input_tokens_seen": 29832320, "step": 281, "train_runtime": 2990.5952, "train_tokens_per_second": 9975.379 }, { "epoch": 4.779661016949152, "grad_norm": 0.40961286425590515, "learning_rate": 3.435396882910391e-07, "loss": 0.1866, "num_input_tokens_seen": 29938136, "step": 282, "train_runtime": 3001.9098, "train_tokens_per_second": 9973.03 }, { "epoch": 4.796610169491525, "grad_norm": 0.449975848197937, "learning_rate": 2.963090678371805e-07, "loss": 0.1648, "num_input_tokens_seen": 30038384, "step": 283, "train_runtime": 3013.1073, "train_tokens_per_second": 9969.238 }, { "epoch": 4.813559322033898, "grad_norm": 0.4254641830921173, "learning_rate": 2.5255032778517264e-07, "loss": 0.1827, "num_input_tokens_seen": 30155184, "step": 284, "train_runtime": 3024.58, "train_tokens_per_second": 9970.04 }, { "epoch": 4.830508474576272, "grad_norm": 0.3542494475841522, "learning_rate": 2.1226961803028632e-07, "loss": 0.194, "num_input_tokens_seen": 30258792, "step": 285, "train_runtime": 3035.7174, "train_tokens_per_second": 9967.592 }, { "epoch": 4.847457627118644, "grad_norm": 0.51301109790802, "learning_rate": 1.7547259966207708e-07, "loss": 0.197, "num_input_tokens_seen": 30360016, "step": 286, "train_runtime": 3046.9789, "train_tokens_per_second": 9963.973 }, { "epoch": 4.864406779661017, "grad_norm": 0.3893289566040039, "learning_rate": 1.4216444416877695e-07, "loss": 0.1479, "num_input_tokens_seen": 30457880, "step": 287, "train_runtime": 3058.3141, "train_tokens_per_second": 9959.042 }, { "epoch": 4.88135593220339, "grad_norm": 0.44192439317703247, "learning_rate": 1.1234983271048161e-07, "loss": 0.1922, "num_input_tokens_seen": 30561968, "step": 288, "train_runtime": 3069.6644, "train_tokens_per_second": 9956.127 }, { "epoch": 4.898305084745763, "grad_norm": 0.46849843859672546, "learning_rate": 8.603295546126821e-08, "loss": 0.2168, "num_input_tokens_seen": 30666960, "step": 289, "train_runtime": 3080.873, "train_tokens_per_second": 9953.984 }, { "epoch": 4.915254237288136, "grad_norm": 0.4828685522079468, "learning_rate": 6.321751102028595e-08, "loss": 0.1848, "num_input_tokens_seen": 30789448, "step": 290, "train_runtime": 3092.3693, "train_tokens_per_second": 9956.588 }, { "epoch": 4.932203389830509, "grad_norm": 0.5445053577423096, "learning_rate": 4.390670589196622e-08, "loss": 0.1965, "num_input_tokens_seen": 30910944, "step": 291, "train_runtime": 3103.817, "train_tokens_per_second": 9959.01 }, { "epoch": 4.9491525423728815, "grad_norm": 0.5450407862663269, "learning_rate": 2.8103254035369285e-08, "loss": 0.2339, "num_input_tokens_seen": 31025152, "step": 292, "train_runtime": 3115.2864, "train_tokens_per_second": 9959.005 }, { "epoch": 4.966101694915254, "grad_norm": 0.48578107357025146, "learning_rate": 1.5809376482767147e-08, "loss": 0.1937, "num_input_tokens_seen": 31135592, "step": 293, "train_runtime": 3126.6738, "train_tokens_per_second": 9958.056 }, { "epoch": 4.983050847457627, "grad_norm": 0.5169346332550049, "learning_rate": 7.0268010274959775e-09, "loss": 0.1793, "num_input_tokens_seen": 31247744, "step": 294, "train_runtime": 3138.117, "train_tokens_per_second": 9957.482 }, { "epoch": 5.0, "grad_norm": 0.4611862003803253, "learning_rate": 1.7567619811281744e-09, "loss": 0.1822, "num_input_tokens_seen": 31362728, "step": 295, "train_runtime": 3149.5513, "train_tokens_per_second": 9957.84 }, { "epoch": 5.0, "num_input_tokens_seen": 31362728, "step": 295, "total_flos": 1.4241950524474655e+18, "train_loss": 0.4095978515633082, "train_runtime": 3265.849, "train_samples_per_second": 1.442, "train_steps_per_second": 0.09 } ], "logging_steps": 1, "max_steps": 295, "num_input_tokens_seen": 31362728, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4241950524474655e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }