Files
Qwen3-8B-SFT-envbench_gpt5-…/trainer_state.json

3105 lines
88 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 25,
"global_step": 295,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01694915254237288,
"grad_norm": 16.34368133544922,
"learning_rate": 0.0,
"loss": 1.6295,
"num_input_tokens_seen": 112224,
"step": 1,
"train_runtime": 24.172,
"train_tokens_per_second": 4642.725
},
{
"epoch": 0.03389830508474576,
"grad_norm": 15.402971267700195,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.6581,
"num_input_tokens_seen": 221080,
"step": 2,
"train_runtime": 34.4875,
"train_tokens_per_second": 6410.436
},
{
"epoch": 0.05084745762711865,
"grad_norm": 14.902987480163574,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.6797,
"num_input_tokens_seen": 326656,
"step": 3,
"train_runtime": 44.7565,
"train_tokens_per_second": 7298.518
},
{
"epoch": 0.06779661016949153,
"grad_norm": 9.34762954711914,
"learning_rate": 5e-06,
"loss": 1.4077,
"num_input_tokens_seen": 433832,
"step": 4,
"train_runtime": 55.089,
"train_tokens_per_second": 7875.117
},
{
"epoch": 0.0847457627118644,
"grad_norm": 5.879909515380859,
"learning_rate": 6.666666666666667e-06,
"loss": 1.3219,
"num_input_tokens_seen": 548184,
"step": 5,
"train_runtime": 65.5147,
"train_tokens_per_second": 8367.344
},
{
"epoch": 0.1016949152542373,
"grad_norm": 2.9537651538848877,
"learning_rate": 8.333333333333334e-06,
"loss": 1.14,
"num_input_tokens_seen": 655632,
"step": 6,
"train_runtime": 75.8973,
"train_tokens_per_second": 8638.415
},
{
"epoch": 0.11864406779661017,
"grad_norm": 2.5423429012298584,
"learning_rate": 1e-05,
"loss": 1.0531,
"num_input_tokens_seen": 759704,
"step": 7,
"train_runtime": 85.9306,
"train_tokens_per_second": 8840.903
},
{
"epoch": 0.13559322033898305,
"grad_norm": 1.7696669101715088,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.9491,
"num_input_tokens_seen": 875624,
"step": 8,
"train_runtime": 96.0912,
"train_tokens_per_second": 9112.426
},
{
"epoch": 0.15254237288135594,
"grad_norm": 1.8822790384292603,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.9397,
"num_input_tokens_seen": 989656,
"step": 9,
"train_runtime": 106.4674,
"train_tokens_per_second": 9295.392
},
{
"epoch": 0.1694915254237288,
"grad_norm": 1.5840418338775635,
"learning_rate": 1.5e-05,
"loss": 0.8694,
"num_input_tokens_seen": 1092152,
"step": 10,
"train_runtime": 116.3444,
"train_tokens_per_second": 9387.229
},
{
"epoch": 0.1864406779661017,
"grad_norm": 1.7065584659576416,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.8185,
"num_input_tokens_seen": 1204856,
"step": 11,
"train_runtime": 126.69,
"train_tokens_per_second": 9510.268
},
{
"epoch": 0.2033898305084746,
"grad_norm": 1.621471643447876,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.7581,
"num_input_tokens_seen": 1302968,
"step": 12,
"train_runtime": 136.9093,
"train_tokens_per_second": 9517.016
},
{
"epoch": 0.22033898305084745,
"grad_norm": 1.5125755071640015,
"learning_rate": 2e-05,
"loss": 0.7413,
"num_input_tokens_seen": 1405520,
"step": 13,
"train_runtime": 147.1109,
"train_tokens_per_second": 9554.152
},
{
"epoch": 0.23728813559322035,
"grad_norm": 1.5941375494003296,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.719,
"num_input_tokens_seen": 1510384,
"step": 14,
"train_runtime": 157.3295,
"train_tokens_per_second": 9600.131
},
{
"epoch": 0.2542372881355932,
"grad_norm": 1.9046858549118042,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.7001,
"num_input_tokens_seen": 1615456,
"step": 15,
"train_runtime": 167.5094,
"train_tokens_per_second": 9643.97
},
{
"epoch": 0.2711864406779661,
"grad_norm": 1.1321921348571777,
"learning_rate": 2.5e-05,
"loss": 0.682,
"num_input_tokens_seen": 1720464,
"step": 16,
"train_runtime": 177.392,
"train_tokens_per_second": 9698.654
},
{
"epoch": 0.288135593220339,
"grad_norm": 1.1452239751815796,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6806,
"num_input_tokens_seen": 1825984,
"step": 17,
"train_runtime": 187.6572,
"train_tokens_per_second": 9730.423
},
{
"epoch": 0.3050847457627119,
"grad_norm": 1.1809570789337158,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.6672,
"num_input_tokens_seen": 1925424,
"step": 18,
"train_runtime": 197.7852,
"train_tokens_per_second": 9734.926
},
{
"epoch": 0.3220338983050847,
"grad_norm": 1.0847970247268677,
"learning_rate": 3e-05,
"loss": 0.6324,
"num_input_tokens_seen": 2024656,
"step": 19,
"train_runtime": 207.8433,
"train_tokens_per_second": 9741.263
},
{
"epoch": 0.3389830508474576,
"grad_norm": 1.0256402492523193,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.6849,
"num_input_tokens_seen": 2123808,
"step": 20,
"train_runtime": 217.9334,
"train_tokens_per_second": 9745.216
},
{
"epoch": 0.3559322033898305,
"grad_norm": 0.9755237102508545,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.6998,
"num_input_tokens_seen": 2236536,
"step": 21,
"train_runtime": 228.3115,
"train_tokens_per_second": 9795.987
},
{
"epoch": 0.3728813559322034,
"grad_norm": 1.146010398864746,
"learning_rate": 3.5e-05,
"loss": 0.6178,
"num_input_tokens_seen": 2345176,
"step": 22,
"train_runtime": 238.4088,
"train_tokens_per_second": 9836.786
},
{
"epoch": 0.3898305084745763,
"grad_norm": 1.0620143413543701,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.6417,
"num_input_tokens_seen": 2450632,
"step": 23,
"train_runtime": 248.3988,
"train_tokens_per_second": 9865.715
},
{
"epoch": 0.4067796610169492,
"grad_norm": 1.7118717432022095,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.681,
"num_input_tokens_seen": 2560968,
"step": 24,
"train_runtime": 258.705,
"train_tokens_per_second": 9899.182
},
{
"epoch": 0.423728813559322,
"grad_norm": 1.1660561561584473,
"learning_rate": 4e-05,
"loss": 0.6877,
"num_input_tokens_seen": 2673304,
"step": 25,
"train_runtime": 269.0249,
"train_tokens_per_second": 9937.014
},
{
"epoch": 0.423728813559322,
"eval_accuracy": 0.8128813839238969,
"eval_loss": 0.671046793460846,
"eval_runtime": 4.3547,
"eval_samples_per_second": 11.482,
"eval_steps_per_second": 2.985,
"num_input_tokens_seen": 2673304,
"step": 25
},
{
"epoch": 0.4406779661016949,
"grad_norm": 0.9917629957199097,
"learning_rate": 4.166666666666667e-05,
"loss": 0.6362,
"num_input_tokens_seen": 2769736,
"step": 26,
"train_runtime": 283.1953,
"train_tokens_per_second": 9780.303
},
{
"epoch": 0.4576271186440678,
"grad_norm": 1.1648198366165161,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.6535,
"num_input_tokens_seen": 2875984,
"step": 27,
"train_runtime": 293.3704,
"train_tokens_per_second": 9803.251
},
{
"epoch": 0.4745762711864407,
"grad_norm": 1.1746153831481934,
"learning_rate": 4.5e-05,
"loss": 0.7158,
"num_input_tokens_seen": 2986112,
"step": 28,
"train_runtime": 303.5965,
"train_tokens_per_second": 9835.791
},
{
"epoch": 0.4915254237288136,
"grad_norm": 1.2997013330459595,
"learning_rate": 4.666666666666667e-05,
"loss": 0.6744,
"num_input_tokens_seen": 3093880,
"step": 29,
"train_runtime": 313.983,
"train_tokens_per_second": 9853.654
},
{
"epoch": 0.5084745762711864,
"grad_norm": 1.8045681715011597,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.6442,
"num_input_tokens_seen": 3202720,
"step": 30,
"train_runtime": 324.0355,
"train_tokens_per_second": 9883.855
},
{
"epoch": 0.5254237288135594,
"grad_norm": 0.9538795351982117,
"learning_rate": 5e-05,
"loss": 0.6758,
"num_input_tokens_seen": 3311816,
"step": 31,
"train_runtime": 334.371,
"train_tokens_per_second": 9904.616
},
{
"epoch": 0.5423728813559322,
"grad_norm": 1.0621432065963745,
"learning_rate": 4.999824323801887e-05,
"loss": 0.7101,
"num_input_tokens_seen": 3415424,
"step": 32,
"train_runtime": 344.5683,
"train_tokens_per_second": 9912.182
},
{
"epoch": 0.559322033898305,
"grad_norm": 0.9179059267044067,
"learning_rate": 4.9992973198972505e-05,
"loss": 0.6392,
"num_input_tokens_seen": 3524136,
"step": 33,
"train_runtime": 354.8912,
"train_tokens_per_second": 9930.186
},
{
"epoch": 0.576271186440678,
"grad_norm": 0.9196195006370544,
"learning_rate": 4.998419062351724e-05,
"loss": 0.5893,
"num_input_tokens_seen": 3631528,
"step": 34,
"train_runtime": 365.0364,
"train_tokens_per_second": 9948.4
},
{
"epoch": 0.5932203389830508,
"grad_norm": 1.0634722709655762,
"learning_rate": 4.997189674596463e-05,
"loss": 0.6018,
"num_input_tokens_seen": 3740096,
"step": 35,
"train_runtime": 375.4073,
"train_tokens_per_second": 9962.768
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.7719584703445435,
"learning_rate": 4.995609329410804e-05,
"loss": 0.6345,
"num_input_tokens_seen": 3846680,
"step": 36,
"train_runtime": 385.614,
"train_tokens_per_second": 9975.469
},
{
"epoch": 0.6271186440677966,
"grad_norm": 0.8835194706916809,
"learning_rate": 4.993678248897972e-05,
"loss": 0.6499,
"num_input_tokens_seen": 3951992,
"step": 37,
"train_runtime": 395.8528,
"train_tokens_per_second": 9983.489
},
{
"epoch": 0.6440677966101694,
"grad_norm": 0.8042682409286499,
"learning_rate": 4.9913967044538734e-05,
"loss": 0.6397,
"num_input_tokens_seen": 4058880,
"step": 38,
"train_runtime": 406.182,
"train_tokens_per_second": 9992.761
},
{
"epoch": 0.6610169491525424,
"grad_norm": 0.8412677049636841,
"learning_rate": 4.9887650167289525e-05,
"loss": 0.6596,
"num_input_tokens_seen": 4152608,
"step": 39,
"train_runtime": 416.0998,
"train_tokens_per_second": 9979.836
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.8093813061714172,
"learning_rate": 4.985783555583123e-05,
"loss": 0.5761,
"num_input_tokens_seen": 4261232,
"step": 40,
"train_runtime": 426.049,
"train_tokens_per_second": 10001.741
},
{
"epoch": 0.6949152542372882,
"grad_norm": 0.7076805233955383,
"learning_rate": 4.982452740033793e-05,
"loss": 0.65,
"num_input_tokens_seen": 4368744,
"step": 41,
"train_runtime": 436.3634,
"train_tokens_per_second": 10011.711
},
{
"epoch": 0.711864406779661,
"grad_norm": 0.8573846220970154,
"learning_rate": 4.978773038196972e-05,
"loss": 0.6319,
"num_input_tokens_seen": 4470520,
"step": 42,
"train_runtime": 446.562,
"train_tokens_per_second": 10010.972
},
{
"epoch": 0.7288135593220338,
"grad_norm": 0.8405332565307617,
"learning_rate": 4.974744967221483e-05,
"loss": 0.6236,
"num_input_tokens_seen": 4575488,
"step": 43,
"train_runtime": 456.8737,
"train_tokens_per_second": 10014.777
},
{
"epoch": 0.7457627118644068,
"grad_norm": 0.816683292388916,
"learning_rate": 4.9703690932162824e-05,
"loss": 0.6215,
"num_input_tokens_seen": 4680592,
"step": 44,
"train_runtime": 467.0937,
"train_tokens_per_second": 10020.671
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.797042727470398,
"learning_rate": 4.9656460311708963e-05,
"loss": 0.6175,
"num_input_tokens_seen": 4783320,
"step": 45,
"train_runtime": 477.1334,
"train_tokens_per_second": 10025.122
},
{
"epoch": 0.7796610169491526,
"grad_norm": 0.7709528803825378,
"learning_rate": 4.960576444868992e-05,
"loss": 0.5959,
"num_input_tokens_seen": 4883848,
"step": 46,
"train_runtime": 487.3769,
"train_tokens_per_second": 10020.681
},
{
"epoch": 0.7966101694915254,
"grad_norm": 0.7377520799636841,
"learning_rate": 4.955161046795088e-05,
"loss": 0.6428,
"num_input_tokens_seen": 4995680,
"step": 47,
"train_runtime": 497.7583,
"train_tokens_per_second": 10036.358
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.7387161254882812,
"learning_rate": 4.9494005980344194e-05,
"loss": 0.6275,
"num_input_tokens_seen": 5106208,
"step": 48,
"train_runtime": 507.8631,
"train_tokens_per_second": 10054.3
},
{
"epoch": 0.8305084745762712,
"grad_norm": 0.6989988088607788,
"learning_rate": 4.943295908165977e-05,
"loss": 0.5977,
"num_input_tokens_seen": 5215272,
"step": 49,
"train_runtime": 518.169,
"train_tokens_per_second": 10064.81
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.6940891146659851,
"learning_rate": 4.936847835148725e-05,
"loss": 0.5901,
"num_input_tokens_seen": 5320152,
"step": 50,
"train_runtime": 528.4296,
"train_tokens_per_second": 10067.853
},
{
"epoch": 0.847457627118644,
"eval_accuracy": 0.8260805511105289,
"eval_loss": 0.5966207981109619,
"eval_runtime": 4.3377,
"eval_samples_per_second": 11.527,
"eval_steps_per_second": 2.997,
"num_input_tokens_seen": 5320152,
"step": 50
},
{
"epoch": 0.864406779661017,
"grad_norm": 0.7691423892974854,
"learning_rate": 4.930057285201027e-05,
"loss": 0.6284,
"num_input_tokens_seen": 5424336,
"step": 51,
"train_runtime": 543.1554,
"train_tokens_per_second": 9986.711
},
{
"epoch": 0.8813559322033898,
"grad_norm": 0.7801838517189026,
"learning_rate": 4.9229252126732814e-05,
"loss": 0.6046,
"num_input_tokens_seen": 5536400,
"step": 52,
"train_runtime": 553.5015,
"train_tokens_per_second": 10002.503
},
{
"epoch": 0.8983050847457628,
"grad_norm": 0.7027463316917419,
"learning_rate": 4.9154526199137964e-05,
"loss": 0.5333,
"num_input_tokens_seen": 5647488,
"step": 53,
"train_runtime": 563.8958,
"train_tokens_per_second": 10015.127
},
{
"epoch": 0.9152542372881356,
"grad_norm": 0.6528388261795044,
"learning_rate": 4.9076405571279207e-05,
"loss": 0.6442,
"num_input_tokens_seen": 5751248,
"step": 54,
"train_runtime": 574.1388,
"train_tokens_per_second": 10017.173
},
{
"epoch": 0.9322033898305084,
"grad_norm": 0.8217389583587646,
"learning_rate": 4.8994901222304465e-05,
"loss": 0.663,
"num_input_tokens_seen": 5858104,
"step": 55,
"train_runtime": 584.4942,
"train_tokens_per_second": 10022.518
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.6466934084892273,
"learning_rate": 4.891002460691306e-05,
"loss": 0.5856,
"num_input_tokens_seen": 5958920,
"step": 56,
"train_runtime": 594.703,
"train_tokens_per_second": 10019.993
},
{
"epoch": 0.9661016949152542,
"grad_norm": 0.6264472603797913,
"learning_rate": 4.882178765374589e-05,
"loss": 0.5298,
"num_input_tokens_seen": 6067168,
"step": 57,
"train_runtime": 605.0659,
"train_tokens_per_second": 10027.285
},
{
"epoch": 0.9830508474576272,
"grad_norm": 0.6835854053497314,
"learning_rate": 4.87302027637089e-05,
"loss": 0.6364,
"num_input_tokens_seen": 6164248,
"step": 58,
"train_runtime": 615.1097,
"train_tokens_per_second": 10021.38
},
{
"epoch": 1.0,
"grad_norm": 0.715363085269928,
"learning_rate": 4.863528280823033e-05,
"loss": 0.5508,
"num_input_tokens_seen": 6269656,
"step": 59,
"train_runtime": 625.2947,
"train_tokens_per_second": 10026.722
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.6624979972839355,
"learning_rate": 4.853704112745172e-05,
"loss": 0.5221,
"num_input_tokens_seen": 6377200,
"step": 60,
"train_runtime": 635.7108,
"train_tokens_per_second": 10031.606
},
{
"epoch": 1.0338983050847457,
"grad_norm": 0.7150872349739075,
"learning_rate": 4.8435491528353026e-05,
"loss": 0.5232,
"num_input_tokens_seen": 6478784,
"step": 61,
"train_runtime": 645.6922,
"train_tokens_per_second": 10033.858
},
{
"epoch": 1.0508474576271187,
"grad_norm": 0.643916666507721,
"learning_rate": 4.833064828281225e-05,
"loss": 0.4441,
"num_input_tokens_seen": 6594704,
"step": 62,
"train_runtime": 656.0298,
"train_tokens_per_second": 10052.445
},
{
"epoch": 1.0677966101694916,
"grad_norm": 0.6635810732841492,
"learning_rate": 4.822252612559961e-05,
"loss": 0.5256,
"num_input_tokens_seen": 6694384,
"step": 63,
"train_runtime": 665.4726,
"train_tokens_per_second": 10059.593
},
{
"epoch": 1.0847457627118644,
"grad_norm": 0.7547369003295898,
"learning_rate": 4.811114025230672e-05,
"loss": 0.518,
"num_input_tokens_seen": 6795736,
"step": 64,
"train_runtime": 675.6098,
"train_tokens_per_second": 10058.67
},
{
"epoch": 1.1016949152542372,
"grad_norm": 0.6073519587516785,
"learning_rate": 4.799650631721096e-05,
"loss": 0.5539,
"num_input_tokens_seen": 6903368,
"step": 65,
"train_runtime": 685.8813,
"train_tokens_per_second": 10064.96
},
{
"epoch": 1.11864406779661,
"grad_norm": 0.6640107035636902,
"learning_rate": 4.787864043107546e-05,
"loss": 0.4718,
"num_input_tokens_seen": 7002776,
"step": 66,
"train_runtime": 695.8898,
"train_tokens_per_second": 10063.052
},
{
"epoch": 1.1355932203389831,
"grad_norm": 0.6934003829956055,
"learning_rate": 4.775755915888483e-05,
"loss": 0.4699,
"num_input_tokens_seen": 7095696,
"step": 67,
"train_runtime": 705.6961,
"train_tokens_per_second": 10054.89
},
{
"epoch": 1.152542372881356,
"grad_norm": 0.7117975950241089,
"learning_rate": 4.763327951751711e-05,
"loss": 0.5381,
"num_input_tokens_seen": 7206656,
"step": 68,
"train_runtime": 716.9173,
"train_tokens_per_second": 10052.283
},
{
"epoch": 1.1694915254237288,
"grad_norm": 0.7451320886611938,
"learning_rate": 4.750581897335222e-05,
"loss": 0.4825,
"num_input_tokens_seen": 7324712,
"step": 69,
"train_runtime": 728.3753,
"train_tokens_per_second": 10056.234
},
{
"epoch": 1.1864406779661016,
"grad_norm": 0.6758464574813843,
"learning_rate": 4.737519543981721e-05,
"loss": 0.5406,
"num_input_tokens_seen": 7436408,
"step": 70,
"train_runtime": 739.8095,
"train_tokens_per_second": 10051.788
},
{
"epoch": 1.2033898305084745,
"grad_norm": 0.6071574091911316,
"learning_rate": 4.724142727486869e-05,
"loss": 0.5419,
"num_input_tokens_seen": 7537992,
"step": 71,
"train_runtime": 750.8169,
"train_tokens_per_second": 10039.721
},
{
"epoch": 1.2203389830508475,
"grad_norm": 0.5895605087280273,
"learning_rate": 4.7104533278412763e-05,
"loss": 0.5035,
"num_input_tokens_seen": 7651968,
"step": 72,
"train_runtime": 762.2806,
"train_tokens_per_second": 10038.256
},
{
"epoch": 1.2372881355932204,
"grad_norm": 0.6282557845115662,
"learning_rate": 4.696453268966291e-05,
"loss": 0.529,
"num_input_tokens_seen": 7748848,
"step": 73,
"train_runtime": 773.5666,
"train_tokens_per_second": 10017.04
},
{
"epoch": 1.2542372881355932,
"grad_norm": 0.6695516705513,
"learning_rate": 4.6821445184436066e-05,
"loss": 0.4829,
"num_input_tokens_seen": 7845760,
"step": 74,
"train_runtime": 783.5687,
"train_tokens_per_second": 10012.856
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.5903871059417725,
"learning_rate": 4.667529087238736e-05,
"loss": 0.4792,
"num_input_tokens_seen": 7948872,
"step": 75,
"train_runtime": 793.8129,
"train_tokens_per_second": 10013.534
},
{
"epoch": 1.271186440677966,
"eval_accuracy": 0.8345456753981751,
"eval_loss": 0.5687937140464783,
"eval_runtime": 4.3453,
"eval_samples_per_second": 11.507,
"eval_steps_per_second": 2.992,
"num_input_tokens_seen": 7948872,
"step": 75
},
{
"epoch": 1.288135593220339,
"grad_norm": 0.5066044330596924,
"learning_rate": 4.652609029418389e-05,
"loss": 0.4475,
"num_input_tokens_seen": 8053096,
"step": 76,
"train_runtime": 808.4485,
"train_tokens_per_second": 9961.174
},
{
"epoch": 1.305084745762712,
"grad_norm": 0.7508769035339355,
"learning_rate": 4.6373864418617935e-05,
"loss": 0.4814,
"num_input_tokens_seen": 8165544,
"step": 77,
"train_runtime": 818.7742,
"train_tokens_per_second": 9972.888
},
{
"epoch": 1.3220338983050848,
"grad_norm": 0.6583501100540161,
"learning_rate": 4.6218634639659954e-05,
"loss": 0.551,
"num_input_tokens_seen": 8280040,
"step": 78,
"train_runtime": 829.0981,
"train_tokens_per_second": 9986.803
},
{
"epoch": 1.3389830508474576,
"grad_norm": 0.5735164284706116,
"learning_rate": 4.606042277345185e-05,
"loss": 0.4689,
"num_input_tokens_seen": 8380248,
"step": 79,
"train_runtime": 839.0856,
"train_tokens_per_second": 9987.357
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.6262232065200806,
"learning_rate": 4.5899251055240963e-05,
"loss": 0.4489,
"num_input_tokens_seen": 8488848,
"step": 80,
"train_runtime": 849.3643,
"train_tokens_per_second": 9994.354
},
{
"epoch": 1.3728813559322033,
"grad_norm": 0.7844845652580261,
"learning_rate": 4.573514213625505e-05,
"loss": 0.593,
"num_input_tokens_seen": 8593232,
"step": 81,
"train_runtime": 859.579,
"train_tokens_per_second": 9997.024
},
{
"epoch": 1.3898305084745763,
"grad_norm": 0.6617870926856995,
"learning_rate": 4.5568119080518864e-05,
"loss": 0.5531,
"num_input_tokens_seen": 8692096,
"step": 82,
"train_runtime": 869.7519,
"train_tokens_per_second": 9993.765
},
{
"epoch": 1.4067796610169492,
"grad_norm": 0.621767520904541,
"learning_rate": 4.539820536161278e-05,
"loss": 0.4688,
"num_input_tokens_seen": 8802024,
"step": 83,
"train_runtime": 880.1033,
"train_tokens_per_second": 10001.126
},
{
"epoch": 1.423728813559322,
"grad_norm": 0.7459584474563599,
"learning_rate": 4.522542485937369e-05,
"loss": 0.4728,
"num_input_tokens_seen": 8915296,
"step": 84,
"train_runtime": 890.2946,
"train_tokens_per_second": 10013.871
},
{
"epoch": 1.4406779661016949,
"grad_norm": 0.711216390132904,
"learning_rate": 4.504980185653899e-05,
"loss": 0.5405,
"num_input_tokens_seen": 9018176,
"step": 85,
"train_runtime": 900.7028,
"train_tokens_per_second": 10012.377
},
{
"epoch": 1.457627118644068,
"grad_norm": 0.6414653062820435,
"learning_rate": 4.4871361035333836e-05,
"loss": 0.4356,
"num_input_tokens_seen": 9127880,
"step": 86,
"train_runtime": 911.0595,
"train_tokens_per_second": 10018.972
},
{
"epoch": 1.4745762711864407,
"grad_norm": 0.6491935849189758,
"learning_rate": 4.469012747400227e-05,
"loss": 0.4982,
"num_input_tokens_seen": 9241680,
"step": 87,
"train_runtime": 921.4507,
"train_tokens_per_second": 10029.49
},
{
"epoch": 1.4915254237288136,
"grad_norm": 0.6951228380203247,
"learning_rate": 4.450612664328271e-05,
"loss": 0.535,
"num_input_tokens_seen": 9349080,
"step": 88,
"train_runtime": 931.7221,
"train_tokens_per_second": 10034.194
},
{
"epoch": 1.5084745762711864,
"grad_norm": 0.5840334892272949,
"learning_rate": 4.431938440282828e-05,
"loss": 0.4983,
"num_input_tokens_seen": 9460424,
"step": 89,
"train_runtime": 942.087,
"train_tokens_per_second": 10041.986
},
{
"epoch": 1.5254237288135593,
"grad_norm": 0.5770622491836548,
"learning_rate": 4.412992699757244e-05,
"loss": 0.4654,
"num_input_tokens_seen": 9577240,
"step": 90,
"train_runtime": 952.4898,
"train_tokens_per_second": 10054.953
},
{
"epoch": 1.542372881355932,
"grad_norm": 0.673603355884552,
"learning_rate": 4.3937781054040505e-05,
"loss": 0.4369,
"num_input_tokens_seen": 9680760,
"step": 91,
"train_runtime": 962.7675,
"train_tokens_per_second": 10055.138
},
{
"epoch": 1.559322033898305,
"grad_norm": 0.6873490214347839,
"learning_rate": 4.374297357660756e-05,
"loss": 0.4837,
"num_input_tokens_seen": 9791984,
"step": 92,
"train_runtime": 973.1429,
"train_tokens_per_second": 10062.226
},
{
"epoch": 1.576271186440678,
"grad_norm": 0.5499905943870544,
"learning_rate": 4.354553194370321e-05,
"loss": 0.4735,
"num_input_tokens_seen": 9895992,
"step": 93,
"train_runtime": 983.411,
"train_tokens_per_second": 10062.926
},
{
"epoch": 1.5932203389830508,
"grad_norm": 0.553594172000885,
"learning_rate": 4.334548390396377e-05,
"loss": 0.4411,
"num_input_tokens_seen": 10007400,
"step": 94,
"train_runtime": 993.6791,
"train_tokens_per_second": 10071.058
},
{
"epoch": 1.6101694915254239,
"grad_norm": 0.5988419651985168,
"learning_rate": 4.3142857572332504e-05,
"loss": 0.4829,
"num_input_tokens_seen": 10122232,
"step": 95,
"train_runtime": 1003.9748,
"train_tokens_per_second": 10082.157
},
{
"epoch": 1.6271186440677967,
"grad_norm": 0.5911242961883545,
"learning_rate": 4.293768142610828e-05,
"loss": 0.4373,
"num_input_tokens_seen": 10211000,
"step": 96,
"train_runtime": 1013.7445,
"train_tokens_per_second": 10072.557
},
{
"epoch": 1.6440677966101696,
"grad_norm": 0.640306830406189,
"learning_rate": 4.272998430094334e-05,
"loss": 0.4391,
"num_input_tokens_seen": 10323768,
"step": 97,
"train_runtime": 1024.1228,
"train_tokens_per_second": 10080.596
},
{
"epoch": 1.6610169491525424,
"grad_norm": 0.6589894890785217,
"learning_rate": 4.2519795386790716e-05,
"loss": 0.5014,
"num_input_tokens_seen": 10423272,
"step": 98,
"train_runtime": 1034.1041,
"train_tokens_per_second": 10079.519
},
{
"epoch": 1.6779661016949152,
"grad_norm": 0.6693786978721619,
"learning_rate": 4.23071442238019e-05,
"loss": 0.5095,
"num_input_tokens_seen": 10535696,
"step": 99,
"train_runtime": 1044.4991,
"train_tokens_per_second": 10086.841
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.597254753112793,
"learning_rate": 4.209206069817513e-05,
"loss": 0.4444,
"num_input_tokens_seen": 10640880,
"step": 100,
"train_runtime": 1054.8328,
"train_tokens_per_second": 10087.741
},
{
"epoch": 1.694915254237288,
"eval_accuracy": 0.8374839692392347,
"eval_loss": 0.5552906394004822,
"eval_runtime": 4.3531,
"eval_samples_per_second": 11.486,
"eval_steps_per_second": 2.986,
"num_input_tokens_seen": 10640880,
"step": 100
},
{
"epoch": 1.711864406779661,
"grad_norm": 0.7455243468284607,
"learning_rate": 4.187457503795527e-05,
"loss": 0.4959,
"num_input_tokens_seen": 10745624,
"step": 101,
"train_runtime": 1069.4889,
"train_tokens_per_second": 10047.439
},
{
"epoch": 1.7288135593220337,
"grad_norm": 0.5884683132171631,
"learning_rate": 4.165471780878546e-05,
"loss": 0.433,
"num_input_tokens_seen": 10844744,
"step": 102,
"train_runtime": 1079.7793,
"train_tokens_per_second": 10043.482
},
{
"epoch": 1.7457627118644068,
"grad_norm": 0.5727217793464661,
"learning_rate": 4.1432519909611415e-05,
"loss": 0.4856,
"num_input_tokens_seen": 10952176,
"step": 103,
"train_runtime": 1090.1011,
"train_tokens_per_second": 10046.936
},
{
"epoch": 1.7627118644067796,
"grad_norm": 0.5741541385650635,
"learning_rate": 4.120801256833887e-05,
"loss": 0.4413,
"num_input_tokens_seen": 11066704,
"step": 104,
"train_runtime": 1100.4868,
"train_tokens_per_second": 10056.19
},
{
"epoch": 1.7796610169491527,
"grad_norm": 0.6238065958023071,
"learning_rate": 4.098122733744475e-05,
"loss": 0.4558,
"num_input_tokens_seen": 11167664,
"step": 105,
"train_runtime": 1110.4653,
"train_tokens_per_second": 10056.743
},
{
"epoch": 1.7966101694915255,
"grad_norm": 0.59928959608078,
"learning_rate": 4.075219608954278e-05,
"loss": 0.5277,
"num_input_tokens_seen": 11267192,
"step": 106,
"train_runtime": 1120.3081,
"train_tokens_per_second": 10057.226
},
{
"epoch": 1.8135593220338984,
"grad_norm": 0.5390007495880127,
"learning_rate": 4.052095101290406e-05,
"loss": 0.4027,
"num_input_tokens_seen": 11381440,
"step": 107,
"train_runtime": 1130.5809,
"train_tokens_per_second": 10066.895
},
{
"epoch": 1.8305084745762712,
"grad_norm": 0.49983635544776917,
"learning_rate": 4.02875246069333e-05,
"loss": 0.459,
"num_input_tokens_seen": 11478120,
"step": 108,
"train_runtime": 1140.6584,
"train_tokens_per_second": 10062.715
},
{
"epoch": 1.847457627118644,
"grad_norm": 0.5167087316513062,
"learning_rate": 4.005194967760135e-05,
"loss": 0.4386,
"num_input_tokens_seen": 11584096,
"step": 109,
"train_runtime": 1150.7061,
"train_tokens_per_second": 10066.946
},
{
"epoch": 1.8644067796610169,
"grad_norm": 0.5469056963920593,
"learning_rate": 3.981425933283456e-05,
"loss": 0.3941,
"num_input_tokens_seen": 11695448,
"step": 110,
"train_runtime": 1161.1546,
"train_tokens_per_second": 10072.258
},
{
"epoch": 1.8813559322033897,
"grad_norm": 0.5128666758537292,
"learning_rate": 3.95744869778618e-05,
"loss": 0.4912,
"num_input_tokens_seen": 11805544,
"step": 111,
"train_runtime": 1171.4829,
"train_tokens_per_second": 10077.436
},
{
"epoch": 1.8983050847457628,
"grad_norm": 0.5504162311553955,
"learning_rate": 3.933266631051968e-05,
"loss": 0.4393,
"num_input_tokens_seen": 11903888,
"step": 112,
"train_runtime": 1181.3806,
"train_tokens_per_second": 10076.252
},
{
"epoch": 1.9152542372881356,
"grad_norm": 0.5731999278068542,
"learning_rate": 3.9088831316516564e-05,
"loss": 0.4292,
"num_input_tokens_seen": 12004128,
"step": 113,
"train_runtime": 1191.1538,
"train_tokens_per_second": 10077.732
},
{
"epoch": 1.9322033898305084,
"grad_norm": 0.5877408981323242,
"learning_rate": 3.8843016264656215e-05,
"loss": 0.4825,
"num_input_tokens_seen": 12115840,
"step": 114,
"train_runtime": 1201.5735,
"train_tokens_per_second": 10083.312
},
{
"epoch": 1.9491525423728815,
"grad_norm": 0.6028397679328918,
"learning_rate": 3.8595255702021635e-05,
"loss": 0.5266,
"num_input_tokens_seen": 12232504,
"step": 115,
"train_runtime": 1211.9761,
"train_tokens_per_second": 10093.024
},
{
"epoch": 1.9661016949152543,
"grad_norm": 0.6042894721031189,
"learning_rate": 3.8345584449119776e-05,
"loss": 0.4424,
"num_input_tokens_seen": 12339872,
"step": 116,
"train_runtime": 1222.2981,
"train_tokens_per_second": 10095.632
},
{
"epoch": 1.9830508474576272,
"grad_norm": 0.6503923535346985,
"learning_rate": 3.809403759498782e-05,
"loss": 0.4777,
"num_input_tokens_seen": 12440032,
"step": 117,
"train_runtime": 1232.5124,
"train_tokens_per_second": 10093.231
},
{
"epoch": 2.0,
"grad_norm": 0.5169121026992798,
"learning_rate": 3.784065049226176e-05,
"loss": 0.4401,
"num_input_tokens_seen": 12542672,
"step": 118,
"train_runtime": 1242.4604,
"train_tokens_per_second": 10095.028
},
{
"epoch": 2.016949152542373,
"grad_norm": 0.5643147230148315,
"learning_rate": 3.758545875220788e-05,
"loss": 0.3796,
"num_input_tokens_seen": 12655008,
"step": 119,
"train_runtime": 1252.8834,
"train_tokens_per_second": 10100.707
},
{
"epoch": 2.0338983050847457,
"grad_norm": 0.5204115509986877,
"learning_rate": 3.732849823971793e-05,
"loss": 0.3662,
"num_input_tokens_seen": 12769960,
"step": 120,
"train_runtime": 1263.2752,
"train_tokens_per_second": 10108.613
},
{
"epoch": 2.0508474576271185,
"grad_norm": 0.5998212695121765,
"learning_rate": 3.706980506826863e-05,
"loss": 0.3615,
"num_input_tokens_seen": 12881984,
"step": 121,
"train_runtime": 1273.3162,
"train_tokens_per_second": 10116.878
},
{
"epoch": 2.0677966101694913,
"grad_norm": 0.4841426908969879,
"learning_rate": 3.6809415594846236e-05,
"loss": 0.3831,
"num_input_tokens_seen": 12981888,
"step": 122,
"train_runtime": 1283.5823,
"train_tokens_per_second": 10113.795
},
{
"epoch": 2.084745762711864,
"grad_norm": 0.4841645359992981,
"learning_rate": 3.6547366414836936e-05,
"loss": 0.2879,
"num_input_tokens_seen": 13082360,
"step": 123,
"train_runtime": 1293.8742,
"train_tokens_per_second": 10110.998
},
{
"epoch": 2.1016949152542375,
"grad_norm": 0.5998939871788025,
"learning_rate": 3.628369435688366e-05,
"loss": 0.4776,
"num_input_tokens_seen": 13195264,
"step": 124,
"train_runtime": 1304.2365,
"train_tokens_per_second": 10117.233
},
{
"epoch": 2.1186440677966103,
"grad_norm": 0.6515780687332153,
"learning_rate": 3.601843647771016e-05,
"loss": 0.3788,
"num_input_tokens_seen": 13298752,
"step": 125,
"train_runtime": 1314.4968,
"train_tokens_per_second": 10116.991
},
{
"epoch": 2.1186440677966103,
"eval_accuracy": 0.8475513962799617,
"eval_loss": 0.529534637928009,
"eval_runtime": 4.3446,
"eval_samples_per_second": 11.509,
"eval_steps_per_second": 2.992,
"num_input_tokens_seen": 13298752,
"step": 125
},
{
"epoch": 2.135593220338983,
"grad_norm": 0.535269558429718,
"learning_rate": 3.575163005691302e-05,
"loss": 0.3697,
"num_input_tokens_seen": 13394544,
"step": 126,
"train_runtime": 1328.823,
"train_tokens_per_second": 10080.006
},
{
"epoch": 2.152542372881356,
"grad_norm": 0.639999270439148,
"learning_rate": 3.548331259172234e-05,
"loss": 0.3783,
"num_input_tokens_seen": 13503584,
"step": 127,
"train_runtime": 1339.1132,
"train_tokens_per_second": 10083.975
},
{
"epoch": 2.169491525423729,
"grad_norm": 0.6342602372169495,
"learning_rate": 3.5213521791731875e-05,
"loss": 0.3652,
"num_input_tokens_seen": 13607464,
"step": 128,
"train_runtime": 1349.4055,
"train_tokens_per_second": 10084.044
},
{
"epoch": 2.1864406779661016,
"grad_norm": 0.5488477945327759,
"learning_rate": 3.4942295573599245e-05,
"loss": 0.366,
"num_input_tokens_seen": 13708112,
"step": 129,
"train_runtime": 1359.3979,
"train_tokens_per_second": 10083.958
},
{
"epoch": 2.2033898305084745,
"grad_norm": 0.5964322090148926,
"learning_rate": 3.46696720557171e-05,
"loss": 0.3134,
"num_input_tokens_seen": 13815872,
"step": 130,
"train_runtime": 1369.6909,
"train_tokens_per_second": 10086.854
},
{
"epoch": 2.2203389830508473,
"grad_norm": 0.5261477828025818,
"learning_rate": 3.4395689552855955e-05,
"loss": 0.3162,
"num_input_tokens_seen": 13920760,
"step": 131,
"train_runtime": 1379.5953,
"train_tokens_per_second": 10090.467
},
{
"epoch": 2.23728813559322,
"grad_norm": 0.5315053462982178,
"learning_rate": 3.412038657077939e-05,
"loss": 0.3835,
"num_input_tokens_seen": 14014280,
"step": 132,
"train_runtime": 1389.228,
"train_tokens_per_second": 10087.818
},
{
"epoch": 2.2542372881355934,
"grad_norm": 0.518429696559906,
"learning_rate": 3.3843801800832354e-05,
"loss": 0.3628,
"num_input_tokens_seen": 14109848,
"step": 133,
"train_runtime": 1399.3481,
"train_tokens_per_second": 10083.158
},
{
"epoch": 2.2711864406779663,
"grad_norm": 0.5626394152641296,
"learning_rate": 3.356597411450353e-05,
"loss": 0.3635,
"num_input_tokens_seen": 14217008,
"step": 134,
"train_runtime": 1409.3478,
"train_tokens_per_second": 10087.65
},
{
"epoch": 2.288135593220339,
"grad_norm": 0.5665557384490967,
"learning_rate": 3.328694255796226e-05,
"loss": 0.3426,
"num_input_tokens_seen": 14326608,
"step": 135,
"train_runtime": 1419.5891,
"train_tokens_per_second": 10092.081
},
{
"epoch": 2.305084745762712,
"grad_norm": 0.5634236931800842,
"learning_rate": 3.300674634657094e-05,
"loss": 0.3817,
"num_input_tokens_seen": 14425192,
"step": 136,
"train_runtime": 1429.8654,
"train_tokens_per_second": 10088.496
},
{
"epoch": 2.3220338983050848,
"grad_norm": 0.5703310966491699,
"learning_rate": 3.272542485937369e-05,
"loss": 0.367,
"num_input_tokens_seen": 14516048,
"step": 137,
"train_runtime": 1439.5089,
"train_tokens_per_second": 10084.028
},
{
"epoch": 2.3389830508474576,
"grad_norm": 0.4954843819141388,
"learning_rate": 3.244301763356195e-05,
"loss": 0.4014,
"num_input_tokens_seen": 14612784,
"step": 138,
"train_runtime": 1449.7059,
"train_tokens_per_second": 10079.827
},
{
"epoch": 2.3559322033898304,
"grad_norm": 0.4663669168949127,
"learning_rate": 3.215956435891793e-05,
"loss": 0.3442,
"num_input_tokens_seen": 14712832,
"step": 139,
"train_runtime": 1459.9018,
"train_tokens_per_second": 10077.96
},
{
"epoch": 2.3728813559322033,
"grad_norm": 0.482755571603775,
"learning_rate": 3.187510487223655e-05,
"loss": 0.3084,
"num_input_tokens_seen": 14826672,
"step": 140,
"train_runtime": 1470.2591,
"train_tokens_per_second": 10084.394
},
{
"epoch": 2.389830508474576,
"grad_norm": 0.5014492273330688,
"learning_rate": 3.158967915172669e-05,
"loss": 0.3533,
"num_input_tokens_seen": 14931848,
"step": 141,
"train_runtime": 1480.4925,
"train_tokens_per_second": 10085.73
},
{
"epoch": 2.406779661016949,
"grad_norm": 0.5160117745399475,
"learning_rate": 3.130332731139272e-05,
"loss": 0.3522,
"num_input_tokens_seen": 15033416,
"step": 142,
"train_runtime": 1490.7429,
"train_tokens_per_second": 10084.513
},
{
"epoch": 2.423728813559322,
"grad_norm": 0.547079861164093,
"learning_rate": 3.101608959539671e-05,
"loss": 0.3409,
"num_input_tokens_seen": 15144040,
"step": 143,
"train_runtime": 1501.0195,
"train_tokens_per_second": 10089.17
},
{
"epoch": 2.440677966101695,
"grad_norm": 0.6849552989006042,
"learning_rate": 3.072800637240261e-05,
"loss": 0.399,
"num_input_tokens_seen": 15253280,
"step": 144,
"train_runtime": 1511.2733,
"train_tokens_per_second": 10092.999
},
{
"epoch": 2.457627118644068,
"grad_norm": 0.4753686487674713,
"learning_rate": 3.0439118129902698e-05,
"loss": 0.2888,
"num_input_tokens_seen": 15361952,
"step": 145,
"train_runtime": 1521.6339,
"train_tokens_per_second": 10095.695
},
{
"epoch": 2.4745762711864407,
"grad_norm": 0.49981778860092163,
"learning_rate": 3.014946546852746e-05,
"loss": 0.4014,
"num_input_tokens_seen": 15457896,
"step": 146,
"train_runtime": 1531.8793,
"train_tokens_per_second": 10090.806
},
{
"epoch": 2.4915254237288136,
"grad_norm": 0.5016078948974609,
"learning_rate": 2.9859089096339566e-05,
"loss": 0.3612,
"num_input_tokens_seen": 15570464,
"step": 147,
"train_runtime": 1542.2135,
"train_tokens_per_second": 10096.179
},
{
"epoch": 2.5084745762711864,
"grad_norm": 0.5225934982299805,
"learning_rate": 2.9568029823112688e-05,
"loss": 0.4234,
"num_input_tokens_seen": 15681264,
"step": 148,
"train_runtime": 1552.5788,
"train_tokens_per_second": 10100.141
},
{
"epoch": 2.5254237288135593,
"grad_norm": 0.5351992845535278,
"learning_rate": 2.9276328554596055e-05,
"loss": 0.4073,
"num_input_tokens_seen": 15788384,
"step": 149,
"train_runtime": 1562.9111,
"train_tokens_per_second": 10101.908
},
{
"epoch": 2.542372881355932,
"grad_norm": 0.6268981099128723,
"learning_rate": 2.8984026286765542e-05,
"loss": 0.435,
"num_input_tokens_seen": 15891024,
"step": 150,
"train_runtime": 1573.2633,
"train_tokens_per_second": 10100.677
},
{
"epoch": 2.542372881355932,
"eval_accuracy": 0.8443465952747787,
"eval_loss": 0.5156561136245728,
"eval_runtime": 4.3482,
"eval_samples_per_second": 11.499,
"eval_steps_per_second": 2.99,
"num_input_tokens_seen": 15891024,
"step": 150
},
{
"epoch": 2.559322033898305,
"grad_norm": 0.5867091417312622,
"learning_rate": 2.8691164100062034e-05,
"loss": 0.4432,
"num_input_tokens_seen": 15998080,
"step": 151,
"train_runtime": 1588.0035,
"train_tokens_per_second": 10074.335
},
{
"epoch": 2.576271186440678,
"grad_norm": 0.5568689703941345,
"learning_rate": 2.8397783153617958e-05,
"loss": 0.4135,
"num_input_tokens_seen": 16111136,
"step": 152,
"train_runtime": 1598.1101,
"train_tokens_per_second": 10081.368
},
{
"epoch": 2.593220338983051,
"grad_norm": 0.48513880372047424,
"learning_rate": 2.8103924679472737e-05,
"loss": 0.3563,
"num_input_tokens_seen": 16210312,
"step": 153,
"train_runtime": 1608.3504,
"train_tokens_per_second": 10078.844
},
{
"epoch": 2.610169491525424,
"grad_norm": 0.5153747200965881,
"learning_rate": 2.7809629976777973e-05,
"loss": 0.3564,
"num_input_tokens_seen": 16315056,
"step": 154,
"train_runtime": 1618.3735,
"train_tokens_per_second": 10081.144
},
{
"epoch": 2.6271186440677967,
"grad_norm": 0.5335708856582642,
"learning_rate": 2.7514940405993272e-05,
"loss": 0.3611,
"num_input_tokens_seen": 16417080,
"step": 155,
"train_runtime": 1628.6088,
"train_tokens_per_second": 10080.432
},
{
"epoch": 2.6440677966101696,
"grad_norm": 0.5242218971252441,
"learning_rate": 2.7219897383073373e-05,
"loss": 0.3847,
"num_input_tokens_seen": 16532576,
"step": 156,
"train_runtime": 1638.9544,
"train_tokens_per_second": 10087.27
},
{
"epoch": 2.6610169491525424,
"grad_norm": 0.6446425318717957,
"learning_rate": 2.6924542373647505e-05,
"loss": 0.3309,
"num_input_tokens_seen": 16644840,
"step": 157,
"train_runtime": 1649.3099,
"train_tokens_per_second": 10092.003
},
{
"epoch": 2.6779661016949152,
"grad_norm": 0.47864437103271484,
"learning_rate": 2.6628916887191784e-05,
"loss": 0.3207,
"num_input_tokens_seen": 16745864,
"step": 158,
"train_runtime": 1659.0578,
"train_tokens_per_second": 10093.599
},
{
"epoch": 2.694915254237288,
"grad_norm": 0.5996072292327881,
"learning_rate": 2.633306247119544e-05,
"loss": 0.3676,
"num_input_tokens_seen": 16858920,
"step": 159,
"train_runtime": 1669.3621,
"train_tokens_per_second": 10099.019
},
{
"epoch": 2.711864406779661,
"grad_norm": 0.7424564361572266,
"learning_rate": 2.603702070532167e-05,
"loss": 0.3098,
"num_input_tokens_seen": 16968168,
"step": 160,
"train_runtime": 1679.5951,
"train_tokens_per_second": 10102.535
},
{
"epoch": 2.7288135593220337,
"grad_norm": 0.4871710240840912,
"learning_rate": 2.5740833195563996e-05,
"loss": 0.3391,
"num_input_tokens_seen": 17075304,
"step": 161,
"train_runtime": 1689.916,
"train_tokens_per_second": 10104.232
},
{
"epoch": 2.7457627118644066,
"grad_norm": 0.5770386457443237,
"learning_rate": 2.5444541568398937e-05,
"loss": 0.3637,
"num_input_tokens_seen": 17193960,
"step": 162,
"train_runtime": 1700.2957,
"train_tokens_per_second": 10112.335
},
{
"epoch": 2.7627118644067794,
"grad_norm": 0.4808778464794159,
"learning_rate": 2.5148187464935763e-05,
"loss": 0.3388,
"num_input_tokens_seen": 17304184,
"step": 163,
"train_runtime": 1710.5416,
"train_tokens_per_second": 10116.202
},
{
"epoch": 2.7796610169491527,
"grad_norm": 0.596011757850647,
"learning_rate": 2.485181253506424e-05,
"loss": 0.361,
"num_input_tokens_seen": 17408944,
"step": 164,
"train_runtime": 1720.6002,
"train_tokens_per_second": 10117.948
},
{
"epoch": 2.7966101694915255,
"grad_norm": 0.5141506195068359,
"learning_rate": 2.4555458431601065e-05,
"loss": 0.3551,
"num_input_tokens_seen": 17512736,
"step": 165,
"train_runtime": 1730.778,
"train_tokens_per_second": 10118.418
},
{
"epoch": 2.8135593220338984,
"grad_norm": 0.49830204248428345,
"learning_rate": 2.4259166804436006e-05,
"loss": 0.386,
"num_input_tokens_seen": 17617368,
"step": 166,
"train_runtime": 1740.8375,
"train_tokens_per_second": 10120.053
},
{
"epoch": 2.830508474576271,
"grad_norm": 0.4794394373893738,
"learning_rate": 2.3962979294678337e-05,
"loss": 0.3624,
"num_input_tokens_seen": 17723424,
"step": 167,
"train_runtime": 1751.1217,
"train_tokens_per_second": 10121.184
},
{
"epoch": 2.847457627118644,
"grad_norm": 0.4836881160736084,
"learning_rate": 2.3666937528804563e-05,
"loss": 0.3517,
"num_input_tokens_seen": 17840688,
"step": 168,
"train_runtime": 1761.4439,
"train_tokens_per_second": 10128.445
},
{
"epoch": 2.864406779661017,
"grad_norm": 0.6007115840911865,
"learning_rate": 2.337108311280822e-05,
"loss": 0.345,
"num_input_tokens_seen": 17958736,
"step": 169,
"train_runtime": 1771.8446,
"train_tokens_per_second": 10135.616
},
{
"epoch": 2.8813559322033897,
"grad_norm": 0.46608850359916687,
"learning_rate": 2.3075457626352504e-05,
"loss": 0.3491,
"num_input_tokens_seen": 18060792,
"step": 170,
"train_runtime": 1782.1555,
"train_tokens_per_second": 10134.24
},
{
"epoch": 2.898305084745763,
"grad_norm": 0.486213743686676,
"learning_rate": 2.2780102616926633e-05,
"loss": 0.3555,
"num_input_tokens_seen": 18161344,
"step": 171,
"train_runtime": 1792.0836,
"train_tokens_per_second": 10134.206
},
{
"epoch": 2.915254237288136,
"grad_norm": 0.5085980892181396,
"learning_rate": 2.2485059594006734e-05,
"loss": 0.3597,
"num_input_tokens_seen": 18267840,
"step": 172,
"train_runtime": 1802.1661,
"train_tokens_per_second": 10136.602
},
{
"epoch": 2.9322033898305087,
"grad_norm": 0.5648550987243652,
"learning_rate": 2.2190370023222033e-05,
"loss": 0.3601,
"num_input_tokens_seen": 18371632,
"step": 173,
"train_runtime": 1812.1336,
"train_tokens_per_second": 10138.122
},
{
"epoch": 2.9491525423728815,
"grad_norm": 0.5039841532707214,
"learning_rate": 2.189607532052727e-05,
"loss": 0.3321,
"num_input_tokens_seen": 18493104,
"step": 174,
"train_runtime": 1822.5279,
"train_tokens_per_second": 10146.953
},
{
"epoch": 2.9661016949152543,
"grad_norm": 0.45758432149887085,
"learning_rate": 2.1602216846382048e-05,
"loss": 0.2966,
"num_input_tokens_seen": 18607368,
"step": 175,
"train_runtime": 1832.8984,
"train_tokens_per_second": 10151.882
},
{
"epoch": 2.9661016949152543,
"eval_accuracy": 0.849152949938464,
"eval_loss": 0.4958828091621399,
"eval_runtime": 4.3492,
"eval_samples_per_second": 11.496,
"eval_steps_per_second": 2.989,
"num_input_tokens_seen": 18607368,
"step": 175
},
{
"epoch": 2.983050847457627,
"grad_norm": 0.4918224811553955,
"learning_rate": 2.1308835899937972e-05,
"loss": 0.3531,
"num_input_tokens_seen": 18709368,
"step": 176,
"train_runtime": 1847.2674,
"train_tokens_per_second": 10128.132
},
{
"epoch": 3.0,
"grad_norm": 0.5761289000511169,
"learning_rate": 2.1015973713234464e-05,
"loss": 0.3393,
"num_input_tokens_seen": 18815328,
"step": 177,
"train_runtime": 1857.5601,
"train_tokens_per_second": 10129.055
},
{
"epoch": 3.016949152542373,
"grad_norm": 0.43620994687080383,
"learning_rate": 2.0723671445403954e-05,
"loss": 0.2607,
"num_input_tokens_seen": 18917216,
"step": 178,
"train_runtime": 1867.504,
"train_tokens_per_second": 10129.679
},
{
"epoch": 3.0338983050847457,
"grad_norm": 0.4892238676548004,
"learning_rate": 2.0431970176887315e-05,
"loss": 0.271,
"num_input_tokens_seen": 19027776,
"step": 179,
"train_runtime": 1877.6618,
"train_tokens_per_second": 10133.761
},
{
"epoch": 3.0508474576271185,
"grad_norm": 0.47473910450935364,
"learning_rate": 2.014091090366044e-05,
"loss": 0.2439,
"num_input_tokens_seen": 19135480,
"step": 180,
"train_runtime": 1888.0587,
"train_tokens_per_second": 10135.003
},
{
"epoch": 3.0677966101694913,
"grad_norm": 0.47560861706733704,
"learning_rate": 1.9850534531472546e-05,
"loss": 0.2996,
"num_input_tokens_seen": 19243480,
"step": 181,
"train_runtime": 1898.3716,
"train_tokens_per_second": 10136.835
},
{
"epoch": 3.084745762711864,
"grad_norm": 0.47860297560691833,
"learning_rate": 1.9560881870097308e-05,
"loss": 0.2709,
"num_input_tokens_seen": 19349232,
"step": 182,
"train_runtime": 1908.3819,
"train_tokens_per_second": 10139.077
},
{
"epoch": 3.1016949152542375,
"grad_norm": 0.43783503770828247,
"learning_rate": 1.9271993627597396e-05,
"loss": 0.2506,
"num_input_tokens_seen": 19468240,
"step": 183,
"train_runtime": 1918.7532,
"train_tokens_per_second": 10146.297
},
{
"epoch": 3.1186440677966103,
"grad_norm": 0.4044387936592102,
"learning_rate": 1.8983910404603296e-05,
"loss": 0.1991,
"num_input_tokens_seen": 19580696,
"step": 184,
"train_runtime": 1929.134,
"train_tokens_per_second": 10149.993
},
{
"epoch": 3.135593220338983,
"grad_norm": 0.5283013582229614,
"learning_rate": 1.8696672688607293e-05,
"loss": 0.2483,
"num_input_tokens_seen": 19685800,
"step": 185,
"train_runtime": 1939.0969,
"train_tokens_per_second": 10152.046
},
{
"epoch": 3.152542372881356,
"grad_norm": 0.5997490286827087,
"learning_rate": 1.8410320848273315e-05,
"loss": 0.2796,
"num_input_tokens_seen": 19787360,
"step": 186,
"train_runtime": 1949.3992,
"train_tokens_per_second": 10150.491
},
{
"epoch": 3.169491525423729,
"grad_norm": 0.5210835933685303,
"learning_rate": 1.8124895127763458e-05,
"loss": 0.2604,
"num_input_tokens_seen": 19887912,
"step": 187,
"train_runtime": 1959.6817,
"train_tokens_per_second": 10148.542
},
{
"epoch": 3.1864406779661016,
"grad_norm": 0.46026745438575745,
"learning_rate": 1.7840435641082072e-05,
"loss": 0.2759,
"num_input_tokens_seen": 19978168,
"step": 188,
"train_runtime": 1969.631,
"train_tokens_per_second": 10143.102
},
{
"epoch": 3.2033898305084745,
"grad_norm": 0.4410321116447449,
"learning_rate": 1.7556982366438053e-05,
"loss": 0.2912,
"num_input_tokens_seen": 20090288,
"step": 189,
"train_runtime": 1980.0875,
"train_tokens_per_second": 10146.162
},
{
"epoch": 3.2203389830508473,
"grad_norm": 0.4692417085170746,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.2852,
"num_input_tokens_seen": 20188176,
"step": 190,
"train_runtime": 1990.314,
"train_tokens_per_second": 10143.211
},
{
"epoch": 3.23728813559322,
"grad_norm": 0.5449389815330505,
"learning_rate": 1.6993253653429063e-05,
"loss": 0.2625,
"num_input_tokens_seen": 20294944,
"step": 191,
"train_runtime": 2000.5661,
"train_tokens_per_second": 10144.6
},
{
"epoch": 3.2542372881355934,
"grad_norm": 0.524983823299408,
"learning_rate": 1.6713057442037743e-05,
"loss": 0.2443,
"num_input_tokens_seen": 20393248,
"step": 192,
"train_runtime": 2010.7028,
"train_tokens_per_second": 10142.348
},
{
"epoch": 3.2711864406779663,
"grad_norm": 0.3914013206958771,
"learning_rate": 1.6434025885496467e-05,
"loss": 0.2252,
"num_input_tokens_seen": 20495136,
"step": 193,
"train_runtime": 2021.0229,
"train_tokens_per_second": 10140.972
},
{
"epoch": 3.288135593220339,
"grad_norm": 0.6633609533309937,
"learning_rate": 1.6156198199167655e-05,
"loss": 0.3712,
"num_input_tokens_seen": 20590656,
"step": 194,
"train_runtime": 2031.1673,
"train_tokens_per_second": 10137.351
},
{
"epoch": 3.305084745762712,
"grad_norm": 0.4628910422325134,
"learning_rate": 1.5879613429220626e-05,
"loss": 0.2243,
"num_input_tokens_seen": 20701792,
"step": 195,
"train_runtime": 2041.5305,
"train_tokens_per_second": 10140.33
},
{
"epoch": 3.3220338983050848,
"grad_norm": 0.5391764044761658,
"learning_rate": 1.560431044714405e-05,
"loss": 0.2873,
"num_input_tokens_seen": 20806728,
"step": 196,
"train_runtime": 2051.8377,
"train_tokens_per_second": 10140.533
},
{
"epoch": 3.3389830508474576,
"grad_norm": 0.46970924735069275,
"learning_rate": 1.5330327944282913e-05,
"loss": 0.259,
"num_input_tokens_seen": 20909128,
"step": 197,
"train_runtime": 2062.1357,
"train_tokens_per_second": 10139.55
},
{
"epoch": 3.3559322033898304,
"grad_norm": 0.47797513008117676,
"learning_rate": 1.5057704426400767e-05,
"loss": 0.2636,
"num_input_tokens_seen": 21021888,
"step": 198,
"train_runtime": 2072.3947,
"train_tokens_per_second": 10143.767
},
{
"epoch": 3.3728813559322033,
"grad_norm": 0.485984206199646,
"learning_rate": 1.4786478208268134e-05,
"loss": 0.2666,
"num_input_tokens_seen": 21127504,
"step": 199,
"train_runtime": 2082.6099,
"train_tokens_per_second": 10144.725
},
{
"epoch": 3.389830508474576,
"grad_norm": 0.5332046747207642,
"learning_rate": 1.4516687408277669e-05,
"loss": 0.2524,
"num_input_tokens_seen": 21230584,
"step": 200,
"train_runtime": 2092.7929,
"train_tokens_per_second": 10144.618
},
{
"epoch": 3.389830508474576,
"eval_accuracy": 0.8512740325455173,
"eval_loss": 0.4950821101665497,
"eval_runtime": 4.3457,
"eval_samples_per_second": 11.506,
"eval_steps_per_second": 2.991,
"num_input_tokens_seen": 21230584,
"step": 200
},
{
"epoch": 3.406779661016949,
"grad_norm": 0.5226387977600098,
"learning_rate": 1.4248369943086998e-05,
"loss": 0.2496,
"num_input_tokens_seen": 21344472,
"step": 201,
"train_runtime": 2107.5277,
"train_tokens_per_second": 10127.73
},
{
"epoch": 3.423728813559322,
"grad_norm": 0.5772292613983154,
"learning_rate": 1.3981563522289848e-05,
"loss": 0.3348,
"num_input_tokens_seen": 21449200,
"step": 202,
"train_runtime": 2117.6581,
"train_tokens_per_second": 10128.736
},
{
"epoch": 3.440677966101695,
"grad_norm": 0.47435376048088074,
"learning_rate": 1.3716305643116345e-05,
"loss": 0.242,
"num_input_tokens_seen": 21543072,
"step": 203,
"train_runtime": 2127.3103,
"train_tokens_per_second": 10126.906
},
{
"epoch": 3.457627118644068,
"grad_norm": 0.5281843543052673,
"learning_rate": 1.3452633585163072e-05,
"loss": 0.2973,
"num_input_tokens_seen": 21656624,
"step": 204,
"train_runtime": 2137.3742,
"train_tokens_per_second": 10132.35
},
{
"epoch": 3.4745762711864407,
"grad_norm": 0.4969395697116852,
"learning_rate": 1.3190584405153767e-05,
"loss": 0.2397,
"num_input_tokens_seen": 21771480,
"step": 205,
"train_runtime": 2147.7555,
"train_tokens_per_second": 10136.852
},
{
"epoch": 3.4915254237288136,
"grad_norm": 0.43185102939605713,
"learning_rate": 1.2930194931731382e-05,
"loss": 0.2163,
"num_input_tokens_seen": 21884760,
"step": 206,
"train_runtime": 2158.1144,
"train_tokens_per_second": 10140.686
},
{
"epoch": 3.5084745762711864,
"grad_norm": 0.6581583023071289,
"learning_rate": 1.2671501760282079e-05,
"loss": 0.3422,
"num_input_tokens_seen": 21991712,
"step": 207,
"train_runtime": 2168.313,
"train_tokens_per_second": 10142.314
},
{
"epoch": 3.5254237288135593,
"grad_norm": 0.5069866180419922,
"learning_rate": 1.2414541247792121e-05,
"loss": 0.2829,
"num_input_tokens_seen": 22104552,
"step": 208,
"train_runtime": 2178.6146,
"train_tokens_per_second": 10146.151
},
{
"epoch": 3.542372881355932,
"grad_norm": 0.46896129846572876,
"learning_rate": 1.2159349507738247e-05,
"loss": 0.2411,
"num_input_tokens_seen": 22209288,
"step": 209,
"train_runtime": 2188.8257,
"train_tokens_per_second": 10146.668
},
{
"epoch": 3.559322033898305,
"grad_norm": 0.4443514943122864,
"learning_rate": 1.1905962405012192e-05,
"loss": 0.2872,
"num_input_tokens_seen": 22307624,
"step": 210,
"train_runtime": 2198.6992,
"train_tokens_per_second": 10145.828
},
{
"epoch": 3.576271186440678,
"grad_norm": 0.6187303066253662,
"learning_rate": 1.1654415550880243e-05,
"loss": 0.3551,
"num_input_tokens_seen": 22407656,
"step": 211,
"train_runtime": 2208.9033,
"train_tokens_per_second": 10144.245
},
{
"epoch": 3.593220338983051,
"grad_norm": 0.5176218152046204,
"learning_rate": 1.1404744297978373e-05,
"loss": 0.2102,
"num_input_tokens_seen": 22516640,
"step": 212,
"train_runtime": 2219.0952,
"train_tokens_per_second": 10146.766
},
{
"epoch": 3.610169491525424,
"grad_norm": 0.5846608281135559,
"learning_rate": 1.1156983735343796e-05,
"loss": 0.2977,
"num_input_tokens_seen": 22620992,
"step": 213,
"train_runtime": 2229.4499,
"train_tokens_per_second": 10146.445
},
{
"epoch": 3.6271186440677967,
"grad_norm": 0.5257160663604736,
"learning_rate": 1.0911168683483449e-05,
"loss": 0.2581,
"num_input_tokens_seen": 22736624,
"step": 214,
"train_runtime": 2239.7969,
"train_tokens_per_second": 10151.199
},
{
"epoch": 3.6440677966101696,
"grad_norm": 0.419572651386261,
"learning_rate": 1.0667333689480322e-05,
"loss": 0.2166,
"num_input_tokens_seen": 22855144,
"step": 215,
"train_runtime": 2250.1683,
"train_tokens_per_second": 10157.082
},
{
"epoch": 3.6610169491525424,
"grad_norm": 0.4591136574745178,
"learning_rate": 1.0425513022138203e-05,
"loss": 0.2322,
"num_input_tokens_seen": 22965704,
"step": 216,
"train_runtime": 2260.5309,
"train_tokens_per_second": 10159.429
},
{
"epoch": 3.6779661016949152,
"grad_norm": 0.407421737909317,
"learning_rate": 1.0185740667165456e-05,
"loss": 0.301,
"num_input_tokens_seen": 23070056,
"step": 217,
"train_runtime": 2270.4807,
"train_tokens_per_second": 10160.868
},
{
"epoch": 3.694915254237288,
"grad_norm": 0.42050784826278687,
"learning_rate": 9.948050322398658e-06,
"loss": 0.2224,
"num_input_tokens_seen": 23180184,
"step": 218,
"train_runtime": 2280.8847,
"train_tokens_per_second": 10162.804
},
{
"epoch": 3.711864406779661,
"grad_norm": 0.49007946252822876,
"learning_rate": 9.712475393066705e-06,
"loss": 0.3068,
"num_input_tokens_seen": 23285216,
"step": 219,
"train_runtime": 2291.106,
"train_tokens_per_second": 10163.308
},
{
"epoch": 3.7288135593220337,
"grad_norm": 0.41998228430747986,
"learning_rate": 9.479048987095954e-06,
"loss": 0.2098,
"num_input_tokens_seen": 23393240,
"step": 220,
"train_runtime": 2301.46,
"train_tokens_per_second": 10164.522
},
{
"epoch": 3.7457627118644066,
"grad_norm": 0.5238583087921143,
"learning_rate": 9.247803910457226e-06,
"loss": 0.2637,
"num_input_tokens_seen": 23505224,
"step": 221,
"train_runtime": 2311.8168,
"train_tokens_per_second": 10167.425
},
{
"epoch": 3.7627118644067794,
"grad_norm": 0.4628532826900482,
"learning_rate": 9.018772662555252e-06,
"loss": 0.2402,
"num_input_tokens_seen": 23602096,
"step": 222,
"train_runtime": 2322.0686,
"train_tokens_per_second": 10164.254
},
{
"epoch": 3.7796610169491527,
"grad_norm": 0.42983704805374146,
"learning_rate": 8.791987431661137e-06,
"loss": 0.232,
"num_input_tokens_seen": 23697608,
"step": 223,
"train_runtime": 2331.884,
"train_tokens_per_second": 10162.43
},
{
"epoch": 3.7966101694915255,
"grad_norm": 0.5031875967979431,
"learning_rate": 8.567480090388586e-06,
"loss": 0.298,
"num_input_tokens_seen": 23795304,
"step": 224,
"train_runtime": 2342.077,
"train_tokens_per_second": 10159.915
},
{
"epoch": 3.8135593220338984,
"grad_norm": 0.5014523863792419,
"learning_rate": 8.34528219121455e-06,
"loss": 0.2689,
"num_input_tokens_seen": 23905280,
"step": 225,
"train_runtime": 2352.3335,
"train_tokens_per_second": 10162.368
},
{
"epoch": 3.8135593220338984,
"eval_accuracy": 0.8556592515560445,
"eval_loss": 0.4844910502433777,
"eval_runtime": 4.3302,
"eval_samples_per_second": 11.547,
"eval_steps_per_second": 3.002,
"num_input_tokens_seen": 23905280,
"step": 225
},
{
"epoch": 3.830508474576271,
"grad_norm": 0.46191343665122986,
"learning_rate": 8.125424962044742e-06,
"loss": 0.2417,
"num_input_tokens_seen": 24015504,
"step": 226,
"train_runtime": 2367.013,
"train_tokens_per_second": 10145.911
},
{
"epoch": 3.847457627118644,
"grad_norm": 0.5780752897262573,
"learning_rate": 7.907939301824884e-06,
"loss": 0.2673,
"num_input_tokens_seen": 24128928,
"step": 227,
"train_runtime": 2377.1018,
"train_tokens_per_second": 10150.566
},
{
"epoch": 3.864406779661017,
"grad_norm": 0.5428578853607178,
"learning_rate": 7.692855776198114e-06,
"loss": 0.2541,
"num_input_tokens_seen": 24232712,
"step": 228,
"train_runtime": 2387.4126,
"train_tokens_per_second": 10150.199
},
{
"epoch": 3.8813559322033897,
"grad_norm": 0.48555266857147217,
"learning_rate": 7.480204613209288e-06,
"loss": 0.2341,
"num_input_tokens_seen": 24337744,
"step": 229,
"train_runtime": 2397.6403,
"train_tokens_per_second": 10150.707
},
{
"epoch": 3.898305084745763,
"grad_norm": 0.46837303042411804,
"learning_rate": 7.2700156990566675e-06,
"loss": 0.2309,
"num_input_tokens_seen": 24446736,
"step": 230,
"train_runtime": 2408.0187,
"train_tokens_per_second": 10152.22
},
{
"epoch": 3.915254237288136,
"grad_norm": 0.5462357997894287,
"learning_rate": 7.062318573891716e-06,
"loss": 0.2718,
"num_input_tokens_seen": 24562728,
"step": 231,
"train_runtime": 2418.4129,
"train_tokens_per_second": 10156.548
},
{
"epoch": 3.9322033898305087,
"grad_norm": 0.5173876285552979,
"learning_rate": 6.85714242766749e-06,
"loss": 0.2529,
"num_input_tokens_seen": 24669264,
"step": 232,
"train_runtime": 2428.6573,
"train_tokens_per_second": 10157.573
},
{
"epoch": 3.9491525423728815,
"grad_norm": 0.5115875601768494,
"learning_rate": 6.654516096036231e-06,
"loss": 0.2899,
"num_input_tokens_seen": 24774256,
"step": 233,
"train_runtime": 2440.0145,
"train_tokens_per_second": 10153.323
},
{
"epoch": 3.9661016949152543,
"grad_norm": 0.4781138002872467,
"learning_rate": 6.4544680562968e-06,
"loss": 0.3025,
"num_input_tokens_seen": 24882752,
"step": 234,
"train_runtime": 2451.4184,
"train_tokens_per_second": 10150.349
},
{
"epoch": 3.983050847457627,
"grad_norm": 0.46944358944892883,
"learning_rate": 6.25702642339244e-06,
"loss": 0.2283,
"num_input_tokens_seen": 24984864,
"step": 235,
"train_runtime": 2462.74,
"train_tokens_per_second": 10145.149
},
{
"epoch": 4.0,
"grad_norm": 0.5003960728645325,
"learning_rate": 6.062218945959497e-06,
"loss": 0.2404,
"num_input_tokens_seen": 25087648,
"step": 236,
"train_runtime": 2474.0239,
"train_tokens_per_second": 10140.423
},
{
"epoch": 4.016949152542373,
"grad_norm": 0.4178627133369446,
"learning_rate": 5.87007300242757e-06,
"loss": 0.1724,
"num_input_tokens_seen": 25195544,
"step": 237,
"train_runtime": 2485.4081,
"train_tokens_per_second": 10137.387
},
{
"epoch": 4.033898305084746,
"grad_norm": 0.45459651947021484,
"learning_rate": 5.680615597171718e-06,
"loss": 0.1724,
"num_input_tokens_seen": 25298896,
"step": 238,
"train_runtime": 2496.7382,
"train_tokens_per_second": 10132.779
},
{
"epoch": 4.0508474576271185,
"grad_norm": 0.44449377059936523,
"learning_rate": 5.493873356717288e-06,
"loss": 0.2116,
"num_input_tokens_seen": 25400872,
"step": 239,
"train_runtime": 2508.1036,
"train_tokens_per_second": 10127.521
},
{
"epoch": 4.067796610169491,
"grad_norm": 0.4421490728855133,
"learning_rate": 5.309872525997736e-06,
"loss": 0.1622,
"num_input_tokens_seen": 25512440,
"step": 240,
"train_runtime": 2519.2603,
"train_tokens_per_second": 10126.957
},
{
"epoch": 4.084745762711864,
"grad_norm": 0.40225303173065186,
"learning_rate": 5.128638964666166e-06,
"loss": 0.1675,
"num_input_tokens_seen": 25630856,
"step": 241,
"train_runtime": 2530.7189,
"train_tokens_per_second": 10127.895
},
{
"epoch": 4.101694915254237,
"grad_norm": 0.4766783118247986,
"learning_rate": 4.950198143461013e-06,
"loss": 0.177,
"num_input_tokens_seen": 25735072,
"step": 242,
"train_runtime": 2542.0428,
"train_tokens_per_second": 10123.776
},
{
"epoch": 4.11864406779661,
"grad_norm": 0.40857142210006714,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.1777,
"num_input_tokens_seen": 25828728,
"step": 243,
"train_runtime": 2552.7154,
"train_tokens_per_second": 10118.139
},
{
"epoch": 4.135593220338983,
"grad_norm": 0.5124621987342834,
"learning_rate": 4.601794638387219e-06,
"loss": 0.2315,
"num_input_tokens_seen": 25934056,
"step": 244,
"train_runtime": 2563.8172,
"train_tokens_per_second": 10115.408
},
{
"epoch": 4.1525423728813555,
"grad_norm": 0.4303904175758362,
"learning_rate": 4.43188091948113e-06,
"loss": 0.1943,
"num_input_tokens_seen": 26045936,
"step": 245,
"train_runtime": 2575.1723,
"train_tokens_per_second": 10114.25
},
{
"epoch": 4.169491525423728,
"grad_norm": 0.5164250135421753,
"learning_rate": 4.264857863744956e-06,
"loss": 0.2207,
"num_input_tokens_seen": 26149848,
"step": 246,
"train_runtime": 2586.5085,
"train_tokens_per_second": 10110.096
},
{
"epoch": 4.186440677966102,
"grad_norm": 0.4310712516307831,
"learning_rate": 4.1007489447590365e-06,
"loss": 0.1831,
"num_input_tokens_seen": 26254680,
"step": 247,
"train_runtime": 2597.5682,
"train_tokens_per_second": 10107.407
},
{
"epoch": 4.203389830508475,
"grad_norm": 0.5040118098258972,
"learning_rate": 3.939577226548152e-06,
"loss": 0.2043,
"num_input_tokens_seen": 26355904,
"step": 248,
"train_runtime": 2608.9171,
"train_tokens_per_second": 10102.239
},
{
"epoch": 4.220338983050848,
"grad_norm": 0.3849621117115021,
"learning_rate": 3.781365360340056e-06,
"loss": 0.1638,
"num_input_tokens_seen": 26449120,
"step": 249,
"train_runtime": 2620.1674,
"train_tokens_per_second": 10094.439
},
{
"epoch": 4.237288135593221,
"grad_norm": 0.4804217517375946,
"learning_rate": 3.6261355813820645e-06,
"loss": 0.1457,
"num_input_tokens_seen": 26551272,
"step": 250,
"train_runtime": 2631.5568,
"train_tokens_per_second": 10089.568
},
{
"epoch": 4.237288135593221,
"eval_accuracy": 0.856542517040091,
"eval_loss": 0.48583686351776123,
"eval_runtime": 4.3487,
"eval_samples_per_second": 11.498,
"eval_steps_per_second": 2.989,
"num_input_tokens_seen": 26551272,
"step": 250
},
{
"epoch": 4.254237288135593,
"grad_norm": 0.4448375403881073,
"learning_rate": 3.4739097058161114e-06,
"loss": 0.213,
"num_input_tokens_seen": 26656864,
"step": 251,
"train_runtime": 2647.2819,
"train_tokens_per_second": 10069.522
},
{
"epoch": 4.271186440677966,
"grad_norm": 0.40976589918136597,
"learning_rate": 3.324709127612649e-06,
"loss": 0.1551,
"num_input_tokens_seen": 26761368,
"step": 252,
"train_runtime": 2658.5975,
"train_tokens_per_second": 10065.972
},
{
"epoch": 4.288135593220339,
"grad_norm": 0.5147706270217896,
"learning_rate": 3.1785548155639444e-06,
"loss": 0.2033,
"num_input_tokens_seen": 26860000,
"step": 253,
"train_runtime": 2669.9079,
"train_tokens_per_second": 10060.272
},
{
"epoch": 4.305084745762712,
"grad_norm": 0.4196415841579437,
"learning_rate": 3.035467310337095e-06,
"loss": 0.1509,
"num_input_tokens_seen": 26971656,
"step": 254,
"train_runtime": 2681.2328,
"train_tokens_per_second": 10059.423
},
{
"epoch": 4.322033898305085,
"grad_norm": 0.6118748784065247,
"learning_rate": 2.895466721587245e-06,
"loss": 0.1798,
"num_input_tokens_seen": 27072048,
"step": 255,
"train_runtime": 2692.5604,
"train_tokens_per_second": 10054.389
},
{
"epoch": 4.338983050847458,
"grad_norm": 0.49149730801582336,
"learning_rate": 2.75857272513132e-06,
"loss": 0.1817,
"num_input_tokens_seen": 27181384,
"step": 256,
"train_runtime": 2704.0171,
"train_tokens_per_second": 10052.224
},
{
"epoch": 4.3559322033898304,
"grad_norm": 0.5347187519073486,
"learning_rate": 2.624804560182789e-06,
"loss": 0.2302,
"num_input_tokens_seen": 27283960,
"step": 257,
"train_runtime": 2715.419,
"train_tokens_per_second": 10047.79
},
{
"epoch": 4.372881355932203,
"grad_norm": 0.4388182759284973,
"learning_rate": 2.494181026647782e-06,
"loss": 0.1795,
"num_input_tokens_seen": 27387568,
"step": 258,
"train_runtime": 2726.8004,
"train_tokens_per_second": 10043.848
},
{
"epoch": 4.389830508474576,
"grad_norm": 0.4659542441368103,
"learning_rate": 2.3667204824828953e-06,
"loss": 0.158,
"num_input_tokens_seen": 27495760,
"step": 259,
"train_runtime": 2738.2116,
"train_tokens_per_second": 10041.503
},
{
"epoch": 4.406779661016949,
"grad_norm": 0.3610907196998596,
"learning_rate": 2.2424408411151704e-06,
"loss": 0.1672,
"num_input_tokens_seen": 27602880,
"step": 260,
"train_runtime": 2749.6201,
"train_tokens_per_second": 10038.798
},
{
"epoch": 4.423728813559322,
"grad_norm": 0.46502378582954407,
"learning_rate": 2.1213595689245386e-06,
"loss": 0.2301,
"num_input_tokens_seen": 27704640,
"step": 261,
"train_runtime": 2760.4128,
"train_tokens_per_second": 10036.412
},
{
"epoch": 4.440677966101695,
"grad_norm": 0.3998453617095947,
"learning_rate": 2.00349368278904e-06,
"loss": 0.1928,
"num_input_tokens_seen": 27812152,
"step": 262,
"train_runtime": 2771.7382,
"train_tokens_per_second": 10034.191
},
{
"epoch": 4.4576271186440675,
"grad_norm": 0.4807196259498596,
"learning_rate": 1.8888597476932834e-06,
"loss": 0.218,
"num_input_tokens_seen": 27916960,
"step": 263,
"train_runtime": 2783.07,
"train_tokens_per_second": 10030.994
},
{
"epoch": 4.47457627118644,
"grad_norm": 0.4619472324848175,
"learning_rate": 1.7774738744003927e-06,
"loss": 0.1459,
"num_input_tokens_seen": 28032376,
"step": 264,
"train_runtime": 2794.6692,
"train_tokens_per_second": 10030.66
},
{
"epoch": 4.491525423728813,
"grad_norm": 0.5940751433372498,
"learning_rate": 1.6693517171877533e-06,
"loss": 0.1564,
"num_input_tokens_seen": 28132512,
"step": 265,
"train_runtime": 2805.8149,
"train_tokens_per_second": 10026.503
},
{
"epoch": 4.508474576271187,
"grad_norm": 0.41254979372024536,
"learning_rate": 1.5645084716469777e-06,
"loss": 0.134,
"num_input_tokens_seen": 28234672,
"step": 266,
"train_runtime": 2816.9381,
"train_tokens_per_second": 10023.178
},
{
"epoch": 4.52542372881356,
"grad_norm": 0.4586114287376404,
"learning_rate": 1.4629588725482841e-06,
"loss": 0.1599,
"num_input_tokens_seen": 28338776,
"step": 267,
"train_runtime": 2828.0299,
"train_tokens_per_second": 10020.678
},
{
"epoch": 4.5423728813559325,
"grad_norm": 0.4491695761680603,
"learning_rate": 1.3647171917696684e-06,
"loss": 0.1864,
"num_input_tokens_seen": 28458896,
"step": 268,
"train_runtime": 2839.5002,
"train_tokens_per_second": 10022.502
},
{
"epoch": 4.559322033898305,
"grad_norm": 0.5336939096450806,
"learning_rate": 1.2697972362911064e-06,
"loss": 0.2124,
"num_input_tokens_seen": 28570072,
"step": 269,
"train_runtime": 2850.906,
"train_tokens_per_second": 10021.401
},
{
"epoch": 4.576271186440678,
"grad_norm": 0.4559793770313263,
"learning_rate": 1.1782123462541178e-06,
"loss": 0.1485,
"num_input_tokens_seen": 28674856,
"step": 270,
"train_runtime": 2862.1961,
"train_tokens_per_second": 10018.48
},
{
"epoch": 4.593220338983051,
"grad_norm": 0.46341943740844727,
"learning_rate": 1.0899753930869394e-06,
"loss": 0.2151,
"num_input_tokens_seen": 28779768,
"step": 271,
"train_runtime": 2873.4101,
"train_tokens_per_second": 10015.893
},
{
"epoch": 4.610169491525424,
"grad_norm": 0.5154189467430115,
"learning_rate": 1.00509877769554e-06,
"loss": 0.2067,
"num_input_tokens_seen": 28885568,
"step": 272,
"train_runtime": 2884.7694,
"train_tokens_per_second": 10013.129
},
{
"epoch": 4.627118644067797,
"grad_norm": 0.38729819655418396,
"learning_rate": 9.235944287207976e-07,
"loss": 0.1291,
"num_input_tokens_seen": 29000280,
"step": 273,
"train_runtime": 2896.1682,
"train_tokens_per_second": 10013.327
},
{
"epoch": 4.6440677966101696,
"grad_norm": 0.43197062611579895,
"learning_rate": 8.454738008620456e-07,
"loss": 0.2032,
"num_input_tokens_seen": 29100816,
"step": 274,
"train_runtime": 2907.2873,
"train_tokens_per_second": 10009.611
},
{
"epoch": 4.661016949152542,
"grad_norm": 0.46766197681427,
"learning_rate": 7.707478732671941e-07,
"loss": 0.1878,
"num_input_tokens_seen": 29196936,
"step": 275,
"train_runtime": 2918.59,
"train_tokens_per_second": 10003.781
},
{
"epoch": 4.661016949152542,
"eval_accuracy": 0.8570156034618972,
"eval_loss": 0.48511388897895813,
"eval_runtime": 4.3472,
"eval_samples_per_second": 11.502,
"eval_steps_per_second": 2.99,
"num_input_tokens_seen": 29196936,
"step": 275
},
{
"epoch": 4.677966101694915,
"grad_norm": 0.44221359491348267,
"learning_rate": 6.994271479897314e-07,
"loss": 0.2355,
"num_input_tokens_seen": 29310880,
"step": 276,
"train_runtime": 2934.3811,
"train_tokens_per_second": 9988.777
},
{
"epoch": 4.694915254237288,
"grad_norm": 0.4733332395553589,
"learning_rate": 6.315216485127506e-07,
"loss": 0.1741,
"num_input_tokens_seen": 29421104,
"step": 277,
"train_runtime": 2945.4888,
"train_tokens_per_second": 9988.53
},
{
"epoch": 4.711864406779661,
"grad_norm": 0.49647605419158936,
"learning_rate": 5.670409183402364e-07,
"loss": 0.2569,
"num_input_tokens_seen": 29524408,
"step": 278,
"train_runtime": 2956.8376,
"train_tokens_per_second": 9985.13
},
{
"epoch": 4.728813559322034,
"grad_norm": 0.4930429756641388,
"learning_rate": 5.059940196558088e-07,
"loss": 0.2181,
"num_input_tokens_seen": 29624776,
"step": 279,
"train_runtime": 2967.8288,
"train_tokens_per_second": 9981.969
},
{
"epoch": 4.745762711864407,
"grad_norm": 0.5534479022026062,
"learning_rate": 4.4838953204912326e-07,
"loss": 0.2237,
"num_input_tokens_seen": 29728640,
"step": 280,
"train_runtime": 2979.2153,
"train_tokens_per_second": 9978.681
},
{
"epoch": 4.762711864406779,
"grad_norm": 0.4766373634338379,
"learning_rate": 3.9423555131007925e-07,
"loss": 0.2027,
"num_input_tokens_seen": 29832320,
"step": 281,
"train_runtime": 2990.5952,
"train_tokens_per_second": 9975.379
},
{
"epoch": 4.779661016949152,
"grad_norm": 0.40961286425590515,
"learning_rate": 3.435396882910391e-07,
"loss": 0.1866,
"num_input_tokens_seen": 29938136,
"step": 282,
"train_runtime": 3001.9098,
"train_tokens_per_second": 9973.03
},
{
"epoch": 4.796610169491525,
"grad_norm": 0.449975848197937,
"learning_rate": 2.963090678371805e-07,
"loss": 0.1648,
"num_input_tokens_seen": 30038384,
"step": 283,
"train_runtime": 3013.1073,
"train_tokens_per_second": 9969.238
},
{
"epoch": 4.813559322033898,
"grad_norm": 0.4254641830921173,
"learning_rate": 2.5255032778517264e-07,
"loss": 0.1827,
"num_input_tokens_seen": 30155184,
"step": 284,
"train_runtime": 3024.58,
"train_tokens_per_second": 9970.04
},
{
"epoch": 4.830508474576272,
"grad_norm": 0.3542494475841522,
"learning_rate": 2.1226961803028632e-07,
"loss": 0.194,
"num_input_tokens_seen": 30258792,
"step": 285,
"train_runtime": 3035.7174,
"train_tokens_per_second": 9967.592
},
{
"epoch": 4.847457627118644,
"grad_norm": 0.51301109790802,
"learning_rate": 1.7547259966207708e-07,
"loss": 0.197,
"num_input_tokens_seen": 30360016,
"step": 286,
"train_runtime": 3046.9789,
"train_tokens_per_second": 9963.973
},
{
"epoch": 4.864406779661017,
"grad_norm": 0.3893289566040039,
"learning_rate": 1.4216444416877695e-07,
"loss": 0.1479,
"num_input_tokens_seen": 30457880,
"step": 287,
"train_runtime": 3058.3141,
"train_tokens_per_second": 9959.042
},
{
"epoch": 4.88135593220339,
"grad_norm": 0.44192439317703247,
"learning_rate": 1.1234983271048161e-07,
"loss": 0.1922,
"num_input_tokens_seen": 30561968,
"step": 288,
"train_runtime": 3069.6644,
"train_tokens_per_second": 9956.127
},
{
"epoch": 4.898305084745763,
"grad_norm": 0.46849843859672546,
"learning_rate": 8.603295546126821e-08,
"loss": 0.2168,
"num_input_tokens_seen": 30666960,
"step": 289,
"train_runtime": 3080.873,
"train_tokens_per_second": 9953.984
},
{
"epoch": 4.915254237288136,
"grad_norm": 0.4828685522079468,
"learning_rate": 6.321751102028595e-08,
"loss": 0.1848,
"num_input_tokens_seen": 30789448,
"step": 290,
"train_runtime": 3092.3693,
"train_tokens_per_second": 9956.588
},
{
"epoch": 4.932203389830509,
"grad_norm": 0.5445053577423096,
"learning_rate": 4.390670589196622e-08,
"loss": 0.1965,
"num_input_tokens_seen": 30910944,
"step": 291,
"train_runtime": 3103.817,
"train_tokens_per_second": 9959.01
},
{
"epoch": 4.9491525423728815,
"grad_norm": 0.5450407862663269,
"learning_rate": 2.8103254035369285e-08,
"loss": 0.2339,
"num_input_tokens_seen": 31025152,
"step": 292,
"train_runtime": 3115.2864,
"train_tokens_per_second": 9959.005
},
{
"epoch": 4.966101694915254,
"grad_norm": 0.48578107357025146,
"learning_rate": 1.5809376482767147e-08,
"loss": 0.1937,
"num_input_tokens_seen": 31135592,
"step": 293,
"train_runtime": 3126.6738,
"train_tokens_per_second": 9958.056
},
{
"epoch": 4.983050847457627,
"grad_norm": 0.5169346332550049,
"learning_rate": 7.0268010274959775e-09,
"loss": 0.1793,
"num_input_tokens_seen": 31247744,
"step": 294,
"train_runtime": 3138.117,
"train_tokens_per_second": 9957.482
},
{
"epoch": 5.0,
"grad_norm": 0.4611862003803253,
"learning_rate": 1.7567619811281744e-09,
"loss": 0.1822,
"num_input_tokens_seen": 31362728,
"step": 295,
"train_runtime": 3149.5513,
"train_tokens_per_second": 9957.84
},
{
"epoch": 5.0,
"num_input_tokens_seen": 31362728,
"step": 295,
"total_flos": 1.4241950524474655e+18,
"train_loss": 0.4095978515633082,
"train_runtime": 3265.849,
"train_samples_per_second": 1.442,
"train_steps_per_second": 0.09
}
],
"logging_steps": 1,
"max_steps": 295,
"num_input_tokens_seen": 31362728,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4241950524474655e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}