{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 98.9795918367347, "eval_steps": 500, "global_step": 29100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0204081632653061, "grad_norm": 1.544520378112793, "learning_rate": 1.0170068027210885e-05, "loss": 9.6915, "step": 300 }, { "epoch": 2.0408163265306123, "grad_norm": 1.6265058517456055, "learning_rate": 2.0374149659863947e-05, "loss": 8.3327, "step": 600 }, { "epoch": 3.061224489795918, "grad_norm": 2.0470693111419678, "learning_rate": 3.0578231292517004e-05, "loss": 7.7972, "step": 900 }, { "epoch": 4.081632653061225, "grad_norm": 1.7373660802841187, "learning_rate": 4.078231292517007e-05, "loss": 7.508, "step": 1200 }, { "epoch": 5.1020408163265305, "grad_norm": 2.189188241958618, "learning_rate": 5.0986394557823136e-05, "loss": 7.2126, "step": 1500 }, { "epoch": 6.122448979591836, "grad_norm": 2.1524977684020996, "learning_rate": 6.11904761904762e-05, "loss": 6.9194, "step": 1800 }, { "epoch": 7.142857142857143, "grad_norm": 2.03401517868042, "learning_rate": 7.139455782312926e-05, "loss": 6.6392, "step": 2100 }, { "epoch": 8.16326530612245, "grad_norm": 2.0746707916259766, "learning_rate": 8.159863945578233e-05, "loss": 6.3837, "step": 2400 }, { "epoch": 9.183673469387756, "grad_norm": 2.1161186695098877, "learning_rate": 9.180272108843538e-05, "loss": 6.1311, "step": 2700 }, { "epoch": 10.204081632653061, "grad_norm": 2.1764469146728516, "learning_rate": 9.977702191987907e-05, "loss": 5.8614, "step": 3000 }, { "epoch": 11.224489795918368, "grad_norm": 2.414827823638916, "learning_rate": 9.86432350718065e-05, "loss": 5.5985, "step": 3300 }, { "epoch": 12.244897959183673, "grad_norm": 2.5662031173706055, "learning_rate": 9.750944822373394e-05, "loss": 5.3029, "step": 3600 }, { "epoch": 13.26530612244898, "grad_norm": 2.821119785308838, "learning_rate": 9.637566137566139e-05, "loss": 5.0116, "step": 3900 }, { "epoch": 14.285714285714286, "grad_norm": 3.0048000812530518, "learning_rate": 9.524187452758882e-05, "loss": 4.7344, "step": 4200 }, { "epoch": 15.306122448979592, "grad_norm": 3.242598295211792, "learning_rate": 9.410808767951625e-05, "loss": 4.4621, "step": 4500 }, { "epoch": 16.3265306122449, "grad_norm": 3.316965103149414, "learning_rate": 9.29743008314437e-05, "loss": 4.2054, "step": 4800 }, { "epoch": 17.346938775510203, "grad_norm": 3.7419989109039307, "learning_rate": 9.184051398337114e-05, "loss": 3.9517, "step": 5100 }, { "epoch": 18.367346938775512, "grad_norm": 3.7479987144470215, "learning_rate": 9.070672713529857e-05, "loss": 3.7011, "step": 5400 }, { "epoch": 19.387755102040817, "grad_norm": 3.7604148387908936, "learning_rate": 8.9572940287226e-05, "loss": 3.4686, "step": 5700 }, { "epoch": 20.408163265306122, "grad_norm": 4.052367210388184, "learning_rate": 8.843915343915344e-05, "loss": 3.2285, "step": 6000 }, { "epoch": 21.428571428571427, "grad_norm": 4.340782642364502, "learning_rate": 8.730536659108089e-05, "loss": 3.0094, "step": 6300 }, { "epoch": 22.448979591836736, "grad_norm": 4.590024948120117, "learning_rate": 8.617157974300832e-05, "loss": 2.7901, "step": 6600 }, { "epoch": 23.46938775510204, "grad_norm": 4.343008995056152, "learning_rate": 8.503779289493575e-05, "loss": 2.5887, "step": 6900 }, { "epoch": 24.489795918367346, "grad_norm": 4.200931549072266, "learning_rate": 8.39040060468632e-05, "loss": 2.3804, "step": 7200 }, { "epoch": 25.510204081632654, "grad_norm": 4.765750408172607, "learning_rate": 8.277021919879064e-05, "loss": 2.1853, "step": 7500 }, { "epoch": 26.53061224489796, "grad_norm": 4.196296215057373, "learning_rate": 8.163643235071807e-05, "loss": 2.0026, "step": 7800 }, { "epoch": 27.551020408163264, "grad_norm": 4.485163688659668, "learning_rate": 8.05026455026455e-05, "loss": 1.8218, "step": 8100 }, { "epoch": 28.571428571428573, "grad_norm": 4.515989780426025, "learning_rate": 7.936885865457294e-05, "loss": 1.6588, "step": 8400 }, { "epoch": 29.591836734693878, "grad_norm": 4.492111682891846, "learning_rate": 7.823507180650039e-05, "loss": 1.4945, "step": 8700 }, { "epoch": 30.612244897959183, "grad_norm": 4.51740026473999, "learning_rate": 7.710128495842782e-05, "loss": 1.3481, "step": 9000 }, { "epoch": 31.632653061224488, "grad_norm": 4.193362236022949, "learning_rate": 7.596749811035526e-05, "loss": 1.2062, "step": 9300 }, { "epoch": 32.6530612244898, "grad_norm": 4.017958164215088, "learning_rate": 7.483371126228269e-05, "loss": 1.0725, "step": 9600 }, { "epoch": 33.673469387755105, "grad_norm": 4.180546283721924, "learning_rate": 7.369992441421014e-05, "loss": 0.9466, "step": 9900 }, { "epoch": 34.69387755102041, "grad_norm": 4.280745983123779, "learning_rate": 7.256613756613757e-05, "loss": 0.8416, "step": 10200 }, { "epoch": 35.714285714285715, "grad_norm": 3.9538300037384033, "learning_rate": 7.143235071806501e-05, "loss": 0.7401, "step": 10500 }, { "epoch": 36.734693877551024, "grad_norm": 4.204588890075684, "learning_rate": 7.029856386999244e-05, "loss": 0.6532, "step": 10800 }, { "epoch": 37.755102040816325, "grad_norm": 3.8845582008361816, "learning_rate": 6.916477702191987e-05, "loss": 0.5701, "step": 11100 }, { "epoch": 38.775510204081634, "grad_norm": 3.7283339500427246, "learning_rate": 6.803099017384732e-05, "loss": 0.5032, "step": 11400 }, { "epoch": 39.795918367346935, "grad_norm": 3.3194797039031982, "learning_rate": 6.689720332577476e-05, "loss": 0.4403, "step": 11700 }, { "epoch": 40.816326530612244, "grad_norm": 3.4429259300231934, "learning_rate": 6.57634164777022e-05, "loss": 0.3887, "step": 12000 }, { "epoch": 41.83673469387755, "grad_norm": 3.080552577972412, "learning_rate": 6.462962962962962e-05, "loss": 0.3442, "step": 12300 }, { "epoch": 42.857142857142854, "grad_norm": 3.2737112045288086, "learning_rate": 6.349584278155707e-05, "loss": 0.3071, "step": 12600 }, { "epoch": 43.87755102040816, "grad_norm": 2.8895883560180664, "learning_rate": 6.236205593348451e-05, "loss": 0.275, "step": 12900 }, { "epoch": 44.89795918367347, "grad_norm": 3.075352430343628, "learning_rate": 6.122826908541194e-05, "loss": 0.2475, "step": 13200 }, { "epoch": 45.91836734693877, "grad_norm": 2.932194471359253, "learning_rate": 6.009448223733938e-05, "loss": 0.2239, "step": 13500 }, { "epoch": 46.93877551020408, "grad_norm": 2.5952064990997314, "learning_rate": 5.896069538926682e-05, "loss": 0.2045, "step": 13800 }, { "epoch": 47.95918367346939, "grad_norm": 2.456416606903076, "learning_rate": 5.7826908541194255e-05, "loss": 0.1875, "step": 14100 }, { "epoch": 48.97959183673469, "grad_norm": 2.836243152618408, "learning_rate": 5.66931216931217e-05, "loss": 0.1717, "step": 14400 }, { "epoch": 50.0, "grad_norm": 2.4769959449768066, "learning_rate": 5.5559334845049137e-05, "loss": 0.1582, "step": 14700 }, { "epoch": 51.02040816326531, "grad_norm": 1.9502800703048706, "learning_rate": 5.442554799697657e-05, "loss": 0.1468, "step": 15000 }, { "epoch": 52.04081632653061, "grad_norm": 2.145501136779785, "learning_rate": 5.3291761148904005e-05, "loss": 0.1366, "step": 15300 }, { "epoch": 53.06122448979592, "grad_norm": 1.8530632257461548, "learning_rate": 5.215797430083145e-05, "loss": 0.1266, "step": 15600 }, { "epoch": 54.08163265306123, "grad_norm": 2.0811140537261963, "learning_rate": 5.1024187452758886e-05, "loss": 0.1179, "step": 15900 }, { "epoch": 55.10204081632653, "grad_norm": 1.8534563779830933, "learning_rate": 4.9890400604686324e-05, "loss": 0.1096, "step": 16200 }, { "epoch": 56.12244897959184, "grad_norm": 1.7965441942214966, "learning_rate": 4.875661375661376e-05, "loss": 0.1032, "step": 16500 }, { "epoch": 57.142857142857146, "grad_norm": 1.824494481086731, "learning_rate": 4.76228269085412e-05, "loss": 0.0967, "step": 16800 }, { "epoch": 58.16326530612245, "grad_norm": 1.6980013847351074, "learning_rate": 4.6489040060468636e-05, "loss": 0.0907, "step": 17100 }, { "epoch": 59.183673469387756, "grad_norm": 1.6149917840957642, "learning_rate": 4.5355253212396074e-05, "loss": 0.085, "step": 17400 }, { "epoch": 60.204081632653065, "grad_norm": 1.788779854774475, "learning_rate": 4.422146636432351e-05, "loss": 0.0804, "step": 17700 }, { "epoch": 61.224489795918366, "grad_norm": 1.7672044038772583, "learning_rate": 4.308767951625094e-05, "loss": 0.0759, "step": 18000 }, { "epoch": 62.244897959183675, "grad_norm": 1.5566641092300415, "learning_rate": 4.1953892668178386e-05, "loss": 0.0722, "step": 18300 }, { "epoch": 63.265306122448976, "grad_norm": 1.291110873222351, "learning_rate": 4.082010582010582e-05, "loss": 0.0678, "step": 18600 }, { "epoch": 64.28571428571429, "grad_norm": 1.596009373664856, "learning_rate": 3.968631897203326e-05, "loss": 0.0639, "step": 18900 }, { "epoch": 65.3061224489796, "grad_norm": 1.4961538314819336, "learning_rate": 3.85525321239607e-05, "loss": 0.0605, "step": 19200 }, { "epoch": 66.3265306122449, "grad_norm": 1.383008599281311, "learning_rate": 3.7418745275888136e-05, "loss": 0.0571, "step": 19500 }, { "epoch": 67.34693877551021, "grad_norm": 1.1882243156433105, "learning_rate": 3.628495842781557e-05, "loss": 0.0541, "step": 19800 }, { "epoch": 68.36734693877551, "grad_norm": 1.4175117015838623, "learning_rate": 3.515117157974301e-05, "loss": 0.0515, "step": 20100 }, { "epoch": 69.38775510204081, "grad_norm": 1.412561058998108, "learning_rate": 3.401738473167045e-05, "loss": 0.0492, "step": 20400 }, { "epoch": 70.40816326530613, "grad_norm": 1.358535885810852, "learning_rate": 3.2883597883597886e-05, "loss": 0.0462, "step": 20700 }, { "epoch": 71.42857142857143, "grad_norm": 1.338392972946167, "learning_rate": 3.174981103552532e-05, "loss": 0.0443, "step": 21000 }, { "epoch": 72.44897959183673, "grad_norm": 1.3225773572921753, "learning_rate": 3.061602418745276e-05, "loss": 0.0419, "step": 21300 }, { "epoch": 73.46938775510205, "grad_norm": 1.1727213859558105, "learning_rate": 2.9482237339380198e-05, "loss": 0.0394, "step": 21600 }, { "epoch": 74.48979591836735, "grad_norm": 1.25161612033844, "learning_rate": 2.834845049130764e-05, "loss": 0.0375, "step": 21900 }, { "epoch": 75.51020408163265, "grad_norm": 1.240116834640503, "learning_rate": 2.7214663643235073e-05, "loss": 0.0358, "step": 22200 }, { "epoch": 76.53061224489795, "grad_norm": 0.9712527394294739, "learning_rate": 2.6080876795162514e-05, "loss": 0.0339, "step": 22500 }, { "epoch": 77.55102040816327, "grad_norm": 1.147048830986023, "learning_rate": 2.4947089947089948e-05, "loss": 0.0323, "step": 22800 }, { "epoch": 78.57142857142857, "grad_norm": 1.0916506052017212, "learning_rate": 2.3813303099017385e-05, "loss": 0.0308, "step": 23100 }, { "epoch": 79.59183673469387, "grad_norm": 1.128098964691162, "learning_rate": 2.2679516250944823e-05, "loss": 0.0294, "step": 23400 }, { "epoch": 80.61224489795919, "grad_norm": 1.0495482683181763, "learning_rate": 2.154572940287226e-05, "loss": 0.0276, "step": 23700 }, { "epoch": 81.63265306122449, "grad_norm": 0.8648446798324585, "learning_rate": 2.0411942554799698e-05, "loss": 0.0261, "step": 24000 }, { "epoch": 82.65306122448979, "grad_norm": 1.1346194744110107, "learning_rate": 1.9278155706727135e-05, "loss": 0.0245, "step": 24300 }, { "epoch": 83.6734693877551, "grad_norm": 0.9076672196388245, "learning_rate": 1.8144368858654572e-05, "loss": 0.0237, "step": 24600 }, { "epoch": 84.6938775510204, "grad_norm": 1.0035544633865356, "learning_rate": 1.701058201058201e-05, "loss": 0.0222, "step": 24900 }, { "epoch": 85.71428571428571, "grad_norm": 0.769279956817627, "learning_rate": 1.587679516250945e-05, "loss": 0.0211, "step": 25200 }, { "epoch": 86.73469387755102, "grad_norm": 0.9665892124176025, "learning_rate": 1.4743008314436888e-05, "loss": 0.0202, "step": 25500 }, { "epoch": 87.75510204081633, "grad_norm": 0.9066174626350403, "learning_rate": 1.3609221466364324e-05, "loss": 0.0193, "step": 25800 }, { "epoch": 88.77551020408163, "grad_norm": 0.9459673166275024, "learning_rate": 1.2475434618291761e-05, "loss": 0.0183, "step": 26100 }, { "epoch": 89.79591836734694, "grad_norm": 0.8062217235565186, "learning_rate": 1.1341647770219199e-05, "loss": 0.0174, "step": 26400 }, { "epoch": 90.81632653061224, "grad_norm": 0.8470116853713989, "learning_rate": 1.0207860922146636e-05, "loss": 0.0167, "step": 26700 }, { "epoch": 91.83673469387755, "grad_norm": 0.7526578903198242, "learning_rate": 9.074074074074075e-06, "loss": 0.016, "step": 27000 }, { "epoch": 92.85714285714286, "grad_norm": 0.6859294176101685, "learning_rate": 7.940287226001513e-06, "loss": 0.0154, "step": 27300 }, { "epoch": 93.87755102040816, "grad_norm": 0.574286937713623, "learning_rate": 6.8065003779289495e-06, "loss": 0.0147, "step": 27600 }, { "epoch": 94.89795918367346, "grad_norm": 0.9053287506103516, "learning_rate": 5.672713529856388e-06, "loss": 0.0143, "step": 27900 }, { "epoch": 95.91836734693878, "grad_norm": 0.5810430645942688, "learning_rate": 4.538926681783825e-06, "loss": 0.0137, "step": 28200 }, { "epoch": 96.93877551020408, "grad_norm": 0.5778042674064636, "learning_rate": 3.4051398337112627e-06, "loss": 0.0132, "step": 28500 }, { "epoch": 97.95918367346938, "grad_norm": 0.5646163821220398, "learning_rate": 2.2713529856387e-06, "loss": 0.0129, "step": 28800 }, { "epoch": 98.9795918367347, "grad_norm": 0.6283496022224426, "learning_rate": 1.1375661375661376e-06, "loss": 0.0126, "step": 29100 } ], "logging_steps": 300, "max_steps": 29400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.08287850496e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }