{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.2997210025787354, "learning_rate": 0.00015998457923856519, "loss": 1.2401, "step": 1 }, { "epoch": 0.025, "grad_norm": 24.28518295288086, "learning_rate": 0.00015993832289925785, "loss": 4.0648, "step": 2 }, { "epoch": 0.0375, "grad_norm": 12.727800369262695, "learning_rate": 0.0001598612488147773, "loss": 2.7044, "step": 3 }, { "epoch": 0.05, "grad_norm": 4.934032440185547, "learning_rate": 0.00015975338669865026, "loss": 1.994, "step": 4 }, { "epoch": 0.0625, "grad_norm": 8.1886625289917, "learning_rate": 0.00015961477813377576, "loss": 2.1426, "step": 5 }, { "epoch": 0.075, "grad_norm": 2.910872220993042, "learning_rate": 0.00015944547655639412, "loss": 1.7254, "step": 6 }, { "epoch": 0.0875, "grad_norm": 1.565290093421936, "learning_rate": 0.00015924554723548617, "loss": 1.5187, "step": 7 }, { "epoch": 0.1, "grad_norm": 2.9080827236175537, "learning_rate": 0.00015901506724761103, "loss": 1.5405, "step": 8 }, { "epoch": 0.1125, "grad_norm": 1.875126838684082, "learning_rate": 0.00015875412544719134, "loss": 1.4493, "step": 9 }, { "epoch": 0.125, "grad_norm": 1.2589935064315796, "learning_rate": 0.00015846282243225845, "loss": 1.4103, "step": 10 }, { "epoch": 0.1375, "grad_norm": 1.2925529479980469, "learning_rate": 0.0001581412705056698, "loss": 1.3507, "step": 11 }, { "epoch": 0.15, "grad_norm": 1.4467802047729492, "learning_rate": 0.00015778959363181415, "loss": 1.3, "step": 12 }, { "epoch": 0.1625, "grad_norm": 1.267639398574829, "learning_rate": 0.0001574079273888208, "loss": 1.2974, "step": 13 }, { "epoch": 0.175, "grad_norm": 1.0911085605621338, "learning_rate": 0.00015699641891629178, "loss": 1.2635, "step": 14 }, { "epoch": 0.1875, "grad_norm": 0.9065354466438293, "learning_rate": 0.00015655522685857672, "loss": 1.2119, "step": 15 }, { "epoch": 0.2, "grad_norm": 0.7415559887886047, "learning_rate": 0.0001560845213036123, "loss": 1.2337, "step": 16 }, { "epoch": 0.2125, "grad_norm": 0.7553166151046753, "learning_rate": 0.00015558448371735025, "loss": 1.1884, "step": 17 }, { "epoch": 0.225, "grad_norm": 0.5407947301864624, "learning_rate": 0.00015505530687379875, "loss": 1.17, "step": 18 }, { "epoch": 0.2375, "grad_norm": 0.5162355899810791, "learning_rate": 0.00015449719478070428, "loss": 1.1879, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.5688554644584656, "learning_rate": 0.00015391036260090294, "loss": 1.1767, "step": 20 }, { "epoch": 0.2625, "grad_norm": 0.48555564880371094, "learning_rate": 0.0001532950365693709, "loss": 1.1726, "step": 21 }, { "epoch": 0.275, "grad_norm": 0.4502723515033722, "learning_rate": 0.00015265145390600652, "loss": 1.163, "step": 22 }, { "epoch": 0.2875, "grad_norm": 0.3590157926082611, "learning_rate": 0.00015197986272417774, "loss": 1.1839, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.38364410400390625, "learning_rate": 0.00015128052193506944, "loss": 1.1642, "step": 24 }, { "epoch": 0.3125, "grad_norm": 0.36856546998023987, "learning_rate": 0.0001505537011478684, "loss": 1.1495, "step": 25 }, { "epoch": 0.325, "grad_norm": 0.3514528274536133, "learning_rate": 0.0001497996805658238, "loss": 1.1257, "step": 26 }, { "epoch": 0.3375, "grad_norm": 0.42414528131484985, "learning_rate": 0.00014901875087822337, "loss": 1.1463, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.35511669516563416, "learning_rate": 0.0001482112131483274, "loss": 1.141, "step": 28 }, { "epoch": 0.3625, "grad_norm": 0.3799460530281067, "learning_rate": 0.00014737737869730292, "loss": 1.1414, "step": 29 }, { "epoch": 0.375, "grad_norm": 0.26333189010620117, "learning_rate": 0.00014651756898420365, "loss": 1.1352, "step": 30 }, { "epoch": 0.3875, "grad_norm": 0.37996864318847656, "learning_rate": 0.0001456321154820411, "loss": 1.1167, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.3210310637950897, "learning_rate": 0.00014472135954999581, "loss": 1.113, "step": 32 }, { "epoch": 0.4125, "grad_norm": 0.342960923910141, "learning_rate": 0.00014378565230181657, "loss": 1.1201, "step": 33 }, { "epoch": 0.425, "grad_norm": 0.30171331763267517, "learning_rate": 0.0001428253544704596, "loss": 1.1303, "step": 34 }, { "epoch": 0.4375, "grad_norm": 0.3308579623699188, "learning_rate": 0.00014184083626901897, "loss": 1.135, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.33749139308929443, "learning_rate": 0.0001408324772480025, "loss": 1.1413, "step": 36 }, { "epoch": 0.4625, "grad_norm": 0.29873886704444885, "learning_rate": 0.00013980066614900776, "loss": 1.1406, "step": 37 }, { "epoch": 0.475, "grad_norm": 0.25276514887809753, "learning_rate": 0.00013874580075485485, "loss": 1.1421, "step": 38 }, { "epoch": 0.4875, "grad_norm": 0.2849913537502289, "learning_rate": 0.00013766828773623352, "loss": 1.1298, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.27665936946868896, "learning_rate": 0.00013656854249492382, "loss": 1.1052, "step": 40 }, { "epoch": 0.5125, "grad_norm": 0.31618547439575195, "learning_rate": 0.0001354469890036509, "loss": 1.1124, "step": 41 }, { "epoch": 0.525, "grad_norm": 0.30855098366737366, "learning_rate": 0.00013430405964263536, "loss": 1.1164, "step": 42 }, { "epoch": 0.5375, "grad_norm": 0.24974325299263, "learning_rate": 0.00013314019503290255, "loss": 1.1379, "step": 43 }, { "epoch": 0.55, "grad_norm": 0.259245365858078, "learning_rate": 0.00013195584386641469, "loss": 1.0963, "step": 44 }, { "epoch": 0.5625, "grad_norm": 0.3342917561531067, "learning_rate": 0.00013075146273309164, "loss": 1.1089, "step": 45 }, { "epoch": 0.575, "grad_norm": 0.3317720293998718, "learning_rate": 0.00012952751594478675, "loss": 1.1226, "step": 46 }, { "epoch": 0.5875, "grad_norm": 0.2566727101802826, "learning_rate": 0.0001282844753562857, "loss": 1.1035, "step": 47 }, { "epoch": 0.6, "grad_norm": 0.25012263655662537, "learning_rate": 0.00012702282018339786, "loss": 1.0713, "step": 48 }, { "epoch": 0.6125, "grad_norm": 0.2855740189552307, "learning_rate": 0.00012574303681820898, "loss": 1.1232, "step": 49 }, { "epoch": 0.625, "grad_norm": 0.21377117931842804, "learning_rate": 0.0001244456186415682, "loss": 1.0726, "step": 50 }, { "epoch": 0.6375, "grad_norm": 0.3012278079986572, "learning_rate": 0.00012313106583288004, "loss": 1.0855, "step": 51 }, { "epoch": 0.65, "grad_norm": 0.2754627764225006, "learning_rate": 0.00012179988517727591, "loss": 1.113, "step": 52 }, { "epoch": 0.6625, "grad_norm": 0.2773728668689728, "learning_rate": 0.00012045258987023879, "loss": 1.0931, "step": 53 }, { "epoch": 0.675, "grad_norm": 0.3616091012954712, "learning_rate": 0.00011908969931975641, "loss": 1.1007, "step": 54 }, { "epoch": 0.6875, "grad_norm": 0.28011709451675415, "learning_rate": 0.00011771173894607985, "loss": 1.1312, "step": 55 }, { "epoch": 0.7, "grad_norm": 0.19245974719524384, "learning_rate": 0.00011631923997916375, "loss": 1.0784, "step": 56 }, { "epoch": 0.7125, "grad_norm": 0.2807864844799042, "learning_rate": 0.00011491273925386736, "loss": 1.0766, "step": 57 }, { "epoch": 0.725, "grad_norm": 0.25869062542915344, "learning_rate": 0.00011349277900299426, "loss": 1.0929, "step": 58 }, { "epoch": 0.7375, "grad_norm": 0.20374780893325806, "learning_rate": 0.00011205990664825127, "loss": 1.0977, "step": 59 }, { "epoch": 0.75, "grad_norm": 0.275302529335022, "learning_rate": 0.00011061467458920719, "loss": 1.1218, "step": 60 }, { "epoch": 0.7625, "grad_norm": 0.26479312777519226, "learning_rate": 0.00010915763999033201, "loss": 1.0972, "step": 61 }, { "epoch": 0.775, "grad_norm": 0.20327049493789673, "learning_rate": 0.00010768936456619945, "loss": 1.0723, "step": 62 }, { "epoch": 0.7875, "grad_norm": 0.18908362090587616, "learning_rate": 0.0001062104143649355, "loss": 1.1059, "step": 63 }, { "epoch": 0.8, "grad_norm": 0.2153932750225067, "learning_rate": 0.0001047213595499958, "loss": 1.1112, "step": 64 }, { "epoch": 0.8125, "grad_norm": 0.23449215292930603, "learning_rate": 0.000103222774180357, "loss": 1.1125, "step": 65 }, { "epoch": 0.825, "grad_norm": 0.19739866256713867, "learning_rate": 0.00010171523598920594, "loss": 1.0506, "step": 66 }, { "epoch": 0.8375, "grad_norm": 0.579247236251831, "learning_rate": 0.00010019932616121264, "loss": 1.0599, "step": 67 }, { "epoch": 0.85, "grad_norm": 0.2158878892660141, "learning_rate": 9.867562910847246e-05, "loss": 1.1116, "step": 68 }, { "epoch": 0.8625, "grad_norm": 0.24534355103969574, "learning_rate": 9.714473224520406e-05, "loss": 1.1039, "step": 69 }, { "epoch": 0.875, "grad_norm": 0.1604059487581253, "learning_rate": 9.560722576129029e-05, "loss": 1.1007, "step": 70 }, { "epoch": 0.8875, "grad_norm": 0.24135896563529968, "learning_rate": 9.406370239474839e-05, "loss": 1.0976, "step": 71 }, { "epoch": 0.9, "grad_norm": 0.2200448215007782, "learning_rate": 9.251475720321848e-05, "loss": 1.1001, "step": 72 }, { "epoch": 0.9125, "grad_norm": 0.17517372965812683, "learning_rate": 9.096098733455746e-05, "loss": 1.0864, "step": 73 }, { "epoch": 0.925, "grad_norm": 0.23631267249584198, "learning_rate": 8.940299179662703e-05, "loss": 1.0976, "step": 74 }, { "epoch": 0.9375, "grad_norm": 0.17627741396427155, "learning_rate": 8.784137122636488e-05, "loss": 1.1049, "step": 75 }, { "epoch": 0.95, "grad_norm": 0.1840021312236786, "learning_rate": 8.627672765822762e-05, "loss": 1.0504, "step": 76 }, { "epoch": 0.9625, "grad_norm": 0.1925836205482483, "learning_rate": 8.470966429209512e-05, "loss": 1.1028, "step": 77 }, { "epoch": 0.975, "grad_norm": 0.18122681975364685, "learning_rate": 8.31407852607255e-05, "loss": 1.0605, "step": 78 }, { "epoch": 0.9875, "grad_norm": 0.16970321536064148, "learning_rate": 8.157069539685026e-05, "loss": 1.1069, "step": 79 }, { "epoch": 1.0, "grad_norm": 0.2044173628091812, "learning_rate": 8e-05, "loss": 1.1173, "step": 80 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6380640896784794e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }