{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8764800560947236, "eval_steps": 500, "global_step": 55000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007968000509952033, "grad_norm": 118280.5625, "learning_rate": 2.9936381561814055e-05, "loss": 1.2421, "step": 500 }, { "epoch": 0.015936001019904066, "grad_norm": 93972.8125, "learning_rate": 2.9856659208197584e-05, "loss": 0.7745, "step": 1000 }, { "epoch": 0.023904001529856098, "grad_norm": 78047.78125, "learning_rate": 2.9776936854581113e-05, "loss": 0.73, "step": 1500 }, { "epoch": 0.03187200203980813, "grad_norm": 72831.09375, "learning_rate": 2.9697214500964642e-05, "loss": 0.7087, "step": 2000 }, { "epoch": 0.03984000254976016, "grad_norm": 62310.4453125, "learning_rate": 2.961749214734817e-05, "loss": 0.6964, "step": 2500 }, { "epoch": 0.047808003059712195, "grad_norm": 65517.09375, "learning_rate": 2.9537769793731697e-05, "loss": 0.6858, "step": 3000 }, { "epoch": 0.05577600356966423, "grad_norm": 57505.90625, "learning_rate": 2.9458047440115226e-05, "loss": 0.6761, "step": 3500 }, { "epoch": 0.06374400407961626, "grad_norm": 55419.26171875, "learning_rate": 2.9378325086498752e-05, "loss": 0.6679, "step": 4000 }, { "epoch": 0.07171200458956829, "grad_norm": 51254.3671875, "learning_rate": 2.929860273288228e-05, "loss": 0.6602, "step": 4500 }, { "epoch": 0.07968000509952032, "grad_norm": 51469.9765625, "learning_rate": 2.9218880379265813e-05, "loss": 0.6554, "step": 5000 }, { "epoch": 0.08764800560947236, "grad_norm": 69357.2265625, "learning_rate": 2.913915802564934e-05, "loss": 0.6516, "step": 5500 }, { "epoch": 0.09561600611942439, "grad_norm": 48255.7890625, "learning_rate": 2.9059435672032868e-05, "loss": 0.6482, "step": 6000 }, { "epoch": 0.10358400662937642, "grad_norm": 59810.20703125, "learning_rate": 2.8979713318416394e-05, "loss": 0.6397, "step": 6500 }, { "epoch": 0.11155200713932846, "grad_norm": 54012.7109375, "learning_rate": 2.8899990964799923e-05, "loss": 0.6425, "step": 7000 }, { "epoch": 0.11952000764928049, "grad_norm": 47228.74609375, "learning_rate": 2.8820268611183452e-05, "loss": 0.6392, "step": 7500 }, { "epoch": 0.12748800815923253, "grad_norm": 55561.9453125, "learning_rate": 2.874054625756698e-05, "loss": 0.6342, "step": 8000 }, { "epoch": 0.13545600866918456, "grad_norm": 50830.77734375, "learning_rate": 2.866082390395051e-05, "loss": 0.6288, "step": 8500 }, { "epoch": 0.14342400917913659, "grad_norm": 45037.15234375, "learning_rate": 2.858110155033404e-05, "loss": 0.6283, "step": 9000 }, { "epoch": 0.1513920096890886, "grad_norm": 42729.57421875, "learning_rate": 2.8501379196717565e-05, "loss": 0.6252, "step": 9500 }, { "epoch": 0.15936001019904064, "grad_norm": 46592.18359375, "learning_rate": 2.8421656843101094e-05, "loss": 0.6199, "step": 10000 }, { "epoch": 0.1673280107089927, "grad_norm": 40794.70703125, "learning_rate": 2.834193448948462e-05, "loss": 0.6237, "step": 10500 }, { "epoch": 0.17529601121894473, "grad_norm": 40227.96484375, "learning_rate": 2.8262212135868152e-05, "loss": 0.6174, "step": 11000 }, { "epoch": 0.18326401172889675, "grad_norm": 49994.6015625, "learning_rate": 2.818248978225168e-05, "loss": 0.6151, "step": 11500 }, { "epoch": 0.19123201223884878, "grad_norm": 55100.75390625, "learning_rate": 2.8102767428635207e-05, "loss": 0.6117, "step": 12000 }, { "epoch": 0.1992000127488008, "grad_norm": 93011.546875, "learning_rate": 2.8023045075018736e-05, "loss": 0.6122, "step": 12500 }, { "epoch": 0.20716801325875284, "grad_norm": 47631.09375, "learning_rate": 2.794332272140226e-05, "loss": 0.6133, "step": 13000 }, { "epoch": 0.2151360137687049, "grad_norm": 42235.15625, "learning_rate": 2.786360036778579e-05, "loss": 0.6038, "step": 13500 }, { "epoch": 0.22310401427865692, "grad_norm": 41144.3359375, "learning_rate": 2.7783878014169323e-05, "loss": 0.6074, "step": 14000 }, { "epoch": 0.23107201478860895, "grad_norm": 47137.19921875, "learning_rate": 2.770415566055285e-05, "loss": 0.6047, "step": 14500 }, { "epoch": 0.23904001529856098, "grad_norm": 37974.65625, "learning_rate": 2.7624433306936378e-05, "loss": 0.6058, "step": 15000 }, { "epoch": 0.247008015808513, "grad_norm": 49424.44140625, "learning_rate": 2.7544710953319903e-05, "loss": 0.6014, "step": 15500 }, { "epoch": 0.25497601631846506, "grad_norm": 57363.96875, "learning_rate": 2.7464988599703432e-05, "loss": 0.6012, "step": 16000 }, { "epoch": 0.26294401682841706, "grad_norm": 48850.90234375, "learning_rate": 2.738526624608696e-05, "loss": 0.5969, "step": 16500 }, { "epoch": 0.2709120173383691, "grad_norm": 60200.43359375, "learning_rate": 2.730554389247049e-05, "loss": 0.5958, "step": 17000 }, { "epoch": 0.2788800178483211, "grad_norm": 48039.16015625, "learning_rate": 2.722582153885402e-05, "loss": 0.598, "step": 17500 }, { "epoch": 0.28684801835827317, "grad_norm": 34431.2265625, "learning_rate": 2.7146099185237545e-05, "loss": 0.5953, "step": 18000 }, { "epoch": 0.2948160188682252, "grad_norm": 34913.16015625, "learning_rate": 2.7066376831621074e-05, "loss": 0.5932, "step": 18500 }, { "epoch": 0.3027840193781772, "grad_norm": 36192.234375, "learning_rate": 2.6986654478004603e-05, "loss": 0.5939, "step": 19000 }, { "epoch": 0.3107520198881293, "grad_norm": 46195.62890625, "learning_rate": 2.690693212438813e-05, "loss": 0.5914, "step": 19500 }, { "epoch": 0.3187200203980813, "grad_norm": 43815.43359375, "learning_rate": 2.682720977077166e-05, "loss": 0.5904, "step": 20000 }, { "epoch": 0.32668802090803334, "grad_norm": 35572.34375, "learning_rate": 2.674748741715519e-05, "loss": 0.5865, "step": 20500 }, { "epoch": 0.3346560214179854, "grad_norm": 36805.02734375, "learning_rate": 2.6667765063538716e-05, "loss": 0.5904, "step": 21000 }, { "epoch": 0.3426240219279374, "grad_norm": 45271.30078125, "learning_rate": 2.6588042709922245e-05, "loss": 0.5873, "step": 21500 }, { "epoch": 0.35059202243788945, "grad_norm": 35245.0234375, "learning_rate": 2.650832035630577e-05, "loss": 0.5856, "step": 22000 }, { "epoch": 0.35856002294784145, "grad_norm": 34578.25, "learning_rate": 2.64285980026893e-05, "loss": 0.5795, "step": 22500 }, { "epoch": 0.3665280234577935, "grad_norm": 40504.375, "learning_rate": 2.6348875649072832e-05, "loss": 0.5846, "step": 23000 }, { "epoch": 0.37449602396774556, "grad_norm": 37683.4453125, "learning_rate": 2.6269153295456358e-05, "loss": 0.5792, "step": 23500 }, { "epoch": 0.38246402447769756, "grad_norm": 35323.41015625, "learning_rate": 2.6189430941839887e-05, "loss": 0.5803, "step": 24000 }, { "epoch": 0.3904320249876496, "grad_norm": 47546.1953125, "learning_rate": 2.6109708588223413e-05, "loss": 0.5805, "step": 24500 }, { "epoch": 0.3984000254976016, "grad_norm": 44604.859375, "learning_rate": 2.6029986234606942e-05, "loss": 0.5773, "step": 25000 }, { "epoch": 0.4063680260075537, "grad_norm": 47108.78125, "learning_rate": 2.595026388099047e-05, "loss": 0.5825, "step": 25500 }, { "epoch": 0.4143360265175057, "grad_norm": 42788.87109375, "learning_rate": 2.5870541527374e-05, "loss": 0.578, "step": 26000 }, { "epoch": 0.42230402702745773, "grad_norm": 33550.89453125, "learning_rate": 2.579081917375753e-05, "loss": 0.5779, "step": 26500 }, { "epoch": 0.4302720275374098, "grad_norm": 36036.8984375, "learning_rate": 2.5711096820141055e-05, "loss": 0.574, "step": 27000 }, { "epoch": 0.4382400280473618, "grad_norm": 61990.4296875, "learning_rate": 2.5631374466524584e-05, "loss": 0.5751, "step": 27500 }, { "epoch": 0.44620802855731384, "grad_norm": 35250.4375, "learning_rate": 2.5551652112908113e-05, "loss": 0.5761, "step": 28000 }, { "epoch": 0.45417602906726584, "grad_norm": 40302.27734375, "learning_rate": 2.547192975929164e-05, "loss": 0.5723, "step": 28500 }, { "epoch": 0.4621440295772179, "grad_norm": 40248.03515625, "learning_rate": 2.539220740567517e-05, "loss": 0.5754, "step": 29000 }, { "epoch": 0.47011203008716995, "grad_norm": 38083.6484375, "learning_rate": 2.5312485052058697e-05, "loss": 0.5719, "step": 29500 }, { "epoch": 0.47808003059712195, "grad_norm": 32291.21484375, "learning_rate": 2.5232762698442226e-05, "loss": 0.5726, "step": 30000 }, { "epoch": 0.486048031107074, "grad_norm": 39065.84375, "learning_rate": 2.5153040344825755e-05, "loss": 0.5716, "step": 30500 }, { "epoch": 0.494016031617026, "grad_norm": 46160.44921875, "learning_rate": 2.507331799120928e-05, "loss": 0.5723, "step": 31000 }, { "epoch": 0.5019840321269781, "grad_norm": 36111.65625, "learning_rate": 2.499359563759281e-05, "loss": 0.569, "step": 31500 }, { "epoch": 0.5099520326369301, "grad_norm": 33012.55859375, "learning_rate": 2.4913873283976342e-05, "loss": 0.5692, "step": 32000 }, { "epoch": 0.5179200331468822, "grad_norm": 40300.83203125, "learning_rate": 2.4834150930359868e-05, "loss": 0.5686, "step": 32500 }, { "epoch": 0.5258880336568341, "grad_norm": 38612.78515625, "learning_rate": 2.4754428576743397e-05, "loss": 0.5663, "step": 33000 }, { "epoch": 0.5338560341667862, "grad_norm": 52813.0234375, "learning_rate": 2.4674706223126922e-05, "loss": 0.5662, "step": 33500 }, { "epoch": 0.5418240346767382, "grad_norm": 36266.23046875, "learning_rate": 2.459498386951045e-05, "loss": 0.5635, "step": 34000 }, { "epoch": 0.5497920351866903, "grad_norm": 38791.453125, "learning_rate": 2.451526151589398e-05, "loss": 0.5677, "step": 34500 }, { "epoch": 0.5577600356966422, "grad_norm": 43926.4296875, "learning_rate": 2.443553916227751e-05, "loss": 0.5688, "step": 35000 }, { "epoch": 0.5657280362065943, "grad_norm": 32247.173828125, "learning_rate": 2.435581680866104e-05, "loss": 0.5686, "step": 35500 }, { "epoch": 0.5736960367165463, "grad_norm": 35230.44140625, "learning_rate": 2.4276094455044564e-05, "loss": 0.566, "step": 36000 }, { "epoch": 0.5816640372264984, "grad_norm": 33197.58203125, "learning_rate": 2.4196372101428093e-05, "loss": 0.5629, "step": 36500 }, { "epoch": 0.5896320377364505, "grad_norm": 41267.5390625, "learning_rate": 2.4116649747811623e-05, "loss": 0.5642, "step": 37000 }, { "epoch": 0.5976000382464024, "grad_norm": 41852.71875, "learning_rate": 2.4036927394195148e-05, "loss": 0.5635, "step": 37500 }, { "epoch": 0.6055680387563545, "grad_norm": 32820.1015625, "learning_rate": 2.395720504057868e-05, "loss": 0.5639, "step": 38000 }, { "epoch": 0.6135360392663065, "grad_norm": 39303.70703125, "learning_rate": 2.3877482686962206e-05, "loss": 0.5605, "step": 38500 }, { "epoch": 0.6215040397762586, "grad_norm": 53008.375, "learning_rate": 2.3797760333345735e-05, "loss": 0.562, "step": 39000 }, { "epoch": 0.6294720402862106, "grad_norm": 41845.28515625, "learning_rate": 2.3718037979729264e-05, "loss": 0.5641, "step": 39500 }, { "epoch": 0.6374400407961626, "grad_norm": 49620.7734375, "learning_rate": 2.363831562611279e-05, "loss": 0.5582, "step": 40000 }, { "epoch": 0.6454080413061146, "grad_norm": 46115.32421875, "learning_rate": 2.355859327249632e-05, "loss": 0.5616, "step": 40500 }, { "epoch": 0.6533760418160667, "grad_norm": 33357.73828125, "learning_rate": 2.3478870918879848e-05, "loss": 0.5633, "step": 41000 }, { "epoch": 0.6613440423260187, "grad_norm": 48345.0859375, "learning_rate": 2.3399148565263377e-05, "loss": 0.5588, "step": 41500 }, { "epoch": 0.6693120428359708, "grad_norm": 46256.62890625, "learning_rate": 2.3319426211646906e-05, "loss": 0.5584, "step": 42000 }, { "epoch": 0.6772800433459227, "grad_norm": 34897.8828125, "learning_rate": 2.3239703858030432e-05, "loss": 0.5577, "step": 42500 }, { "epoch": 0.6852480438558748, "grad_norm": 86223.203125, "learning_rate": 2.315998150441396e-05, "loss": 0.5549, "step": 43000 }, { "epoch": 0.6932160443658268, "grad_norm": 32481.861328125, "learning_rate": 2.308025915079749e-05, "loss": 0.5623, "step": 43500 }, { "epoch": 0.7011840448757789, "grad_norm": 33890.0859375, "learning_rate": 2.300053679718102e-05, "loss": 0.5618, "step": 44000 }, { "epoch": 0.709152045385731, "grad_norm": 32936.66015625, "learning_rate": 2.2920814443564548e-05, "loss": 0.5613, "step": 44500 }, { "epoch": 0.7171200458956829, "grad_norm": 36461.4453125, "learning_rate": 2.2841092089948074e-05, "loss": 0.5581, "step": 45000 }, { "epoch": 0.725088046405635, "grad_norm": 35596.3046875, "learning_rate": 2.2761369736331603e-05, "loss": 0.5514, "step": 45500 }, { "epoch": 0.733056046915587, "grad_norm": 46988.7890625, "learning_rate": 2.2681647382715132e-05, "loss": 0.5511, "step": 46000 }, { "epoch": 0.7410240474255391, "grad_norm": 34446.23046875, "learning_rate": 2.2601925029098658e-05, "loss": 0.556, "step": 46500 }, { "epoch": 0.7489920479354911, "grad_norm": 30729.033203125, "learning_rate": 2.252220267548219e-05, "loss": 0.5568, "step": 47000 }, { "epoch": 0.7569600484454431, "grad_norm": 35997.25390625, "learning_rate": 2.2442480321865716e-05, "loss": 0.5538, "step": 47500 }, { "epoch": 0.7649280489553951, "grad_norm": 36570.2578125, "learning_rate": 2.2362757968249245e-05, "loss": 0.5524, "step": 48000 }, { "epoch": 0.7728960494653472, "grad_norm": 38073.46484375, "learning_rate": 2.2283035614632774e-05, "loss": 0.5564, "step": 48500 }, { "epoch": 0.7808640499752992, "grad_norm": 32213.791015625, "learning_rate": 2.22033132610163e-05, "loss": 0.5531, "step": 49000 }, { "epoch": 0.7888320504852512, "grad_norm": 51021.55078125, "learning_rate": 2.212359090739983e-05, "loss": 0.5551, "step": 49500 }, { "epoch": 0.7968000509952032, "grad_norm": 34028.9609375, "learning_rate": 2.2043868553783358e-05, "loss": 0.5522, "step": 50000 }, { "epoch": 0.8047680515051553, "grad_norm": 35102.0390625, "learning_rate": 2.1964146200166887e-05, "loss": 0.551, "step": 50500 }, { "epoch": 0.8127360520151073, "grad_norm": 38054.03125, "learning_rate": 2.1884423846550416e-05, "loss": 0.5564, "step": 51000 }, { "epoch": 0.8207040525250594, "grad_norm": 34621.86328125, "learning_rate": 2.180470149293394e-05, "loss": 0.5505, "step": 51500 }, { "epoch": 0.8286720530350113, "grad_norm": 34099.5859375, "learning_rate": 2.172497913931747e-05, "loss": 0.5504, "step": 52000 }, { "epoch": 0.8366400535449634, "grad_norm": 36866.79296875, "learning_rate": 2.1645256785701e-05, "loss": 0.5473, "step": 52500 }, { "epoch": 0.8446080540549155, "grad_norm": 38420.94140625, "learning_rate": 2.156553443208453e-05, "loss": 0.547, "step": 53000 }, { "epoch": 0.8525760545648675, "grad_norm": 35430.8984375, "learning_rate": 2.1485812078468058e-05, "loss": 0.5504, "step": 53500 }, { "epoch": 0.8605440550748196, "grad_norm": 35433.71875, "learning_rate": 2.1406089724851584e-05, "loss": 0.5547, "step": 54000 }, { "epoch": 0.8685120555847715, "grad_norm": 41701.05859375, "learning_rate": 2.1326367371235113e-05, "loss": 0.5517, "step": 54500 }, { "epoch": 0.8764800560947236, "grad_norm": 34275.7265625, "learning_rate": 2.124664501761864e-05, "loss": 0.5496, "step": 55000 } ], "logging_steps": 500, "max_steps": 188253, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1496849408e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }