{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4536366770079283, "eval_steps": 500, "global_step": 2109, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006894174422612892, "grad_norm": 6.479256629943848, "learning_rate": 0.0, "loss": 1.6632, "step": 1 }, { "epoch": 0.03447087211306446, "grad_norm": 3.2223708629608154, "learning_rate": 1.9999434046461045e-05, "loss": 1.4496, "step": 50 }, { "epoch": 0.06894174422612892, "grad_norm": 3.344071388244629, "learning_rate": 1.996249692618611e-05, "loss": 1.2491, "step": 100 }, { "epoch": 0.10341261633919338, "grad_norm": 3.0401906967163086, "learning_rate": 1.9868053167196865e-05, "loss": 1.1876, "step": 150 }, { "epoch": 0.13788348845225784, "grad_norm": 3.164588689804077, "learning_rate": 1.971664792831919e-05, "loss": 1.1607, "step": 200 }, { "epoch": 0.1723543605653223, "grad_norm": 2.982455253601074, "learning_rate": 1.9509155167802316e-05, "loss": 1.1297, "step": 250 }, { "epoch": 0.20682523267838676, "grad_norm": 3.021336793899536, "learning_rate": 1.9246772598559302e-05, "loss": 1.1096, "step": 300 }, { "epoch": 0.24129610479145122, "grad_norm": 2.949462652206421, "learning_rate": 1.8931014774594656e-05, "loss": 1.1004, "step": 350 }, { "epoch": 0.2757669769045157, "grad_norm": 2.738522529602051, "learning_rate": 1.8563704348526337e-05, "loss": 1.0608, "step": 400 }, { "epoch": 0.31023784901758017, "grad_norm": 3.002688407897949, "learning_rate": 1.8146961550666525e-05, "loss": 1.0632, "step": 450 }, { "epoch": 0.3447087211306446, "grad_norm": 2.8380253314971924, "learning_rate": 1.7683191950391142e-05, "loss": 1.0549, "step": 500 }, { "epoch": 0.3791795932437091, "grad_norm": 2.875920295715332, "learning_rate": 1.717507257044331e-05, "loss": 1.0378, "step": 550 }, { "epoch": 0.4136504653567735, "grad_norm": 2.8477611541748047, "learning_rate": 1.6625536434323358e-05, "loss": 1.0359, "step": 600 }, { "epoch": 0.448121337469838, "grad_norm": 2.6955602169036865, "learning_rate": 1.6037755635962587e-05, "loss": 1.017, "step": 650 }, { "epoch": 0.48259220958290244, "grad_norm": 2.665055751800537, "learning_rate": 1.5415123029408046e-05, "loss": 1.0066, "step": 700 }, { "epoch": 0.5170630816959669, "grad_norm": 2.7886996269226074, "learning_rate": 1.4761232644210963e-05, "loss": 1.0003, "step": 750 }, { "epoch": 0.5515339538090314, "grad_norm": 2.561713218688965, "learning_rate": 1.4079858939567557e-05, "loss": 0.9959, "step": 800 }, { "epoch": 0.5860048259220958, "grad_norm": 2.623467206954956, "learning_rate": 1.3374935016963595e-05, "loss": 0.9915, "step": 850 }, { "epoch": 0.6204756980351603, "grad_norm": 2.6682498455047607, "learning_rate": 1.2650529917086232e-05, "loss": 0.9738, "step": 900 }, { "epoch": 0.6549465701482248, "grad_norm": 2.6951420307159424, "learning_rate": 1.1910825132052356e-05, "loss": 0.9785, "step": 950 }, { "epoch": 0.6894174422612892, "grad_norm": 2.6426846981048584, "learning_rate": 1.1160090468532266e-05, "loss": 0.9806, "step": 1000 }, { "epoch": 0.7238883143743536, "grad_norm": 2.6905856132507324, "learning_rate": 1.0402659401094154e-05, "loss": 0.9568, "step": 1050 }, { "epoch": 0.7583591864874182, "grad_norm": 2.618277072906494, "learning_rate": 9.642904058037667e-06, "loss": 0.9653, "step": 1100 }, { "epoch": 0.7928300586004826, "grad_norm": 2.5803515911102295, "learning_rate": 8.885209984106072e-06, "loss": 0.9566, "step": 1150 }, { "epoch": 0.827300930713547, "grad_norm": 2.7561099529266357, "learning_rate": 8.133950825754511e-06, "loss": 0.9571, "step": 1200 }, { "epoch": 0.8617718028266115, "grad_norm": 2.651911735534668, "learning_rate": 7.393463085098886e-06, "loss": 0.9485, "step": 1250 }, { "epoch": 0.896242674939676, "grad_norm": 2.649693250656128, "learning_rate": 6.6680210882734805e-06, "loss": 0.9471, "step": 1300 }, { "epoch": 0.9307135470527405, "grad_norm": 2.6041650772094727, "learning_rate": 5.961812312687689e-06, "loss": 0.9349, "step": 1350 }, { "epoch": 0.9651844191658049, "grad_norm": 2.6525392532348633, "learning_rate": 5.278913215600714e-06, "loss": 0.9344, "step": 1400 }, { "epoch": 0.9996552912788693, "grad_norm": 2.6537036895751953, "learning_rate": 4.623265703539146e-06, "loss": 0.9174, "step": 1450 }, { "epoch": 1.033781454670803, "grad_norm": 2.5499179363250732, "learning_rate": 3.998654378383361e-06, "loss": 0.7982, "step": 1500 }, { "epoch": 1.0682523267838677, "grad_norm": 2.596992254257202, "learning_rate": 3.408684691465355e-06, "loss": 0.786, "step": 1550 }, { "epoch": 1.1027231988969322, "grad_norm": 2.550497531890869, "learning_rate": 2.85676213177945e-06, "loss": 0.7988, "step": 1600 }, { "epoch": 1.1371940710099966, "grad_norm": 2.5404860973358154, "learning_rate": 2.3460725684379002e-06, "loss": 0.7874, "step": 1650 }, { "epoch": 1.171664943123061, "grad_norm": 2.6311583518981934, "learning_rate": 1.8795638608410016e-06, "loss": 0.7965, "step": 1700 }, { "epoch": 1.2061358152361255, "grad_norm": 2.623417615890503, "learning_rate": 1.4599288427134283e-06, "loss": 0.7922, "step": 1750 }, { "epoch": 1.2406066873491899, "grad_norm": 2.5811400413513184, "learning_rate": 1.0895897782283305e-06, "loss": 0.7915, "step": 1800 }, { "epoch": 1.2750775594622543, "grad_norm": 2.5747768878936768, "learning_rate": 7.706843799431985e-07, "loss": 0.7884, "step": 1850 }, { "epoch": 1.309548431575319, "grad_norm": 2.675109624862671, "learning_rate": 5.050534692564358e-07, "loss": 0.786, "step": 1900 }, { "epoch": 1.3440193036883832, "grad_norm": 2.594230890274048, "learning_rate": 2.94230350612239e-07, "loss": 0.7751, "step": 1950 }, { "epoch": 1.3784901758014478, "grad_norm": 2.5236330032348633, "learning_rate": 1.3943196078924247e-07, "loss": 0.7861, "step": 2000 }, { "epoch": 1.4129610479145123, "grad_norm": 2.671135902404785, "learning_rate": 4.155184436196669e-08, "loss": 0.7872, "step": 2050 }, { "epoch": 1.4474319200275767, "grad_norm": 2.5242528915405273, "learning_rate": 1.154995882924892e-09, "loss": 0.7975, "step": 2100 }, { "epoch": 1.4536366770079283, "step": 2109, "total_flos": 7.41829675403182e+16, "train_loss": 0.9605129250082827, "train_runtime": 2747.171, "train_samples_per_second": 6.142, "train_steps_per_second": 0.768 } ], "logging_steps": 50, "max_steps": 2109, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.41829675403182e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }