{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 25, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21052631578947367, "grad_norm": 3.703707456588745, "learning_rate": 0.0, "loss": 0.2265, "num_input_tokens_seen": 90632, "step": 1, "train_runtime": 21.9544, "train_tokens_per_second": 4128.187 }, { "epoch": 0.42105263157894735, "grad_norm": 3.695063352584839, "learning_rate": 1.6666666666666667e-05, "loss": 0.2547, "num_input_tokens_seen": 183712, "step": 2, "train_runtime": 31.6354, "train_tokens_per_second": 5807.165 }, { "epoch": 0.631578947368421, "grad_norm": 3.6188080310821533, "learning_rate": 3.3333333333333335e-05, "loss": 0.2468, "num_input_tokens_seen": 277384, "step": 3, "train_runtime": 41.3805, "train_tokens_per_second": 6703.247 }, { "epoch": 0.8421052631578947, "grad_norm": 1.5909664630889893, "learning_rate": 5e-05, "loss": 0.238, "num_input_tokens_seen": 376672, "step": 4, "train_runtime": 51.4829, "train_tokens_per_second": 7316.455 }, { "epoch": 1.0, "grad_norm": 1.5909664630889893, "learning_rate": 4.9745536047023324e-05, "loss": 0.2939, "num_input_tokens_seen": 448584, "step": 5, "train_runtime": 56.2763, "train_tokens_per_second": 7971.101 }, { "epoch": 1.2105263157894737, "grad_norm": 1.5388646125793457, "learning_rate": 4.898732434036244e-05, "loss": 0.3134, "num_input_tokens_seen": 541576, "step": 6, "train_runtime": 66.044, "train_tokens_per_second": 8200.224 }, { "epoch": 1.4210526315789473, "grad_norm": 1.0950462818145752, "learning_rate": 4.774079988386296e-05, "loss": 0.2642, "num_input_tokens_seen": 637744, "step": 7, "train_runtime": 75.8587, "train_tokens_per_second": 8407.001 }, { "epoch": 1.631578947368421, "grad_norm": 0.9882143139839172, "learning_rate": 4.6031338320779534e-05, "loss": 0.2666, "num_input_tokens_seen": 732120, "step": 8, "train_runtime": 86.0089, "train_tokens_per_second": 8512.144 }, { "epoch": 1.8421052631578947, "grad_norm": 0.7304351925849915, "learning_rate": 4.389373935885646e-05, "loss": 0.2346, "num_input_tokens_seen": 833528, "step": 9, "train_runtime": 96.2093, "train_tokens_per_second": 8663.694 }, { "epoch": 2.0, "grad_norm": 0.7826125025749207, "learning_rate": 4.137151834863213e-05, "loss": 0.251, "num_input_tokens_seen": 897168, "step": 10, "train_runtime": 104.3221, "train_tokens_per_second": 8599.982 }, { "epoch": 2.2105263157894735, "grad_norm": 0.8914588093757629, "learning_rate": 3.851602043638994e-05, "loss": 0.2168, "num_input_tokens_seen": 993464, "step": 11, "train_runtime": 113.7023, "train_tokens_per_second": 8737.415 }, { "epoch": 2.4210526315789473, "grad_norm": 0.7048306465148926, "learning_rate": 3.5385375325047166e-05, "loss": 0.1636, "num_input_tokens_seen": 1092872, "step": 12, "train_runtime": 123.4543, "train_tokens_per_second": 8852.44 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5587737560272217, "learning_rate": 3.2043313921035743e-05, "loss": 0.1882, "num_input_tokens_seen": 1188144, "step": 13, "train_runtime": 133.3972, "train_tokens_per_second": 8906.815 }, { "epoch": 2.8421052631578947, "grad_norm": 0.43069741129875183, "learning_rate": 2.8557870956832132e-05, "loss": 0.161, "num_input_tokens_seen": 1287152, "step": 14, "train_runtime": 143.1833, "train_tokens_per_second": 8989.542 }, { "epoch": 3.0, "grad_norm": 0.9480004906654358, "learning_rate": 2.5e-05, "loss": 0.2328, "num_input_tokens_seen": 1345752, "step": 15, "train_runtime": 150.9211, "train_tokens_per_second": 8916.923 }, { "epoch": 3.2105263157894735, "grad_norm": 0.7457718253135681, "learning_rate": 2.1442129043167874e-05, "loss": 0.1513, "num_input_tokens_seen": 1438888, "step": 16, "train_runtime": 160.8789, "train_tokens_per_second": 8943.92 }, { "epoch": 3.4210526315789473, "grad_norm": 0.40165457129478455, "learning_rate": 1.795668607896426e-05, "loss": 0.1159, "num_input_tokens_seen": 1535160, "step": 17, "train_runtime": 170.5415, "train_tokens_per_second": 9001.68 }, { "epoch": 3.6315789473684212, "grad_norm": 0.3553405702114105, "learning_rate": 1.4614624674952842e-05, "loss": 0.1408, "num_input_tokens_seen": 1629952, "step": 18, "train_runtime": 180.3174, "train_tokens_per_second": 9039.349 }, { "epoch": 3.8421052631578947, "grad_norm": 0.4195708632469177, "learning_rate": 1.148397956361007e-05, "loss": 0.1207, "num_input_tokens_seen": 1729264, "step": 19, "train_runtime": 190.1508, "train_tokens_per_second": 9094.17 }, { "epoch": 4.0, "grad_norm": 0.6537638306617737, "learning_rate": 8.628481651367876e-06, "loss": 0.1767, "num_input_tokens_seen": 1794336, "step": 20, "train_runtime": 197.9448, "train_tokens_per_second": 9064.829 }, { "epoch": 4.2105263157894735, "grad_norm": 0.2990877628326416, "learning_rate": 6.106260641143546e-06, "loss": 0.0917, "num_input_tokens_seen": 1893760, "step": 21, "train_runtime": 207.6689, "train_tokens_per_second": 9119.132 }, { "epoch": 4.421052631578947, "grad_norm": 0.40062811970710754, "learning_rate": 3.968661679220468e-06, "loss": 0.1139, "num_input_tokens_seen": 1981256, "step": 22, "train_runtime": 216.876, "train_tokens_per_second": 9135.434 }, { "epoch": 4.631578947368421, "grad_norm": 0.31818389892578125, "learning_rate": 2.2592001161370392e-06, "loss": 0.1306, "num_input_tokens_seen": 2077800, "step": 23, "train_runtime": 226.6964, "train_tokens_per_second": 9165.564 }, { "epoch": 4.842105263157895, "grad_norm": 0.39665085077285767, "learning_rate": 1.0126756596375686e-06, "loss": 0.133, "num_input_tokens_seen": 2170328, "step": 24, "train_runtime": 236.3103, "train_tokens_per_second": 9184.228 }, { "epoch": 5.0, "grad_norm": 0.39665085077285767, "learning_rate": 2.544639529766829e-07, "loss": 0.1053, "num_input_tokens_seen": 2242920, "step": 25, "train_runtime": 240.7089, "train_tokens_per_second": 9317.977 }, { "epoch": 5.0, "eval_accuracy": 0.9471856009625939, "eval_loss": 0.16564106941223145, "eval_runtime": 0.5896, "eval_samples_per_second": 8.48, "eval_steps_per_second": 3.392, "num_input_tokens_seen": 2242920, "step": 25 }, { "epoch": 5.0, "num_input_tokens_seen": 2242920, "step": 25, "total_flos": 1.0185196765918003e+17, "train_loss": 0.19328440070152283, "train_runtime": 354.8577, "train_samples_per_second": 1.071, "train_steps_per_second": 0.07 } ], "logging_steps": 1, "max_steps": 25, "num_input_tokens_seen": 2242920, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0185196765918003e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }