{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5231746031746032, "eval_steps": 300, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.5461102724075317, "epoch": 0.0025396825396825397, "grad_norm": 3.896618366241455, "learning_rate": 0.0, "loss": 2.3711, "mean_token_accuracy": 0.6472751796245575, "num_tokens": 9621.0, "step": 1 }, { "entropy": 1.581420679233576, "epoch": 0.050793650793650794, "grad_norm": 2.02801251411438, "learning_rate": 9.5e-06, "loss": 1.9565, "mean_token_accuracy": 0.6899475496458379, "num_tokens": 194969.0, "step": 20 }, { "entropy": 1.8750008303672074, "epoch": 0.10158730158730159, "grad_norm": 0.83933025598526, "learning_rate": 1.95e-05, "loss": 1.4223, "mean_token_accuracy": 0.7304104179143905, "num_tokens": 390209.0, "step": 40 }, { "entropy": 1.868852799385786, "epoch": 0.1523809523809524, "grad_norm": 0.7984667420387268, "learning_rate": 1.9968176841806687e-05, "loss": 0.9479, "mean_token_accuracy": 0.7837442796677351, "num_tokens": 585927.0, "step": 60 }, { "entropy": 1.635760549083352, "epoch": 0.20317460317460317, "grad_norm": 0.7746185660362244, "learning_rate": 1.9866148103359362e-05, "loss": 0.6905, "mean_token_accuracy": 0.8289982877671719, "num_tokens": 781164.0, "step": 80 }, { "entropy": 1.4979693002998828, "epoch": 0.25396825396825395, "grad_norm": 0.7985681891441345, "learning_rate": 1.9694545073405348e-05, "loss": 0.5682, "mean_token_accuracy": 0.8550767470151186, "num_tokens": 976497.0, "step": 100 }, { "entropy": 1.3932939764112233, "epoch": 0.3047619047619048, "grad_norm": 0.8087366223335266, "learning_rate": 1.94545778654666e-05, "loss": 0.5126, "mean_token_accuracy": 0.8678769677877426, "num_tokens": 1171938.0, "step": 120 }, { "entropy": 1.3521239839494228, "epoch": 0.35555555555555557, "grad_norm": 1.0716849565505981, "learning_rate": 1.9147938684880213e-05, "loss": 0.4679, "mean_token_accuracy": 0.8757014229893685, "num_tokens": 1367065.0, "step": 140 }, { "entropy": 1.276645314320922, "epoch": 0.40634920634920635, "grad_norm": 1.04561448097229, "learning_rate": 1.8776789895672557e-05, "loss": 0.4319, "mean_token_accuracy": 0.8847867721691728, "num_tokens": 1562066.0, "step": 160 }, { "entropy": 1.237958626449108, "epoch": 0.45714285714285713, "grad_norm": 0.78230220079422, "learning_rate": 1.8343748771959346e-05, "loss": 0.4182, "mean_token_accuracy": 0.8867125200107694, "num_tokens": 1756828.0, "step": 180 }, { "entropy": 1.2167125567793846, "epoch": 0.5079365079365079, "grad_norm": 1.3130111694335938, "learning_rate": 1.785186904140207e-05, "loss": 0.384, "mean_token_accuracy": 0.8928723320364952, "num_tokens": 1952018.0, "step": 200 }, { "entropy": 1.1336687998846173, "epoch": 0.5587301587301587, "grad_norm": 1.0740300416946411, "learning_rate": 1.7304619350872992e-05, "loss": 0.355, "mean_token_accuracy": 0.8990522997453809, "num_tokens": 2146747.0, "step": 220 }, { "entropy": 1.0902384620159864, "epoch": 0.6095238095238096, "grad_norm": 1.2321722507476807, "learning_rate": 1.6705858806184933e-05, "loss": 0.3421, "mean_token_accuracy": 0.8982272742316126, "num_tokens": 2342526.0, "step": 240 }, { "entropy": 1.072350986301899, "epoch": 0.6603174603174603, "grad_norm": 1.2107694149017334, "learning_rate": 1.605980975837524e-05, "loss": 0.3293, "mean_token_accuracy": 0.9014129877090454, "num_tokens": 2537429.0, "step": 260 }, { "entropy": 1.0297158515080809, "epoch": 0.7111111111111111, "grad_norm": 1.3480980396270752, "learning_rate": 1.5371028028450152e-05, "loss": 0.3115, "mean_token_accuracy": 0.9040320562198758, "num_tokens": 2732125.0, "step": 280 }, { "entropy": 1.0070750426501036, "epoch": 0.7619047619047619, "grad_norm": 1.2340924739837646, "learning_rate": 1.4644370780559265e-05, "loss": 0.297, "mean_token_accuracy": 0.906495463848114, "num_tokens": 2926977.0, "step": 300 }, { "epoch": 0.7619047619047619, "eval_entropy": 0.9912602304560798, "eval_loss": 0.28733107447624207, "eval_mean_token_accuracy": 0.9105286079645157, "eval_num_tokens": 2926977.0, "eval_runtime": 254.0073, "eval_samples_per_second": 2.756, "eval_steps_per_second": 2.756, "step": 300 }, { "entropy": 0.9841975728049874, "epoch": 0.8126984126984127, "grad_norm": 1.2695492506027222, "learning_rate": 1.3884962270152693e-05, "loss": 0.2871, "mean_token_accuracy": 0.9104899806901813, "num_tokens": 3122184.0, "step": 320 }, { "entropy": 0.9748819842934608, "epoch": 0.8634920634920635, "grad_norm": 1.4477434158325195, "learning_rate": 1.3098157708658657e-05, "loss": 0.2584, "mean_token_accuracy": 0.9179832600057125, "num_tokens": 3317637.0, "step": 340 }, { "entropy": 0.9418716534972191, "epoch": 0.9142857142857143, "grad_norm": 1.3718384504318237, "learning_rate": 1.2289505499501341e-05, "loss": 0.2509, "mean_token_accuracy": 0.9180495567619801, "num_tokens": 3513113.0, "step": 360 }, { "entropy": 0.946486484631896, "epoch": 0.9650793650793651, "grad_norm": 1.3464258909225464, "learning_rate": 1.1464708111763723e-05, "loss": 0.2477, "mean_token_accuracy": 0.9207174494862557, "num_tokens": 3707967.0, "step": 380 }, { "entropy": 0.9170130017814757, "epoch": 1.0152380952380953, "grad_norm": 1.5472830533981323, "learning_rate": 1.0629581867407241e-05, "loss": 0.2329, "mean_token_accuracy": 0.9222704160817062, "num_tokens": 3900703.0, "step": 400 }, { "entropy": 0.927550189383328, "epoch": 1.066031746031746, "grad_norm": 1.8993698358535767, "learning_rate": 9.790015925621588e-06, "loss": 0.2196, "mean_token_accuracy": 0.9251768393442035, "num_tokens": 4096441.0, "step": 420 }, { "entropy": 0.9159683456644416, "epoch": 1.116825396825397, "grad_norm": 1.489464521408081, "learning_rate": 8.951930753539521e-06, "loss": 0.2162, "mean_token_accuracy": 0.9265725754201413, "num_tokens": 4290838.0, "step": 440 }, { "entropy": 0.8982374468818307, "epoch": 1.1676190476190476, "grad_norm": 1.4005104303359985, "learning_rate": 8.121236376173745e-06, "loss": 0.205, "mean_token_accuracy": 0.9285883469507098, "num_tokens": 4485804.0, "step": 460 }, { "entropy": 0.8816795371472835, "epoch": 1.2184126984126984, "grad_norm": 1.45193612575531, "learning_rate": 7.303790699989714e-06, "loss": 0.2085, "mean_token_accuracy": 0.928891065903008, "num_tokens": 4681086.0, "step": 480 }, { "entropy": 0.8763484323397279, "epoch": 1.2692063492063492, "grad_norm": 1.3778793811798096, "learning_rate": 6.505358204009018e-06, "loss": 0.1982, "mean_token_accuracy": 0.9306870764121413, "num_tokens": 4876429.0, "step": 500 }, { "entropy": 0.8736646875739098, "epoch": 1.32, "grad_norm": 1.3831270933151245, "learning_rate": 5.731569289746193e-06, "loss": 0.1897, "mean_token_accuracy": 0.9305133303627372, "num_tokens": 5071394.0, "step": 520 }, { "entropy": 0.868809700757265, "epoch": 1.370793650793651, "grad_norm": 1.8496570587158203, "learning_rate": 4.98788057663585e-06, "loss": 0.1931, "mean_token_accuracy": 0.9316842250525952, "num_tokens": 5265948.0, "step": 540 }, { "entropy": 0.8576494121924043, "epoch": 1.4215873015873015, "grad_norm": 1.5876587629318237, "learning_rate": 4.279536422939606e-06, "loss": 0.1873, "mean_token_accuracy": 0.9321241827681661, "num_tokens": 5461394.0, "step": 560 }, { "entropy": 0.854162979312241, "epoch": 1.4723809523809523, "grad_norm": 1.3804458379745483, "learning_rate": 3.6115319434803897e-06, "loss": 0.1901, "mean_token_accuracy": 0.9312955033034086, "num_tokens": 5656093.0, "step": 580 }, { "entropy": 0.8487729975953698, "epoch": 1.5231746031746032, "grad_norm": 2.6718533039093018, "learning_rate": 2.9885777849964016e-06, "loss": 0.1862, "mean_token_accuracy": 0.9315127771347761, "num_tokens": 5851549.0, "step": 600 }, { "epoch": 1.5231746031746032, "eval_entropy": 0.844009120634624, "eval_loss": 0.19104033708572388, "eval_mean_token_accuracy": 0.9312942716905049, "eval_num_tokens": 5851549.0, "eval_runtime": 253.7619, "eval_samples_per_second": 2.758, "eval_steps_per_second": 2.758, "step": 600 } ], "logging_steps": 20, "max_steps": 788, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.847129416270643e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }