{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.074157863855362, "epoch": 0.08, "grad_norm": 0.9623633623123169, "learning_rate": 0.0, "loss": 2.8972, "mean_token_accuracy": 0.4513181820511818, "num_tokens": 2908.0, "step": 1 }, { "entropy": 2.1756343841552734, "epoch": 0.16, "grad_norm": 1.0783641338348389, "learning_rate": 5e-05, "loss": 2.989, "mean_token_accuracy": 0.4384376108646393, "num_tokens": 5086.0, "step": 2 }, { "entropy": 2.1015622317790985, "epoch": 0.24, "grad_norm": 1.0787147283554077, "learning_rate": 0.0001, "loss": 2.9231, "mean_token_accuracy": 0.4472825303673744, "num_tokens": 6703.0, "step": 3 }, { "entropy": 2.318566679954529, "epoch": 0.32, "grad_norm": 1.0077248811721802, "learning_rate": 9.993784606094612e-05, "loss": 2.9968, "mean_token_accuracy": 0.4545922800898552, "num_tokens": 9075.0, "step": 4 }, { "entropy": 2.267362117767334, "epoch": 0.4, "grad_norm": 0.8960147500038147, "learning_rate": 9.975153876827008e-05, "loss": 2.7756, "mean_token_accuracy": 0.46887098997831345, "num_tokens": 11207.0, "step": 5 }, { "entropy": 2.5053298473358154, "epoch": 0.48, "grad_norm": 0.9175477027893066, "learning_rate": 9.944154131125642e-05, "loss": 2.8334, "mean_token_accuracy": 0.4478325620293617, "num_tokens": 13013.0, "step": 6 }, { "entropy": 2.5062224864959717, "epoch": 0.56, "grad_norm": 0.739920973777771, "learning_rate": 9.900862439242719e-05, "loss": 2.674, "mean_token_accuracy": 0.48493383079767227, "num_tokens": 15798.0, "step": 7 }, { "entropy": 2.543445646762848, "epoch": 0.64, "grad_norm": 0.8454245924949646, "learning_rate": 9.84538643114539e-05, "loss": 2.6321, "mean_token_accuracy": 0.4908815994858742, "num_tokens": 17899.0, "step": 8 }, { "entropy": 2.6626943945884705, "epoch": 0.72, "grad_norm": 0.9240928888320923, "learning_rate": 9.777864028930705e-05, "loss": 2.7066, "mean_token_accuracy": 0.45207490772008896, "num_tokens": 19457.0, "step": 9 }, { "entropy": 2.5745842456817627, "epoch": 0.8, "grad_norm": 0.7333309054374695, "learning_rate": 9.698463103929542e-05, "loss": 2.3685, "mean_token_accuracy": 0.5086618214845657, "num_tokens": 22198.0, "step": 10 }, { "entropy": 2.744494140148163, "epoch": 0.88, "grad_norm": 0.9435456395149231, "learning_rate": 9.607381059352038e-05, "loss": 2.4654, "mean_token_accuracy": 0.5047935917973518, "num_tokens": 24508.0, "step": 11 }, { "entropy": 2.7561975717544556, "epoch": 0.96, "grad_norm": 0.9667397737503052, "learning_rate": 9.504844339512095e-05, "loss": 2.4149, "mean_token_accuracy": 0.5050082951784134, "num_tokens": 26458.0, "step": 12 }, { "entropy": 2.7281463146209717, "epoch": 1.0, "grad_norm": 1.060321569442749, "learning_rate": 9.391107866851143e-05, "loss": 2.5978, "mean_token_accuracy": 0.47450077533721924, "num_tokens": 27555.0, "step": 13 }, { "entropy": 2.523155093193054, "epoch": 1.08, "grad_norm": 0.7708236575126648, "learning_rate": 9.266454408160779e-05, "loss": 2.2798, "mean_token_accuracy": 0.5204823762178421, "num_tokens": 30557.0, "step": 14 }, { "entropy": 2.4740833044052124, "epoch": 1.16, "grad_norm": 0.8472949266433716, "learning_rate": 9.131193871579975e-05, "loss": 2.145, "mean_token_accuracy": 0.5561789870262146, "num_tokens": 32709.0, "step": 15 }, { "entropy": 2.564497411251068, "epoch": 1.24, "grad_norm": 0.9041805267333984, "learning_rate": 8.985662536114613e-05, "loss": 2.5184, "mean_token_accuracy": 0.4873481020331383, "num_tokens": 34383.0, "step": 16 }, { "entropy": 2.283787727355957, "epoch": 1.32, "grad_norm": 0.880549967288971, "learning_rate": 8.83022221559489e-05, "loss": 2.0957, "mean_token_accuracy": 0.5535659641027451, "num_tokens": 36823.0, "step": 17 }, { "entropy": 2.2330291867256165, "epoch": 1.4, "grad_norm": 0.9609613418579102, "learning_rate": 8.665259359149132e-05, "loss": 2.0666, "mean_token_accuracy": 0.5471859276294708, "num_tokens": 38982.0, "step": 18 }, { "entropy": 2.1904984414577484, "epoch": 1.48, "grad_norm": 1.1095411777496338, "learning_rate": 8.491184090430364e-05, "loss": 2.3007, "mean_token_accuracy": 0.5376310795545578, "num_tokens": 40570.0, "step": 19 }, { "entropy": 2.1939757764339447, "epoch": 1.56, "grad_norm": 0.9360744953155518, "learning_rate": 8.308429187984297e-05, "loss": 2.2015, "mean_token_accuracy": 0.5384656116366386, "num_tokens": 43213.0, "step": 20 }, { "entropy": 2.0095443725585938, "epoch": 1.6400000000000001, "grad_norm": 0.9957161545753479, "learning_rate": 8.117449009293668e-05, "loss": 2.01, "mean_token_accuracy": 0.5757630318403244, "num_tokens": 45424.0, "step": 21 }, { "entropy": 2.056138187646866, "epoch": 1.72, "grad_norm": 1.2742964029312134, "learning_rate": 7.91871836117395e-05, "loss": 2.1542, "mean_token_accuracy": 0.5374337062239647, "num_tokens": 47023.0, "step": 22 }, { "entropy": 1.9765421450138092, "epoch": 1.8, "grad_norm": 0.9705051183700562, "learning_rate": 7.712731319328798e-05, "loss": 2.0624, "mean_token_accuracy": 0.5524236708879471, "num_tokens": 49784.0, "step": 23 }, { "entropy": 1.8856642246246338, "epoch": 1.88, "grad_norm": 1.0250470638275146, "learning_rate": 7.500000000000001e-05, "loss": 1.9065, "mean_token_accuracy": 0.5826538950204849, "num_tokens": 52011.0, "step": 24 }, { "entropy": 2.021214246749878, "epoch": 1.96, "grad_norm": 1.2443675994873047, "learning_rate": 7.281053286765815e-05, "loss": 2.1271, "mean_token_accuracy": 0.5555168986320496, "num_tokens": 53916.0, "step": 25 }, { "entropy": 2.0159093737602234, "epoch": 2.0, "grad_norm": 1.4712964296340942, "learning_rate": 7.056435515653059e-05, "loss": 2.0417, "mean_token_accuracy": 0.5624207556247711, "num_tokens": 55110.0, "step": 26 }, { "entropy": 1.917511224746704, "epoch": 2.08, "grad_norm": 0.9676669836044312, "learning_rate": 6.826705121831976e-05, "loss": 1.7429, "mean_token_accuracy": 0.6025293320417404, "num_tokens": 57944.0, "step": 27 }, { "entropy": 1.9889066219329834, "epoch": 2.16, "grad_norm": 1.1242130994796753, "learning_rate": 6.592433251258423e-05, "loss": 1.7523, "mean_token_accuracy": 0.59645976126194, "num_tokens": 59993.0, "step": 28 }, { "entropy": 2.039870023727417, "epoch": 2.24, "grad_norm": 1.4665213823318481, "learning_rate": 6.354202340715026e-05, "loss": 2.1926, "mean_token_accuracy": 0.545217253267765, "num_tokens": 61625.0, "step": 29 }, { "entropy": 1.9573059976100922, "epoch": 2.32, "grad_norm": 1.017709732055664, "learning_rate": 6.112604669781572e-05, "loss": 1.8044, "mean_token_accuracy": 0.5996322631835938, "num_tokens": 64409.0, "step": 30 }, { "entropy": 1.8920992612838745, "epoch": 2.4, "grad_norm": 1.0838905572891235, "learning_rate": 5.868240888334653e-05, "loss": 1.7022, "mean_token_accuracy": 0.6239200979471207, "num_tokens": 66753.0, "step": 31 }, { "entropy": 2.0309515595436096, "epoch": 2.48, "grad_norm": 1.3521836996078491, "learning_rate": 5.621718523237427e-05, "loss": 1.9173, "mean_token_accuracy": 0.5637946575880051, "num_tokens": 68494.0, "step": 32 }, { "entropy": 1.9445721805095673, "epoch": 2.56, "grad_norm": 1.1347510814666748, "learning_rate": 5.373650467932122e-05, "loss": 1.8014, "mean_token_accuracy": 0.5944488346576691, "num_tokens": 71033.0, "step": 33 }, { "entropy": 1.884565383195877, "epoch": 2.64, "grad_norm": 1.1540933847427368, "learning_rate": 5.124653458690365e-05, "loss": 1.7365, "mean_token_accuracy": 0.6244822144508362, "num_tokens": 73249.0, "step": 34 }, { "entropy": 1.9037934839725494, "epoch": 2.7199999999999998, "grad_norm": 1.317600965499878, "learning_rate": 4.875346541309637e-05, "loss": 1.7379, "mean_token_accuracy": 0.6134182661771774, "num_tokens": 74994.0, "step": 35 }, { "entropy": 1.864166796207428, "epoch": 2.8, "grad_norm": 1.159854531288147, "learning_rate": 4.626349532067879e-05, "loss": 1.8124, "mean_token_accuracy": 0.6027743518352509, "num_tokens": 77504.0, "step": 36 }, { "entropy": 1.730625718832016, "epoch": 2.88, "grad_norm": 1.2109590768814087, "learning_rate": 4.378281476762576e-05, "loss": 1.6053, "mean_token_accuracy": 0.6425295174121857, "num_tokens": 79750.0, "step": 37 }, { "entropy": 1.8132167756557465, "epoch": 2.96, "grad_norm": 1.3908966779708862, "learning_rate": 4.131759111665349e-05, "loss": 1.8062, "mean_token_accuracy": 0.5964024662971497, "num_tokens": 81441.0, "step": 38 }, { "entropy": 1.6718182563781738, "epoch": 3.0, "grad_norm": 1.6227182149887085, "learning_rate": 3.887395330218429e-05, "loss": 1.5381, "mean_token_accuracy": 0.6420005559921265, "num_tokens": 82665.0, "step": 39 } ], "logging_steps": 1, "max_steps": 65, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3744731166105600.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }