278 lines
7.4 KiB
JSON
278 lines
7.4 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 18.75,
|
|
"eval_steps": 500,
|
|
"global_step": 1350,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.6944444444444444,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 9.94575799721836e-06,
|
|
"loss": 1.9929,
|
|
"mean_token_accuracy": 0.5180467286705971,
|
|
"num_tokens": 565403.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 1.3888888888888888,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 9.876216968011127e-06,
|
|
"loss": 1.7844,
|
|
"mean_token_accuracy": 0.5439706787467002,
|
|
"num_tokens": 1137207.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 2.0833333333333335,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 9.806675938803894e-06,
|
|
"loss": 1.7471,
|
|
"mean_token_accuracy": 0.5503649765253067,
|
|
"num_tokens": 1691787.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 2.7777777777777777,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 9.737134909596663e-06,
|
|
"loss": 1.6946,
|
|
"mean_token_accuracy": 0.558834604024887,
|
|
"num_tokens": 2259114.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 3.4722222222222223,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 9.667593880389431e-06,
|
|
"loss": 1.6698,
|
|
"mean_token_accuracy": 0.5630534660816192,
|
|
"num_tokens": 2824656.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 4.166666666666667,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 9.598052851182198e-06,
|
|
"loss": 1.6426,
|
|
"mean_token_accuracy": 0.5657697267830372,
|
|
"num_tokens": 3383725.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 4.861111111111111,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 9.528511821974965e-06,
|
|
"loss": 1.6163,
|
|
"mean_token_accuracy": 0.572148412913084,
|
|
"num_tokens": 3951935.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 5.555555555555555,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 9.458970792767734e-06,
|
|
"loss": 1.5929,
|
|
"mean_token_accuracy": 0.5758887875080109,
|
|
"num_tokens": 4517240.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 6.25,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 9.389429763560501e-06,
|
|
"loss": 1.5578,
|
|
"mean_token_accuracy": 0.5819848603010178,
|
|
"num_tokens": 5081608.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 6.944444444444445,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 9.31988873435327e-06,
|
|
"loss": 1.5426,
|
|
"mean_token_accuracy": 0.5854928362369537,
|
|
"num_tokens": 5642941.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 7.638888888888889,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 9.250347705146037e-06,
|
|
"loss": 1.5061,
|
|
"mean_token_accuracy": 0.5927784067392349,
|
|
"num_tokens": 6209257.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 8.333333333333334,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 9.180806675938806e-06,
|
|
"loss": 1.5025,
|
|
"mean_token_accuracy": 0.5923233330249786,
|
|
"num_tokens": 6774284.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 9.027777777777779,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 9.111265646731573e-06,
|
|
"loss": 1.477,
|
|
"mean_token_accuracy": 0.597908786535263,
|
|
"num_tokens": 7336223.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 9.722222222222221,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 9.04172461752434e-06,
|
|
"loss": 1.4452,
|
|
"mean_token_accuracy": 0.6044489535689354,
|
|
"num_tokens": 7894148.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 10.416666666666666,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 8.972183588317108e-06,
|
|
"loss": 1.4268,
|
|
"mean_token_accuracy": 0.6078107115626336,
|
|
"num_tokens": 8463546.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 11.11111111111111,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 8.902642559109875e-06,
|
|
"loss": 1.4126,
|
|
"mean_token_accuracy": 0.6102430355548859,
|
|
"num_tokens": 9025580.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 11.805555555555555,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 8.833101529902644e-06,
|
|
"loss": 1.3873,
|
|
"mean_token_accuracy": 0.6162240096926689,
|
|
"num_tokens": 9591962.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 12.5,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 8.763560500695411e-06,
|
|
"loss": 1.3538,
|
|
"mean_token_accuracy": 0.6232619461417198,
|
|
"num_tokens": 10156358.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 13.194444444444445,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 8.694019471488178e-06,
|
|
"loss": 1.3373,
|
|
"mean_token_accuracy": 0.6272177976369858,
|
|
"num_tokens": 10723149.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 13.88888888888889,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 8.624478442280947e-06,
|
|
"loss": 1.3169,
|
|
"mean_token_accuracy": 0.6308149287104606,
|
|
"num_tokens": 11281835.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 14.583333333333334,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 8.554937413073714e-06,
|
|
"loss": 1.2865,
|
|
"mean_token_accuracy": 0.6381412792205811,
|
|
"num_tokens": 11846572.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 15.277777777777779,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 8.485396383866483e-06,
|
|
"loss": 1.261,
|
|
"mean_token_accuracy": 0.6445417484641075,
|
|
"num_tokens": 12410336.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 15.972222222222221,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 8.41585535465925e-06,
|
|
"loss": 1.245,
|
|
"mean_token_accuracy": 0.6479020461440086,
|
|
"num_tokens": 12973627.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 16.666666666666668,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 8.346314325452017e-06,
|
|
"loss": 1.2052,
|
|
"mean_token_accuracy": 0.6577865305542946,
|
|
"num_tokens": 13538332.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 17.36111111111111,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 8.276773296244786e-06,
|
|
"loss": 1.1764,
|
|
"mean_token_accuracy": 0.6647191798686981,
|
|
"num_tokens": 14102319.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 18.055555555555557,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 8.207232267037553e-06,
|
|
"loss": 1.1584,
|
|
"mean_token_accuracy": 0.6698976960778237,
|
|
"num_tokens": 14671757.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 18.75,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 8.137691237830321e-06,
|
|
"loss": 1.1171,
|
|
"mean_token_accuracy": 0.6794905725121498,
|
|
"num_tokens": 15232777.0,
|
|
"step": 1350
|
|
}
|
|
],
|
|
"logging_steps": 50,
|
|
"max_steps": 7200,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 100,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.561784415232942e+17,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|