212 lines
4.7 KiB
JSON
212 lines
4.7 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 9.24,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 120,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.4,
|
||
|
|
"grad_norm": 3366.383771014986,
|
||
|
|
"learning_rate": 3.3333333333333333e-06,
|
||
|
|
"loss": 50.181,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 138.84039031345438,
|
||
|
|
"learning_rate": 7.500000000000001e-06,
|
||
|
|
"loss": 9.0003,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.16,
|
||
|
|
"grad_norm": 31.069992807592296,
|
||
|
|
"learning_rate": 9.814814814814815e-06,
|
||
|
|
"loss": 2.0675,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.56,
|
||
|
|
"grad_norm": 38.6699521302523,
|
||
|
|
"learning_rate": 9.351851851851854e-06,
|
||
|
|
"loss": 2.348,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.96,
|
||
|
|
"grad_norm": 31.534923460458895,
|
||
|
|
"learning_rate": 8.888888888888888e-06,
|
||
|
|
"loss": 2.7579,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.32,
|
||
|
|
"grad_norm": 35.62793738569212,
|
||
|
|
"learning_rate": 8.425925925925926e-06,
|
||
|
|
"loss": 2.6538,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7199999999999998,
|
||
|
|
"grad_norm": 43.09111169943847,
|
||
|
|
"learning_rate": 7.962962962962963e-06,
|
||
|
|
"loss": 3.0774,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.08,
|
||
|
|
"grad_norm": 31.55815735622089,
|
||
|
|
"learning_rate": 7.500000000000001e-06,
|
||
|
|
"loss": 2.5569,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.48,
|
||
|
|
"grad_norm": 32.987381074140224,
|
||
|
|
"learning_rate": 7.0370370370370375e-06,
|
||
|
|
"loss": 2.691,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.88,
|
||
|
|
"grad_norm": 30.799908902389816,
|
||
|
|
"learning_rate": 6.574074074074075e-06,
|
||
|
|
"loss": 2.725,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.24,
|
||
|
|
"grad_norm": 30.428695432735317,
|
||
|
|
"learning_rate": 6.111111111111112e-06,
|
||
|
|
"loss": 2.1202,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.64,
|
||
|
|
"grad_norm": 29.827204683187425,
|
||
|
|
"learning_rate": 5.6481481481481485e-06,
|
||
|
|
"loss": 2.3278,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 33.33785124405715,
|
||
|
|
"learning_rate": 5.185185185185185e-06,
|
||
|
|
"loss": 2.1507,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.4,
|
||
|
|
"grad_norm": 25.114796048971975,
|
||
|
|
"learning_rate": 4.722222222222222e-06,
|
||
|
|
"loss": 2.0327,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.8,
|
||
|
|
"grad_norm": 22.363021156501855,
|
||
|
|
"learning_rate": 4.2592592592592596e-06,
|
||
|
|
"loss": 2.0363,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.16,
|
||
|
|
"grad_norm": 27.92963131520657,
|
||
|
|
"learning_rate": 3.796296296296297e-06,
|
||
|
|
"loss": 1.7444,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.5600000000000005,
|
||
|
|
"grad_norm": 24.85153444942929,
|
||
|
|
"learning_rate": 3.3333333333333333e-06,
|
||
|
|
"loss": 1.9417,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.96,
|
||
|
|
"grad_norm": 24.5998271042858,
|
||
|
|
"learning_rate": 2.8703703703703706e-06,
|
||
|
|
"loss": 1.9907,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.32,
|
||
|
|
"grad_norm": 19.476935749731375,
|
||
|
|
"learning_rate": 2.4074074074074075e-06,
|
||
|
|
"loss": 1.6776,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.72,
|
||
|
|
"grad_norm": 32.61686883093416,
|
||
|
|
"learning_rate": 1.944444444444445e-06,
|
||
|
|
"loss": 2.0266,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 8.08,
|
||
|
|
"grad_norm": 22.893625227037738,
|
||
|
|
"learning_rate": 1.4814814814814815e-06,
|
||
|
|
"loss": 1.7539,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 8.48,
|
||
|
|
"grad_norm": 29.10691686568226,
|
||
|
|
"learning_rate": 1.0185185185185185e-06,
|
||
|
|
"loss": 2.2201,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 8.88,
|
||
|
|
"grad_norm": 25.9922847192214,
|
||
|
|
"learning_rate": 5.555555555555555e-07,
|
||
|
|
"loss": 1.9925,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 9.24,
|
||
|
|
"grad_norm": 25.22158299607113,
|
||
|
|
"learning_rate": 9.259259259259259e-08,
|
||
|
|
"loss": 1.8082,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 9.24,
|
||
|
|
"step": 120,
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_loss": 4.495095912615458,
|
||
|
|
"train_runtime": 3832.3964,
|
||
|
|
"train_samples_per_second": 1.044,
|
||
|
|
"train_steps_per_second": 0.031
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 120,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": false,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_batch_size": 4,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|