243 lines
6.1 KiB
JSON
243 lines
6.1 KiB
JSON
|
|
{
|
||
|
|
"best_metric": 2.2968251705169678,
|
||
|
|
"best_model_checkpoint": "./output/training_results/C018_Meta-Llama-3-8B_pretrain_20240726_033210/checkpoint-4230",
|
||
|
|
"epoch": 4.0,
|
||
|
|
"eval_steps": 470,
|
||
|
|
"global_step": 4696,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0008517887563884157,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 2.4637,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20017035775127767,
|
||
|
|
"grad_norm": 2.3409501850123138,
|
||
|
|
"learning_rate": 1.9546742209631728e-06,
|
||
|
|
"loss": 2.4082,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40034071550255534,
|
||
|
|
"grad_norm": 2.1722293582224492,
|
||
|
|
"learning_rate": 2.2631312554186003e-06,
|
||
|
|
"loss": 2.3529,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40034071550255534,
|
||
|
|
"eval_loss": 2.341965675354004,
|
||
|
|
"eval_runtime": 41.4873,
|
||
|
|
"eval_samples_per_second": 201.218,
|
||
|
|
"eval_steps_per_second": 1.591,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.600511073253833,
|
||
|
|
"grad_norm": 2.185798119323745,
|
||
|
|
"learning_rate": 1.2303591421466819e-06,
|
||
|
|
"loss": 2.3196,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8006814310051107,
|
||
|
|
"grad_norm": 1.9581500824387317,
|
||
|
|
"learning_rate": 6.559971206312988e-07,
|
||
|
|
"loss": 2.3053,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8006814310051107,
|
||
|
|
"eval_loss": 2.307070732116699,
|
||
|
|
"eval_runtime": 41.2244,
|
||
|
|
"eval_samples_per_second": 202.501,
|
||
|
|
"eval_steps_per_second": 1.601,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0008517887563884,
|
||
|
|
"grad_norm": 2.0030962313242693,
|
||
|
|
"learning_rate": 3.4801579366796346e-07,
|
||
|
|
"loss": 2.2905,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.201022146507666,
|
||
|
|
"grad_norm": 2.1511768973070087,
|
||
|
|
"learning_rate": 1.8955345667471282e-07,
|
||
|
|
"loss": 2.2195,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.201022146507666,
|
||
|
|
"eval_loss": 2.3012468814849854,
|
||
|
|
"eval_runtime": 41.1845,
|
||
|
|
"eval_samples_per_second": 202.698,
|
||
|
|
"eval_steps_per_second": 1.603,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4011925042589437,
|
||
|
|
"grad_norm": 2.038172613706977,
|
||
|
|
"learning_rate": 1.1177613622113936e-07,
|
||
|
|
"loss": 2.217,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6013628620102214,
|
||
|
|
"grad_norm": 2.6056027737544087,
|
||
|
|
"learning_rate": 7.561933429867634e-08,
|
||
|
|
"loss": 2.2134,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6013628620102214,
|
||
|
|
"eval_loss": 2.2990095615386963,
|
||
|
|
"eval_runtime": 41.2014,
|
||
|
|
"eval_samples_per_second": 202.615,
|
||
|
|
"eval_steps_per_second": 1.602,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8015332197614993,
|
||
|
|
"grad_norm": 2.3428592690655385,
|
||
|
|
"learning_rate": 5.984119005303602e-08,
|
||
|
|
"loss": 2.2142,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0017035775127767,
|
||
|
|
"grad_norm": 1.9738830418944124,
|
||
|
|
"learning_rate": 5.345074457083591e-08,
|
||
|
|
"loss": 2.2183,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0017035775127767,
|
||
|
|
"eval_loss": 2.2979490756988525,
|
||
|
|
"eval_runtime": 41.2381,
|
||
|
|
"eval_samples_per_second": 202.434,
|
||
|
|
"eval_steps_per_second": 1.6,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2018739352640546,
|
||
|
|
"grad_norm": 1.9766056487629942,
|
||
|
|
"learning_rate": 5.108344330433012e-08,
|
||
|
|
"loss": 2.1996,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.402044293015332,
|
||
|
|
"grad_norm": 1.9886986639549535,
|
||
|
|
"learning_rate": 5.0296763609045817e-08,
|
||
|
|
"loss": 2.2069,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.402044293015332,
|
||
|
|
"eval_loss": 2.2981810569763184,
|
||
|
|
"eval_runtime": 41.2271,
|
||
|
|
"eval_samples_per_second": 202.488,
|
||
|
|
"eval_steps_per_second": 1.601,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.60221465076661,
|
||
|
|
"grad_norm": 2.0358858583570556,
|
||
|
|
"learning_rate": 5.006836944156395e-08,
|
||
|
|
"loss": 2.2071,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8023850085178874,
|
||
|
|
"grad_norm": 2.0667439556087315,
|
||
|
|
"learning_rate": 5.001265655634458e-08,
|
||
|
|
"loss": 2.205,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8023850085178874,
|
||
|
|
"eval_loss": 2.2976646423339844,
|
||
|
|
"eval_runtime": 41.1148,
|
||
|
|
"eval_samples_per_second": 203.041,
|
||
|
|
"eval_steps_per_second": 1.605,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0025553662691653,
|
||
|
|
"grad_norm": 2.048334597597571,
|
||
|
|
"learning_rate": 5.000170873605877e-08,
|
||
|
|
"loss": 2.2038,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2027257240204428,
|
||
|
|
"grad_norm": 2.0830071658637626,
|
||
|
|
"learning_rate": 5.000014746665313e-08,
|
||
|
|
"loss": 2.1934,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2027257240204428,
|
||
|
|
"eval_loss": 2.2974419593811035,
|
||
|
|
"eval_runtime": 41.1847,
|
||
|
|
"eval_samples_per_second": 202.697,
|
||
|
|
"eval_steps_per_second": 1.603,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4028960817717206,
|
||
|
|
"grad_norm": 2.067015754816994,
|
||
|
|
"learning_rate": 5.000000637528681e-08,
|
||
|
|
"loss": 2.193,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6030664395229985,
|
||
|
|
"grad_norm": 2.0937714004243673,
|
||
|
|
"learning_rate": 5.000000007544082e-08,
|
||
|
|
"loss": 2.2047,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6030664395229985,
|
||
|
|
"eval_loss": 2.2968251705169678,
|
||
|
|
"eval_runtime": 41.1318,
|
||
|
|
"eval_samples_per_second": 202.957,
|
||
|
|
"eval_steps_per_second": 1.605,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.803236797274276,
|
||
|
|
"grad_norm": 2.076604473202926,
|
||
|
|
"learning_rate": 5.000000000003948e-08,
|
||
|
|
"loss": 2.2036,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0,
|
||
|
|
"step": 4696,
|
||
|
|
"total_flos": 490890602741760.0,
|
||
|
|
"train_loss": 2.2402087043862937,
|
||
|
|
"train_runtime": 7138.091,
|
||
|
|
"train_samples_per_second": 42.101,
|
||
|
|
"train_steps_per_second": 0.658
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 235,
|
||
|
|
"max_steps": 4696,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 4,
|
||
|
|
"save_steps": 470,
|
||
|
|
"total_flos": 490890602741760.0,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|