{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 387,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07759456838021339,
      "grad_norm": 1.9683642394428182,
      "learning_rate": 2.307692307692308e-06,
      "loss": 0.7343237400054932,
      "step": 10
    },
    {
      "epoch": 0.15518913676042678,
      "grad_norm": 1.4175428237350762,
      "learning_rate": 4.871794871794872e-06,
      "loss": 0.5461452007293701,
      "step": 20
    },
    {
      "epoch": 0.23278370514064015,
      "grad_norm": 0.5442834252561063,
      "learning_rate": 7.435897435897437e-06,
      "loss": 0.3490773677825928,
      "step": 30
    },
    {
      "epoch": 0.31037827352085356,
      "grad_norm": 0.32322946422972365,
      "learning_rate": 1e-05,
      "loss": 0.2592954635620117,
      "step": 40
    },
    {
      "epoch": 0.3879728419010669,
      "grad_norm": 0.24901563193155196,
      "learning_rate": 9.979639600327522e-06,
      "loss": 0.2136533737182617,
      "step": 50
    },
    {
      "epoch": 0.4655674102812803,
      "grad_norm": 0.2047675084448879,
      "learning_rate": 9.918724219660013e-06,
      "loss": 0.18301695585250854,
      "step": 60
    },
    {
      "epoch": 0.5431619786614937,
      "grad_norm": 0.1694310997257767,
      "learning_rate": 9.817749962596115e-06,
      "loss": 0.16246029138565063,
      "step": 70
    },
    {
      "epoch": 0.6207565470417071,
      "grad_norm": 0.22587456656054467,
      "learning_rate": 9.677539179628005e-06,
      "loss": 0.14934264421463012,
      "step": 80
    },
    {
      "epoch": 0.6983511154219205,
      "grad_norm": 0.22154973989105028,
      "learning_rate": 9.499233769787534e-06,
      "loss": 0.134801185131073,
      "step": 90
    },
    {
      "epoch": 0.7759456838021338,
      "grad_norm": 0.2099862635469814,
      "learning_rate": 9.284285880837947e-06,
      "loss": 0.13017673492431642,
      "step": 100
    },
    {
      "epoch": 0.8535402521823472,
      "grad_norm": 0.32230657820182124,
      "learning_rate": 9.034446082750352e-06,
      "loss": 0.12214579582214355,
      "step": 110
    },
    {
      "epoch": 0.9311348205625606,
      "grad_norm": 0.324253054340729,
      "learning_rate": 8.751749110782013e-06,
      "loss": 0.12026152610778809,
      "step": 120
    },
    {
      "epoch": 1.0077594568380213,
      "grad_norm": 0.20488241588612174,
      "learning_rate": 8.438497294267117e-06,
      "loss": 0.11126101016998291,
      "step": 130
    },
    {
      "epoch": 1.0853540252182348,
      "grad_norm": 0.20661218086124847,
      "learning_rate": 8.097241806078616e-06,
      "loss": 0.10776399374008179,
      "step": 140
    },
    {
      "epoch": 1.162948593598448,
      "grad_norm": 0.25468202960165104,
      "learning_rate": 7.730761885468486e-06,
      "loss": 0.10431833267211914,
      "step": 150
    },
    {
      "epoch": 1.2405431619786615,
      "grad_norm": 0.17930064486716413,
      "learning_rate": 7.342042203498952e-06,
      "loss": 0.10304663181304932,
      "step": 160
    },
    {
      "epoch": 1.3181377303588748,
      "grad_norm": 0.20225538073749422,
      "learning_rate": 6.934248555404197e-06,
      "loss": 0.09784629344940185,
      "step": 170
    },
    {
      "epoch": 1.3957322987390883,
      "grad_norm": 0.2256721972453044,
      "learning_rate": 6.510702077847864e-06,
      "loss": 0.09537227749824524,
      "step": 180
    },
    {
      "epoch": 1.4733268671193016,
      "grad_norm": 0.21487787771920072,
      "learning_rate": 6.074852201055121e-06,
      "loss": 0.09520423412322998,
      "step": 190
    },
    {
      "epoch": 1.5509214354995149,
      "grad_norm": 0.17540761321861204,
      "learning_rate": 5.630248556101448e-06,
      "loss": 0.09088362455368042,
      "step": 200
    },
    {
      "epoch": 1.6285160038797284,
      "grad_norm": 0.21743503130668765,
      "learning_rate": 5.180512066149682e-06,
      "loss": 0.0899280071258545,
      "step": 210
    },
    {
      "epoch": 1.706110572259942,
      "grad_norm": 0.20331687416060285,
      "learning_rate": 4.729305457072913e-06,
      "loss": 0.0881616234779358,
      "step": 220
    },
    {
      "epoch": 1.7837051406401552,
      "grad_norm": 0.15781467110120098,
      "learning_rate": 4.280303427629404e-06,
      "loss": 0.08638249635696411,
      "step": 230
    },
    {
      "epoch": 1.8612997090203685,
      "grad_norm": 0.1623620489054104,
      "learning_rate": 3.8371627221284495e-06,
      "loss": 0.08716154098510742,
      "step": 240
    },
    {
      "epoch": 1.938894277400582,
      "grad_norm": 0.15611783173066054,
      "learning_rate": 3.403492349320101e-06,
      "loss": 0.08580605983734131,
      "step": 250
    },
    {
      "epoch": 2.0155189136760425,
      "grad_norm": 0.15287072067575233,
      "learning_rate": 2.982824190050958e-06,
      "loss": 0.08316840529441834,
      "step": 260
    },
    {
      "epoch": 2.093113482056256,
      "grad_norm": 0.1853136112632167,
      "learning_rate": 2.5785842330619038e-06,
      "loss": 0.08091338872909545,
      "step": 270
    },
    {
      "epoch": 2.1707080504364695,
      "grad_norm": 0.14114872525549504,
      "learning_rate": 2.1940646731880887e-06,
      "loss": 0.08085420131683349,
      "step": 280
    },
    {
      "epoch": 2.248302618816683,
      "grad_norm": 0.13643528182686213,
      "learning_rate": 1.8323970991978823e-06,
      "loss": 0.08156624436378479,
      "step": 290
    },
    {
      "epoch": 2.325897187196896,
      "grad_norm": 0.14573681730374075,
      "learning_rate": 1.4965269896332884e-06,
      "loss": 0.0808843195438385,
      "step": 300
    },
    {
      "epoch": 2.4034917555771096,
      "grad_norm": 0.1466398992341211,
      "learning_rate": 1.1891897243618184e-06,
      "loss": 0.07979943156242371,
      "step": 310
    },
    {
      "epoch": 2.481086323957323,
      "grad_norm": 0.12798260710398743,
      "learning_rate": 9.128883072055411e-07,
      "loss": 0.08049517869949341,
      "step": 320
    },
    {
      "epoch": 2.558680892337536,
      "grad_norm": 0.13826353734235647,
      "learning_rate": 6.698729810778065e-07,
      "loss": 0.08011389374732972,
      "step": 330
    },
    {
      "epoch": 2.6362754607177497,
      "grad_norm": 0.1305401343538733,
      "learning_rate": 4.6212290164521554e-07,
      "loss": 0.08163015246391296,
      "step": 340
    },
    {
      "epoch": 2.713870029097963,
      "grad_norm": 0.12804004522045906,
      "learning_rate": 2.9133001876746004e-07,
      "loss": 0.08051948547363282,
      "step": 350
    },
    {
      "epoch": 2.7914645974781767,
      "grad_norm": 0.12808224007612634,
      "learning_rate": 1.5888529698718347e-07,
      "loss": 0.07719261646270752,
      "step": 360
    },
    {
      "epoch": 2.86905916585839,
      "grad_norm": 0.12117673381149041,
      "learning_rate": 6.58673872923693e-08,
      "loss": 0.08128957152366638,
      "step": 370
    },
    {
      "epoch": 2.946653734238603,
      "grad_norm": 0.124324493318766,
      "learning_rate": 1.3033842410251074e-08,
      "loss": 0.07743191719055176,
      "step": 380
    },
    {
      "epoch": 3.0,
      "step": 387,
      "total_flos": 3081875480379392.0,
      "train_loss": 0.06056562058377327,
      "train_runtime": 29609.547,
      "train_samples_per_second": 6.685,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 10,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3081875480379392.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}