{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 287,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017452006980802792,
      "grad_norm": 18.758426666259766,
      "learning_rate": 2.7586206896551725e-06,
      "loss": 1.2108,
      "step": 5
    },
    {
      "epoch": 0.034904013961605584,
      "grad_norm": 0.6289834976196289,
      "learning_rate": 6.206896551724138e-06,
      "loss": 0.2604,
      "step": 10
    },
    {
      "epoch": 0.05235602094240838,
      "grad_norm": 0.7779368758201599,
      "learning_rate": 9.655172413793105e-06,
      "loss": 0.0645,
      "step": 15
    },
    {
      "epoch": 0.06980802792321117,
      "grad_norm": 0.7711329460144043,
      "learning_rate": 1.310344827586207e-05,
      "loss": 0.0643,
      "step": 20
    },
    {
      "epoch": 0.08726003490401396,
      "grad_norm": 1.1758290529251099,
      "learning_rate": 1.6551724137931037e-05,
      "loss": 0.0682,
      "step": 25
    },
    {
      "epoch": 0.10471204188481675,
      "grad_norm": 0.20803742110729218,
      "learning_rate": 2e-05,
      "loss": 0.0648,
      "step": 30
    },
    {
      "epoch": 0.12216404886561955,
      "grad_norm": 0.3231872320175171,
      "learning_rate": 1.998147167378645e-05,
      "loss": 0.0638,
      "step": 35
    },
    {
      "epoch": 0.13961605584642234,
      "grad_norm": 0.20903366804122925,
      "learning_rate": 1.9925955354920265e-05,
      "loss": 0.0631,
      "step": 40
    },
    {
      "epoch": 0.15706806282722513,
      "grad_norm": 0.1551412045955658,
      "learning_rate": 1.983365676829466e-05,
      "loss": 0.0637,
      "step": 45
    },
    {
      "epoch": 0.17452006980802792,
      "grad_norm": 0.10300405323505402,
      "learning_rate": 1.9704917941574053e-05,
      "loss": 0.0633,
      "step": 50
    },
    {
      "epoch": 0.19197207678883071,
      "grad_norm": 0.05463937669992447,
      "learning_rate": 1.954021593775401e-05,
      "loss": 0.0634,
      "step": 55
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 0.05097668617963791,
      "learning_rate": 1.9340161087325483e-05,
      "loss": 0.0637,
      "step": 60
    },
    {
      "epoch": 0.2268760907504363,
      "grad_norm": 0.025731965899467468,
      "learning_rate": 1.9105494726594344e-05,
      "loss": 0.0634,
      "step": 65
    },
    {
      "epoch": 0.2443280977312391,
      "grad_norm": 0.5567801594734192,
      "learning_rate": 1.8837086450537195e-05,
      "loss": 0.0643,
      "step": 70
    },
    {
      "epoch": 0.2617801047120419,
      "grad_norm": 0.07695559412240982,
      "learning_rate": 1.8535930890373467e-05,
      "loss": 0.0633,
      "step": 75
    },
    {
      "epoch": 0.2792321116928447,
      "grad_norm": 0.13339029252529144,
      "learning_rate": 1.820314402779511e-05,
      "loss": 0.0633,
      "step": 80
    },
    {
      "epoch": 0.29668411867364747,
      "grad_norm": 0.10536781698465347,
      "learning_rate": 1.7839959059512016e-05,
      "loss": 0.0638,
      "step": 85
    },
    {
      "epoch": 0.31413612565445026,
      "grad_norm": 0.12401806563138962,
      "learning_rate": 1.744772182743782e-05,
      "loss": 0.0633,
      "step": 90
    },
    {
      "epoch": 0.33158813263525305,
      "grad_norm": 0.1011064425110817,
      "learning_rate": 1.7027885831450318e-05,
      "loss": 0.0629,
      "step": 95
    },
    {
      "epoch": 0.34904013961605584,
      "grad_norm": 0.13563387095928192,
      "learning_rate": 1.658200684320748e-05,
      "loss": 0.0632,
      "step": 100
    },
    {
      "epoch": 0.36649214659685864,
      "grad_norm": 0.26744481921195984,
      "learning_rate": 1.6111737140978495e-05,
      "loss": 0.0633,
      "step": 105
    },
    {
      "epoch": 0.38394415357766143,
      "grad_norm": 0.6496581435203552,
      "learning_rate": 1.5618819386853607e-05,
      "loss": 0.0638,
      "step": 110
    },
    {
      "epoch": 0.4013961605584642,
      "grad_norm": 0.2886026203632355,
      "learning_rate": 1.5105080169021792e-05,
      "loss": 0.0637,
      "step": 115
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 0.07766488194465637,
      "learning_rate": 1.4572423233046386e-05,
      "loss": 0.064,
      "step": 120
    },
    {
      "epoch": 0.4363001745200698,
      "grad_norm": 0.152951180934906,
      "learning_rate": 1.4022822427221325e-05,
      "loss": 0.0637,
      "step": 125
    },
    {
      "epoch": 0.4537521815008726,
      "grad_norm": 0.4545815587043762,
      "learning_rate": 1.3458314388150115e-05,
      "loss": 0.0631,
      "step": 130
    },
    {
      "epoch": 0.4712041884816754,
      "grad_norm": 0.13478335738182068,
      "learning_rate": 1.2880990993652379e-05,
      "loss": 0.0627,
      "step": 135
    },
    {
      "epoch": 0.4886561954624782,
      "grad_norm": 0.45286211371421814,
      "learning_rate": 1.2292991610964902e-05,
      "loss": 0.0637,
      "step": 140
    },
    {
      "epoch": 0.506108202443281,
      "grad_norm": 0.44334903359413147,
      "learning_rate": 1.1696495168962848e-05,
      "loss": 0.0639,
      "step": 145
    },
    {
      "epoch": 0.5235602094240838,
      "grad_norm": 0.6045412421226501,
      "learning_rate": 1.1093712083778748e-05,
      "loss": 0.0644,
      "step": 150
    },
    {
      "epoch": 0.5410122164048866,
      "grad_norm": 0.5224294066429138,
      "learning_rate": 1.0486876067740253e-05,
      "loss": 0.0655,
      "step": 155
    },
    {
      "epoch": 0.5584642233856894,
      "grad_norm": 0.37020203471183777,
      "learning_rate": 9.878235851980027e-06,
      "loss": 0.0639,
      "step": 160
    },
    {
      "epoch": 0.5759162303664922,
      "grad_norm": 0.01445784978568554,
      "learning_rate": 9.270046853390924e-06,
      "loss": 0.0636,
      "step": 165
    },
    {
      "epoch": 0.5933682373472949,
      "grad_norm": 0.5739990472793579,
      "learning_rate": 8.664562816806022e-06,
      "loss": 0.0644,
      "step": 170
    },
    {
      "epoch": 0.6108202443280978,
      "grad_norm": 0.21191075444221497,
      "learning_rate": 8.064027463374702e-06,
      "loss": 0.0629,
      "step": 175
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 0.3500339686870575,
      "learning_rate": 7.470666176083193e-06,
      "loss": 0.0645,
      "step": 180
    },
    {
      "epoch": 0.6457242582897034,
      "grad_norm": 0.31313106417655945,
      "learning_rate": 6.886677753230184e-06,
      "loss": 0.0623,
      "step": 185
    },
    {
      "epoch": 0.6631762652705061,
      "grad_norm": 0.3150012791156769,
      "learning_rate": 6.314226260416383e-06,
      "loss": 0.0624,
      "step": 190
    },
    {
      "epoch": 0.680628272251309,
      "grad_norm": 0.19164550304412842,
      "learning_rate": 5.755433011241851e-06,
      "loss": 0.0621,
      "step": 195
    },
    {
      "epoch": 0.6980802792321117,
      "grad_norm": 0.448416143655777,
      "learning_rate": 5.212368706427913e-06,
      "loss": 0.0638,
      "step": 200
    },
    {
      "epoch": 0.7155322862129145,
      "grad_norm": 0.0443989560008049,
      "learning_rate": 4.687045760493468e-06,
      "loss": 0.0614,
      "step": 205
    },
    {
      "epoch": 0.7329842931937173,
      "grad_norm": 0.32341665029525757,
      "learning_rate": 4.181410844420473e-06,
      "loss": 0.0623,
      "step": 210
    },
    {
      "epoch": 0.7504363001745201,
      "grad_norm": 0.2636391222476959,
      "learning_rate": 3.6973376719429134e-06,
      "loss": 0.0604,
      "step": 215
    },
    {
      "epoch": 0.7678883071553229,
      "grad_norm": 0.27186042070388794,
      "learning_rate": 3.236620056190972e-06,
      "loss": 0.0601,
      "step": 220
    },
    {
      "epoch": 0.7853403141361257,
      "grad_norm": 0.5704047679901123,
      "learning_rate": 2.8009652624200436e-06,
      "loss": 0.0613,
      "step": 225
    },
    {
      "epoch": 0.8027923211169284,
      "grad_norm": 0.4298834204673767,
      "learning_rate": 2.3919876814572197e-06,
      "loss": 0.0592,
      "step": 230
    },
    {
      "epoch": 0.8202443280977313,
      "grad_norm": 0.08873734623193741,
      "learning_rate": 2.0112028473093294e-06,
      "loss": 0.0595,
      "step": 235
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.39123955368995667,
      "learning_rate": 1.660021821101222e-06,
      "loss": 0.0567,
      "step": 240
    },
    {
      "epoch": 0.8551483420593369,
      "grad_norm": 0.16050003468990326,
      "learning_rate": 1.339745962155613e-06,
      "loss": 0.0571,
      "step": 245
    },
    {
      "epoch": 0.8726003490401396,
      "grad_norm": 0.12748093903064728,
      "learning_rate": 1.051562105591082e-06,
      "loss": 0.0607,
      "step": 250
    },
    {
      "epoch": 0.8900523560209425,
      "grad_norm": 0.1128767654299736,
      "learning_rate": 7.965381643084069e-07,
      "loss": 0.0582,
      "step": 255
    },
    {
      "epoch": 0.9075043630017452,
      "grad_norm": 0.5375702381134033,
      "learning_rate": 5.756191716628556e-07,
      "loss": 0.0621,
      "step": 260
    },
    {
      "epoch": 0.924956369982548,
      "grad_norm": 0.272128164768219,
      "learning_rate": 3.8962377948693395e-07,
      "loss": 0.0579,
      "step": 265
    },
    {
      "epoch": 0.9424083769633508,
      "grad_norm": 0.12358862906694412,
      "learning_rate": 2.392412244407294e-07,
      "loss": 0.058,
      "step": 270
    },
    {
      "epoch": 0.9598603839441536,
      "grad_norm": 0.13405446708202362,
      "learning_rate": 1.2502877393158587e-07,
      "loss": 0.0592,
      "step": 275
    },
    {
      "epoch": 0.9773123909249564,
      "grad_norm": 0.12268463522195816,
      "learning_rate": 4.740966106764222e-08,
      "loss": 0.0565,
      "step": 280
    },
    {
      "epoch": 0.9947643979057592,
      "grad_norm": 0.6271886825561523,
      "learning_rate": 6.671516297606095e-09,
      "loss": 0.0593,
      "step": 285
    },
    {
      "epoch": 1.0,
      "step": 287,
      "total_flos": 3.259472961077248e+17,
      "train_loss": 0.08579332347738618,
      "train_runtime": 1625.0751,
      "train_samples_per_second": 11.273,
      "train_steps_per_second": 0.177
    }
  ],
  "logging_steps": 5,
  "max_steps": 287,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.259472961077248e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}