{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.5231746031746032,
  "eval_steps": 300,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.5461102724075317,
      "epoch": 0.0025396825396825397,
      "grad_norm": 3.896618366241455,
      "learning_rate": 0.0,
      "loss": 2.3711,
      "mean_token_accuracy": 0.6472751796245575,
      "num_tokens": 9621.0,
      "step": 1
    },
    {
      "entropy": 1.581420679233576,
      "epoch": 0.050793650793650794,
      "grad_norm": 2.02801251411438,
      "learning_rate": 9.5e-06,
      "loss": 1.9565,
      "mean_token_accuracy": 0.6899475496458379,
      "num_tokens": 194969.0,
      "step": 20
    },
    {
      "entropy": 1.8750008303672074,
      "epoch": 0.10158730158730159,
      "grad_norm": 0.83933025598526,
      "learning_rate": 1.95e-05,
      "loss": 1.4223,
      "mean_token_accuracy": 0.7304104179143905,
      "num_tokens": 390209.0,
      "step": 40
    },
    {
      "entropy": 1.868852799385786,
      "epoch": 0.1523809523809524,
      "grad_norm": 0.7984667420387268,
      "learning_rate": 1.9968176841806687e-05,
      "loss": 0.9479,
      "mean_token_accuracy": 0.7837442796677351,
      "num_tokens": 585927.0,
      "step": 60
    },
    {
      "entropy": 1.635760549083352,
      "epoch": 0.20317460317460317,
      "grad_norm": 0.7746185660362244,
      "learning_rate": 1.9866148103359362e-05,
      "loss": 0.6905,
      "mean_token_accuracy": 0.8289982877671719,
      "num_tokens": 781164.0,
      "step": 80
    },
    {
      "entropy": 1.4979693002998828,
      "epoch": 0.25396825396825395,
      "grad_norm": 0.7985681891441345,
      "learning_rate": 1.9694545073405348e-05,
      "loss": 0.5682,
      "mean_token_accuracy": 0.8550767470151186,
      "num_tokens": 976497.0,
      "step": 100
    },
    {
      "entropy": 1.3932939764112233,
      "epoch": 0.3047619047619048,
      "grad_norm": 0.8087366223335266,
      "learning_rate": 1.94545778654666e-05,
      "loss": 0.5126,
      "mean_token_accuracy": 0.8678769677877426,
      "num_tokens": 1171938.0,
      "step": 120
    },
    {
      "entropy": 1.3521239839494228,
      "epoch": 0.35555555555555557,
      "grad_norm": 1.0716849565505981,
      "learning_rate": 1.9147938684880213e-05,
      "loss": 0.4679,
      "mean_token_accuracy": 0.8757014229893685,
      "num_tokens": 1367065.0,
      "step": 140
    },
    {
      "entropy": 1.276645314320922,
      "epoch": 0.40634920634920635,
      "grad_norm": 1.04561448097229,
      "learning_rate": 1.8776789895672557e-05,
      "loss": 0.4319,
      "mean_token_accuracy": 0.8847867721691728,
      "num_tokens": 1562066.0,
      "step": 160
    },
    {
      "entropy": 1.237958626449108,
      "epoch": 0.45714285714285713,
      "grad_norm": 0.78230220079422,
      "learning_rate": 1.8343748771959346e-05,
      "loss": 0.4182,
      "mean_token_accuracy": 0.8867125200107694,
      "num_tokens": 1756828.0,
      "step": 180
    },
    {
      "entropy": 1.2167125567793846,
      "epoch": 0.5079365079365079,
      "grad_norm": 1.3130111694335938,
      "learning_rate": 1.785186904140207e-05,
      "loss": 0.384,
      "mean_token_accuracy": 0.8928723320364952,
      "num_tokens": 1952018.0,
      "step": 200
    },
    {
      "entropy": 1.1336687998846173,
      "epoch": 0.5587301587301587,
      "grad_norm": 1.0740300416946411,
      "learning_rate": 1.7304619350872992e-05,
      "loss": 0.355,
      "mean_token_accuracy": 0.8990522997453809,
      "num_tokens": 2146747.0,
      "step": 220
    },
    {
      "entropy": 1.0902384620159864,
      "epoch": 0.6095238095238096,
      "grad_norm": 1.2321722507476807,
      "learning_rate": 1.6705858806184933e-05,
      "loss": 0.3421,
      "mean_token_accuracy": 0.8982272742316126,
      "num_tokens": 2342526.0,
      "step": 240
    },
    {
      "entropy": 1.072350986301899,
      "epoch": 0.6603174603174603,
      "grad_norm": 1.2107694149017334,
      "learning_rate": 1.605980975837524e-05,
      "loss": 0.3293,
      "mean_token_accuracy": 0.9014129877090454,
      "num_tokens": 2537429.0,
      "step": 260
    },
    {
      "entropy": 1.0297158515080809,
      "epoch": 0.7111111111111111,
      "grad_norm": 1.3480980396270752,
      "learning_rate": 1.5371028028450152e-05,
      "loss": 0.3115,
      "mean_token_accuracy": 0.9040320562198758,
      "num_tokens": 2732125.0,
      "step": 280
    },
    {
      "entropy": 1.0070750426501036,
      "epoch": 0.7619047619047619,
      "grad_norm": 1.2340924739837646,
      "learning_rate": 1.4644370780559265e-05,
      "loss": 0.297,
      "mean_token_accuracy": 0.906495463848114,
      "num_tokens": 2926977.0,
      "step": 300
    },
    {
      "epoch": 0.7619047619047619,
      "eval_entropy": 0.9912602304560798,
      "eval_loss": 0.28733107447624207,
      "eval_mean_token_accuracy": 0.9105286079645157,
      "eval_num_tokens": 2926977.0,
      "eval_runtime": 254.0073,
      "eval_samples_per_second": 2.756,
      "eval_steps_per_second": 2.756,
      "step": 300
    },
    {
      "entropy": 0.9841975728049874,
      "epoch": 0.8126984126984127,
      "grad_norm": 1.2695492506027222,
      "learning_rate": 1.3884962270152693e-05,
      "loss": 0.2871,
      "mean_token_accuracy": 0.9104899806901813,
      "num_tokens": 3122184.0,
      "step": 320
    },
    {
      "entropy": 0.9748819842934608,
      "epoch": 0.8634920634920635,
      "grad_norm": 1.4477434158325195,
      "learning_rate": 1.3098157708658657e-05,
      "loss": 0.2584,
      "mean_token_accuracy": 0.9179832600057125,
      "num_tokens": 3317637.0,
      "step": 340
    },
    {
      "entropy": 0.9418716534972191,
      "epoch": 0.9142857142857143,
      "grad_norm": 1.3718384504318237,
      "learning_rate": 1.2289505499501341e-05,
      "loss": 0.2509,
      "mean_token_accuracy": 0.9180495567619801,
      "num_tokens": 3513113.0,
      "step": 360
    },
    {
      "entropy": 0.946486484631896,
      "epoch": 0.9650793650793651,
      "grad_norm": 1.3464258909225464,
      "learning_rate": 1.1464708111763723e-05,
      "loss": 0.2477,
      "mean_token_accuracy": 0.9207174494862557,
      "num_tokens": 3707967.0,
      "step": 380
    },
    {
      "entropy": 0.9170130017814757,
      "epoch": 1.0152380952380953,
      "grad_norm": 1.5472830533981323,
      "learning_rate": 1.0629581867407241e-05,
      "loss": 0.2329,
      "mean_token_accuracy": 0.9222704160817062,
      "num_tokens": 3900703.0,
      "step": 400
    },
    {
      "entropy": 0.927550189383328,
      "epoch": 1.066031746031746,
      "grad_norm": 1.8993698358535767,
      "learning_rate": 9.790015925621588e-06,
      "loss": 0.2196,
      "mean_token_accuracy": 0.9251768393442035,
      "num_tokens": 4096441.0,
      "step": 420
    },
    {
      "entropy": 0.9159683456644416,
      "epoch": 1.116825396825397,
      "grad_norm": 1.489464521408081,
      "learning_rate": 8.951930753539521e-06,
      "loss": 0.2162,
      "mean_token_accuracy": 0.9265725754201413,
      "num_tokens": 4290838.0,
      "step": 440
    },
    {
      "entropy": 0.8982374468818307,
      "epoch": 1.1676190476190476,
      "grad_norm": 1.4005104303359985,
      "learning_rate": 8.121236376173745e-06,
      "loss": 0.205,
      "mean_token_accuracy": 0.9285883469507098,
      "num_tokens": 4485804.0,
      "step": 460
    },
    {
      "entropy": 0.8816795371472835,
      "epoch": 1.2184126984126984,
      "grad_norm": 1.45193612575531,
      "learning_rate": 7.303790699989714e-06,
      "loss": 0.2085,
      "mean_token_accuracy": 0.928891065903008,
      "num_tokens": 4681086.0,
      "step": 480
    },
    {
      "entropy": 0.8763484323397279,
      "epoch": 1.2692063492063492,
      "grad_norm": 1.3778793811798096,
      "learning_rate": 6.505358204009018e-06,
      "loss": 0.1982,
      "mean_token_accuracy": 0.9306870764121413,
      "num_tokens": 4876429.0,
      "step": 500
    },
    {
      "entropy": 0.8736646875739098,
      "epoch": 1.32,
      "grad_norm": 1.3831270933151245,
      "learning_rate": 5.731569289746193e-06,
      "loss": 0.1897,
      "mean_token_accuracy": 0.9305133303627372,
      "num_tokens": 5071394.0,
      "step": 520
    },
    {
      "entropy": 0.868809700757265,
      "epoch": 1.370793650793651,
      "grad_norm": 1.8496570587158203,
      "learning_rate": 4.98788057663585e-06,
      "loss": 0.1931,
      "mean_token_accuracy": 0.9316842250525952,
      "num_tokens": 5265948.0,
      "step": 540
    },
    {
      "entropy": 0.8576494121924043,
      "epoch": 1.4215873015873015,
      "grad_norm": 1.5876587629318237,
      "learning_rate": 4.279536422939606e-06,
      "loss": 0.1873,
      "mean_token_accuracy": 0.9321241827681661,
      "num_tokens": 5461394.0,
      "step": 560
    },
    {
      "entropy": 0.854162979312241,
      "epoch": 1.4723809523809523,
      "grad_norm": 1.3804458379745483,
      "learning_rate": 3.6115319434803897e-06,
      "loss": 0.1901,
      "mean_token_accuracy": 0.9312955033034086,
      "num_tokens": 5656093.0,
      "step": 580
    },
    {
      "entropy": 0.8487729975953698,
      "epoch": 1.5231746031746032,
      "grad_norm": 2.6718533039093018,
      "learning_rate": 2.9885777849964016e-06,
      "loss": 0.1862,
      "mean_token_accuracy": 0.9315127771347761,
      "num_tokens": 5851549.0,
      "step": 600
    },
    {
      "epoch": 1.5231746031746032,
      "eval_entropy": 0.844009120634624,
      "eval_loss": 0.19104033708572388,
      "eval_mean_token_accuracy": 0.9312942716905049,
      "eval_num_tokens": 5851549.0,
      "eval_runtime": 253.7619,
      "eval_samples_per_second": 2.758,
      "eval_steps_per_second": 2.758,
      "step": 600
    }
  ],
  "logging_steps": 20,
  "max_steps": 788,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.847129416270643e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}