Files
mistral-7b-base-sft-hh-harm…/trainer_state.json

417 lines
10 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998003992015968,
"eval_steps": 100,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003992015968063872,
"grad_norm": 71.5754165649414,
"learning_rate": 0.0,
"loss": 2.0095,
"step": 1
},
{
"epoch": 0.01996007984031936,
"grad_norm": 25.808263778686523,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.8226,
"step": 5
},
{
"epoch": 0.03992015968063872,
"grad_norm": 10.55435562133789,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.5338,
"step": 10
},
{
"epoch": 0.059880239520958084,
"grad_norm": 13.148735046386719,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.4367,
"step": 15
},
{
"epoch": 0.07984031936127745,
"grad_norm": 6.636435031890869,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.444,
"step": 20
},
{
"epoch": 0.0998003992015968,
"grad_norm": 4.353740215301514,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.4407,
"step": 25
},
{
"epoch": 0.11976047904191617,
"grad_norm": 3.963563919067383,
"learning_rate": 1.9984407641819812e-05,
"loss": 1.4644,
"step": 30
},
{
"epoch": 0.13972055888223553,
"grad_norm": 4.042232036590576,
"learning_rate": 1.9921147013144782e-05,
"loss": 1.4582,
"step": 35
},
{
"epoch": 0.1596806387225549,
"grad_norm": 3.6070656776428223,
"learning_rate": 1.9809551553491918e-05,
"loss": 1.461,
"step": 40
},
{
"epoch": 0.17964071856287425,
"grad_norm": 3.843057632446289,
"learning_rate": 1.9650164944723116e-05,
"loss": 1.4496,
"step": 45
},
{
"epoch": 0.1996007984031936,
"grad_norm": 3.784003734588623,
"learning_rate": 1.944376370237481e-05,
"loss": 1.4632,
"step": 50
},
{
"epoch": 0.21956087824351297,
"grad_norm": 3.471970319747925,
"learning_rate": 1.9191353392552346e-05,
"loss": 1.4363,
"step": 55
},
{
"epoch": 0.23952095808383234,
"grad_norm": 3.609161615371704,
"learning_rate": 1.889416373291298e-05,
"loss": 1.4209,
"step": 60
},
{
"epoch": 0.25948103792415167,
"grad_norm": 3.706693649291992,
"learning_rate": 1.855364260160507e-05,
"loss": 1.3991,
"step": 65
},
{
"epoch": 0.27944111776447106,
"grad_norm": 3.828991174697876,
"learning_rate": 1.8171448983351284e-05,
"loss": 1.4168,
"step": 70
},
{
"epoch": 0.2994011976047904,
"grad_norm": 3.53777813911438,
"learning_rate": 1.7749444887041797e-05,
"loss": 1.4197,
"step": 75
},
{
"epoch": 0.3193612774451098,
"grad_norm": 3.46360182762146,
"learning_rate": 1.7289686274214116e-05,
"loss": 1.4041,
"step": 80
},
{
"epoch": 0.3393213572854291,
"grad_norm": 3.3420891761779785,
"learning_rate": 1.6794413042615168e-05,
"loss": 1.361,
"step": 85
},
{
"epoch": 0.3592814371257485,
"grad_norm": 3.3036203384399414,
"learning_rate": 1.6266038113644605e-05,
"loss": 1.3671,
"step": 90
},
{
"epoch": 0.37924151696606784,
"grad_norm": 3.4878897666931152,
"learning_rate": 1.570713567684432e-05,
"loss": 1.346,
"step": 95
},
{
"epoch": 0.3992015968063872,
"grad_norm": 4.090396404266357,
"learning_rate": 1.5120428648705716e-05,
"loss": 1.3645,
"step": 100
},
{
"epoch": 0.3992015968063872,
"eval_loss": 1.3725436925888062,
"eval_runtime": 4.6422,
"eval_samples_per_second": 194.519,
"eval_steps_per_second": 6.247,
"step": 100
},
{
"epoch": 0.41916167664670656,
"grad_norm": 3.2958004474639893,
"learning_rate": 1.4508775406894308e-05,
"loss": 1.3203,
"step": 105
},
{
"epoch": 0.43912175648702595,
"grad_norm": 3.205641746520996,
"learning_rate": 1.3875155864521031e-05,
"loss": 1.3251,
"step": 110
},
{
"epoch": 0.4590818363273453,
"grad_norm": 3.419351100921631,
"learning_rate": 1.3222656952305113e-05,
"loss": 1.3093,
"step": 115
},
{
"epoch": 0.47904191616766467,
"grad_norm": 3.5063862800598145,
"learning_rate": 1.2554457579357906e-05,
"loss": 1.297,
"step": 120
},
{
"epoch": 0.499001996007984,
"grad_norm": 3.2938807010650635,
"learning_rate": 1.187381314585725e-05,
"loss": 1.2889,
"step": 125
},
{
"epoch": 0.5189620758483033,
"grad_norm": 3.2896780967712402,
"learning_rate": 1.1184039683065014e-05,
"loss": 1.2707,
"step": 130
},
{
"epoch": 0.5389221556886228,
"grad_norm": 3.1759278774261475,
"learning_rate": 1.0488497697956134e-05,
"loss": 1.2518,
"step": 135
},
{
"epoch": 0.5588822355289421,
"grad_norm": 3.616849422454834,
"learning_rate": 9.790575801166432e-06,
"loss": 1.2737,
"step": 140
},
{
"epoch": 0.5788423153692615,
"grad_norm": 3.459834098815918,
"learning_rate": 9.093674198022201e-06,
"loss": 1.2496,
"step": 145
},
{
"epoch": 0.5988023952095808,
"grad_norm": 3.072103261947632,
"learning_rate": 8.401188123081653e-06,
"loss": 1.2129,
"step": 150
},
{
"epoch": 0.6187624750499002,
"grad_norm": 3.2528676986694336,
"learning_rate": 7.716491298893443e-06,
"loss": 1.2096,
"step": 155
},
{
"epoch": 0.6387225548902196,
"grad_norm": 3.041900157928467,
"learning_rate": 7.042919499559538e-06,
"loss": 1.2171,
"step": 160
},
{
"epoch": 0.6586826347305389,
"grad_norm": 3.830709457397461,
"learning_rate": 6.383754299179079e-06,
"loss": 1.2038,
"step": 165
},
{
"epoch": 0.6786427145708582,
"grad_norm": 3.1818060874938965,
"learning_rate": 5.742207084349274e-06,
"loss": 1.1999,
"step": 170
},
{
"epoch": 0.6986027944111777,
"grad_norm": 3.237358331680298,
"learning_rate": 5.121403408612672e-06,
"loss": 1.1821,
"step": 175
},
{
"epoch": 0.718562874251497,
"grad_norm": 3.207139015197754,
"learning_rate": 4.524367765074499e-06,
"loss": 1.1617,
"step": 180
},
{
"epoch": 0.7385229540918163,
"grad_norm": 3.0992743968963623,
"learning_rate": 3.954008851376252e-06,
"loss": 1.1629,
"step": 185
},
{
"epoch": 0.7584830339321357,
"grad_norm": 3.1126255989074707,
"learning_rate": 3.4131053988131947e-06,
"loss": 1.1688,
"step": 190
},
{
"epoch": 0.7784431137724551,
"grad_norm": 3.3172667026519775,
"learning_rate": 2.9042926346347932e-06,
"loss": 1.1507,
"step": 195
},
{
"epoch": 0.7984031936127745,
"grad_norm": 3.125807762145996,
"learning_rate": 2.4300494434824373e-06,
"loss": 1.1459,
"step": 200
},
{
"epoch": 0.7984031936127745,
"eval_loss": 1.1677805185317993,
"eval_runtime": 4.6292,
"eval_samples_per_second": 195.067,
"eval_steps_per_second": 6.265,
"step": 200
},
{
"epoch": 0.8183632734530938,
"grad_norm": 3.1806719303131104,
"learning_rate": 1.9926862905126663e-06,
"loss": 1.1508,
"step": 205
},
{
"epoch": 0.8383233532934131,
"grad_norm": 3.2433359622955322,
"learning_rate": 1.5943339650431578e-06,
"loss": 1.1156,
"step": 210
},
{
"epoch": 0.8582834331337326,
"grad_norm": 3.1037845611572266,
"learning_rate": 1.2369331995613664e-06,
"loss": 1.1278,
"step": 215
},
{
"epoch": 0.8782435129740519,
"grad_norm": 3.121793270111084,
"learning_rate": 9.222252146709143e-07,
"loss": 1.1291,
"step": 220
},
{
"epoch": 0.8982035928143712,
"grad_norm": 3.311478614807129,
"learning_rate": 6.517432360398556e-07,
"loss": 1.1606,
"step": 225
},
{
"epoch": 0.9181636726546906,
"grad_norm": 3.1572906970977783,
"learning_rate": 4.268050246793276e-07,
"loss": 1.1376,
"step": 230
},
{
"epoch": 0.93812375249501,
"grad_norm": 3.125819683074951,
"learning_rate": 2.4850645694436736e-07,
"loss": 1.1042,
"step": 235
},
{
"epoch": 0.9580838323353293,
"grad_norm": 3.240495443344116,
"learning_rate": 1.1771618553447217e-07,
"loss": 1.1349,
"step": 240
},
{
"epoch": 0.9780439121756487,
"grad_norm": 3.0710411071777344,
"learning_rate": 3.50714075049563e-08,
"loss": 1.1139,
"step": 245
},
{
"epoch": 0.998003992015968,
"grad_norm": 3.2199409008026123,
"learning_rate": 9.74759906957612e-10,
"loss": 1.1324,
"step": 250
},
{
"epoch": 0.998003992015968,
"step": 250,
"total_flos": 8.741444925364634e+16,
"train_loss": 1.2971151485443115,
"train_runtime": 891.5874,
"train_samples_per_second": 17.966,
"train_steps_per_second": 0.28
}
],
"logging_steps": 5,
"max_steps": 250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.741444925364634e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}