458 lines
10 KiB
JSON
458 lines
10 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 10,
|
|
"global_step": 54,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.018823529411764704,
|
|
"grad_norm": 18.0,
|
|
"learning_rate": 0.0,
|
|
"loss": 2.6849,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.03764705882352941,
|
|
"grad_norm": 11.4375,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 1.7743,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.05647058823529412,
|
|
"grad_norm": 12.75,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 1.8629,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.07529411764705882,
|
|
"grad_norm": 17.25,
|
|
"learning_rate": 6e-06,
|
|
"loss": 2.6392,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.09411764705882353,
|
|
"grad_norm": 14.375,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 2.224,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.11294117647058824,
|
|
"grad_norm": 14.3125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 2.3093,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.13176470588235295,
|
|
"grad_norm": 14.0625,
|
|
"learning_rate": 9.795918367346939e-06,
|
|
"loss": 2.2673,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.15058823529411763,
|
|
"grad_norm": 9.875,
|
|
"learning_rate": 9.591836734693878e-06,
|
|
"loss": 1.6974,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.16941176470588235,
|
|
"grad_norm": 11.25,
|
|
"learning_rate": 9.387755102040818e-06,
|
|
"loss": 1.7583,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.18823529411764706,
|
|
"grad_norm": 6.75,
|
|
"learning_rate": 9.183673469387756e-06,
|
|
"loss": 1.6678,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.18823529411764706,
|
|
"eval_loss": 1.6175613403320312,
|
|
"eval_model_preparation_time": 0.0245,
|
|
"eval_runtime": 7.2192,
|
|
"eval_samples_per_second": 29.505,
|
|
"eval_steps_per_second": 14.822,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.20705882352941177,
|
|
"grad_norm": 6.875,
|
|
"learning_rate": 8.979591836734695e-06,
|
|
"loss": 1.3797,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.22588235294117648,
|
|
"grad_norm": 6.875,
|
|
"learning_rate": 8.775510204081633e-06,
|
|
"loss": 1.291,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.2447058823529412,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 1.4359,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.2635294117647059,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 8.36734693877551e-06,
|
|
"loss": 1.5594,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.2823529411764706,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 8.16326530612245e-06,
|
|
"loss": 1.2485,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.30117647058823527,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 7.959183673469388e-06,
|
|
"loss": 0.9712,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 7.755102040816327e-06,
|
|
"loss": 1.316,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.3388235294117647,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 7.551020408163265e-06,
|
|
"loss": 0.8954,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.35764705882352943,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 7.346938775510205e-06,
|
|
"loss": 1.1975,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.3764705882352941,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 1.1925,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.3764705882352941,
|
|
"eval_loss": 1.2663090229034424,
|
|
"eval_model_preparation_time": 0.0245,
|
|
"eval_runtime": 7.791,
|
|
"eval_samples_per_second": 27.339,
|
|
"eval_steps_per_second": 13.734,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.3952941176470588,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 6.938775510204082e-06,
|
|
"loss": 1.3386,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.41411764705882353,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 6.734693877551021e-06,
|
|
"loss": 1.1289,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.4329411764705882,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 6.530612244897959e-06,
|
|
"loss": 0.8379,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.45176470588235296,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 6.326530612244899e-06,
|
|
"loss": 1.0583,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.47058823529411764,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 6.122448979591837e-06,
|
|
"loss": 1.1262,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.4894117647058824,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 5.918367346938776e-06,
|
|
"loss": 1.052,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.508235294117647,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 1.2485,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.5270588235294118,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 5.510204081632653e-06,
|
|
"loss": 1.1587,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.5458823529411765,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 5.306122448979593e-06,
|
|
"loss": 1.015,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.5647058823529412,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 5.1020408163265315e-06,
|
|
"loss": 1.3413,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.5647058823529412,
|
|
"eval_loss": 1.1769490242004395,
|
|
"eval_model_preparation_time": 0.0245,
|
|
"eval_runtime": 7.0483,
|
|
"eval_samples_per_second": 30.22,
|
|
"eval_steps_per_second": 15.181,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.5835294117647059,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 4.897959183673469e-06,
|
|
"loss": 0.9845,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.6023529411764705,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 4.693877551020409e-06,
|
|
"loss": 1.2283,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.6211764705882353,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 4.489795918367348e-06,
|
|
"loss": 1.2924,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 0.8717,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.6588235294117647,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 4.081632653061225e-06,
|
|
"loss": 1.0587,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.6776470588235294,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 3.877551020408164e-06,
|
|
"loss": 1.1701,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.6964705882352941,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 3.6734693877551024e-06,
|
|
"loss": 1.0312,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.7152941176470589,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 3.469387755102041e-06,
|
|
"loss": 1.2946,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.7341176470588235,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 3.2653061224489794e-06,
|
|
"loss": 1.1724,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.7529411764705882,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 3.0612244897959185e-06,
|
|
"loss": 0.889,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.7529411764705882,
|
|
"eval_loss": 1.1430858373641968,
|
|
"eval_model_preparation_time": 0.0245,
|
|
"eval_runtime": 7.0198,
|
|
"eval_samples_per_second": 30.343,
|
|
"eval_steps_per_second": 15.243,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.7717647058823529,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 0.9574,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.7905882352941176,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 2.6530612244897964e-06,
|
|
"loss": 1.0509,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.8094117647058824,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 2.4489795918367347e-06,
|
|
"loss": 1.002,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.8282352941176471,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 2.244897959183674e-06,
|
|
"loss": 1.1657,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.8470588235294118,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 2.0408163265306125e-06,
|
|
"loss": 0.9907,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.8658823529411764,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 1.8367346938775512e-06,
|
|
"loss": 0.9649,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.8847058823529412,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 1.6326530612244897e-06,
|
|
"loss": 1.0064,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.9035294117647059,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 0.9563,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.9223529411764706,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 1.2244897959183673e-06,
|
|
"loss": 1.359,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.9411764705882353,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 1.0204081632653063e-06,
|
|
"loss": 0.8387,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.9411764705882353,
|
|
"eval_loss": 1.1291884183883667,
|
|
"eval_model_preparation_time": 0.0245,
|
|
"eval_runtime": 7.0742,
|
|
"eval_samples_per_second": 30.11,
|
|
"eval_steps_per_second": 15.125,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 8.163265306122449e-07,
|
|
"loss": 1.0581,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.9788235294117648,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 6.122448979591837e-07,
|
|
"loss": 0.8892,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.9976470588235294,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 4.0816326530612243e-07,
|
|
"loss": 1.3932,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 11.875,
|
|
"learning_rate": 2.0408163265306121e-07,
|
|
"loss": 0.9602,
|
|
"step": 54
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 54,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 5000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9865654906564608.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|