1450 lines
32 KiB
JSON
1450 lines
32 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 1.9880715705765408,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 1000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.009940357852882704,
|
||
|
|
"grad_norm": 7.53125,
|
||
|
|
"learning_rate": 1e-05,
|
||
|
|
"loss": 10.9235,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.019880715705765408,
|
||
|
|
"grad_norm": 7.125,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 10.8542,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02982107355864811,
|
||
|
|
"grad_norm": 5.28125,
|
||
|
|
"learning_rate": 3e-05,
|
||
|
|
"loss": 10.6296,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.039761431411530816,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 4e-05,
|
||
|
|
"loss": 10.4421,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04970178926441352,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 5e-05,
|
||
|
|
"loss": 10.3131,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05964214711729622,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 6e-05,
|
||
|
|
"loss": 10.2138,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06958250497017893,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 7.000000000000001e-05,
|
||
|
|
"loss": 10.061,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07952286282306163,
|
||
|
|
"grad_norm": 2.671875,
|
||
|
|
"learning_rate": 8e-05,
|
||
|
|
"loss": 9.9397,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08946322067594434,
|
||
|
|
"grad_norm": 2.625,
|
||
|
|
"learning_rate": 8.999999999999999e-05,
|
||
|
|
"loss": 9.7387,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09940357852882704,
|
||
|
|
"grad_norm": 2.59375,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 9.5704,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10934393638170974,
|
||
|
|
"grad_norm": 2.375,
|
||
|
|
"learning_rate": 0.00011,
|
||
|
|
"loss": 9.3733,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11928429423459244,
|
||
|
|
"grad_norm": 2.34375,
|
||
|
|
"learning_rate": 0.00012,
|
||
|
|
"loss": 9.1876,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12922465208747516,
|
||
|
|
"grad_norm": 1.84375,
|
||
|
|
"learning_rate": 0.00013000000000000002,
|
||
|
|
"loss": 9.041,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13916500994035785,
|
||
|
|
"grad_norm": 1.671875,
|
||
|
|
"learning_rate": 0.00014000000000000001,
|
||
|
|
"loss": 8.8545,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14910536779324055,
|
||
|
|
"grad_norm": 1.53125,
|
||
|
|
"learning_rate": 0.00015,
|
||
|
|
"loss": 8.6955,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15904572564612326,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00016,
|
||
|
|
"loss": 8.5583,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16898608349900596,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00017,
|
||
|
|
"loss": 8.4812,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17892644135188868,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.00017999999999999998,
|
||
|
|
"loss": 8.4265,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18886679920477137,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00019,
|
||
|
|
"loss": 8.4638,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1988071570576541,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0002,
|
||
|
|
"loss": 8.3998,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20874751491053678,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.00021,
|
||
|
|
"loss": 8.3605,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21868787276341947,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.00022,
|
||
|
|
"loss": 8.3752,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2286282306163022,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.00023,
|
||
|
|
"loss": 8.3556,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23856858846918488,
|
||
|
|
"grad_norm": 1.3671875,
|
||
|
|
"learning_rate": 0.00024,
|
||
|
|
"loss": 8.3474,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2485089463220676,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.00025,
|
||
|
|
"loss": 8.2942,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2584493041749503,
|
||
|
|
"grad_norm": 1.65625,
|
||
|
|
"learning_rate": 0.00026000000000000003,
|
||
|
|
"loss": 8.316,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.268389662027833,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 0.00027,
|
||
|
|
"loss": 8.2375,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2783300198807157,
|
||
|
|
"grad_norm": 1.734375,
|
||
|
|
"learning_rate": 0.00028000000000000003,
|
||
|
|
"loss": 8.2676,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2882703777335984,
|
||
|
|
"grad_norm": 1.9375,
|
||
|
|
"learning_rate": 0.00029,
|
||
|
|
"loss": 8.1987,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2982107355864811,
|
||
|
|
"grad_norm": 1.96875,
|
||
|
|
"learning_rate": 0.0003,
|
||
|
|
"loss": 8.1847,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3081510934393638,
|
||
|
|
"grad_norm": 1.703125,
|
||
|
|
"learning_rate": 0.00031,
|
||
|
|
"loss": 8.2578,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31809145129224653,
|
||
|
|
"grad_norm": 1.578125,
|
||
|
|
"learning_rate": 0.00032,
|
||
|
|
"loss": 8.1946,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32803180914512925,
|
||
|
|
"grad_norm": 1.5859375,
|
||
|
|
"learning_rate": 0.00033,
|
||
|
|
"loss": 8.1504,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3379721669980119,
|
||
|
|
"grad_norm": 1.640625,
|
||
|
|
"learning_rate": 0.00034,
|
||
|
|
"loss": 8.126,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34791252485089463,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.00035,
|
||
|
|
"loss": 8.0893,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35785288270377735,
|
||
|
|
"grad_norm": 1.75,
|
||
|
|
"learning_rate": 0.00035999999999999997,
|
||
|
|
"loss": 8.0522,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36779324055666,
|
||
|
|
"grad_norm": 1.640625,
|
||
|
|
"learning_rate": 0.00037,
|
||
|
|
"loss": 8.0653,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37773359840954274,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.00038,
|
||
|
|
"loss": 8.0899,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38767395626242546,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 0.00039000000000000005,
|
||
|
|
"loss": 8.0308,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3976143141153082,
|
||
|
|
"grad_norm": 1.703125,
|
||
|
|
"learning_rate": 0.0004,
|
||
|
|
"loss": 7.9695,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40755467196819084,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 0.00041,
|
||
|
|
"loss": 7.9639,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41749502982107356,
|
||
|
|
"grad_norm": 2.203125,
|
||
|
|
"learning_rate": 0.00042,
|
||
|
|
"loss": 7.9662,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4274353876739563,
|
||
|
|
"grad_norm": 1.65625,
|
||
|
|
"learning_rate": 0.00043,
|
||
|
|
"loss": 7.9049,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43737574552683894,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.00044,
|
||
|
|
"loss": 7.9815,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44731610337972166,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.00045000000000000004,
|
||
|
|
"loss": 7.9026,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4572564612326044,
|
||
|
|
"grad_norm": 1.6484375,
|
||
|
|
"learning_rate": 0.00046,
|
||
|
|
"loss": 7.8753,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4671968190854871,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.00047,
|
||
|
|
"loss": 7.8852,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47713717693836977,
|
||
|
|
"grad_norm": 1.625,
|
||
|
|
"learning_rate": 0.00048,
|
||
|
|
"loss": 7.9331,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4870775347912525,
|
||
|
|
"grad_norm": 1.78125,
|
||
|
|
"learning_rate": 0.00049,
|
||
|
|
"loss": 7.7972,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4970178926441352,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 0.0005,
|
||
|
|
"loss": 7.8323,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5069582504970179,
|
||
|
|
"grad_norm": 2.375,
|
||
|
|
"learning_rate": 0.00051,
|
||
|
|
"loss": 7.8246,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5168986083499006,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 0.0005200000000000001,
|
||
|
|
"loss": 7.864,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5268389662027833,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.0005300000000000001,
|
||
|
|
"loss": 7.9518,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.536779324055666,
|
||
|
|
"grad_norm": 1.703125,
|
||
|
|
"learning_rate": 0.00054,
|
||
|
|
"loss": 7.7417,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5467196819085487,
|
||
|
|
"grad_norm": 1.5078125,
|
||
|
|
"learning_rate": 0.00055,
|
||
|
|
"loss": 7.7927,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5566600397614314,
|
||
|
|
"grad_norm": 1.546875,
|
||
|
|
"learning_rate": 0.0005600000000000001,
|
||
|
|
"loss": 7.7389,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5666003976143141,
|
||
|
|
"grad_norm": 1.5625,
|
||
|
|
"learning_rate": 0.00057,
|
||
|
|
"loss": 7.7131,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5765407554671969,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.00058,
|
||
|
|
"loss": 7.6988,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5864811133200796,
|
||
|
|
"grad_norm": 1.7421875,
|
||
|
|
"learning_rate": 0.00059,
|
||
|
|
"loss": 7.7104,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5964214711729622,
|
||
|
|
"grad_norm": 1.859375,
|
||
|
|
"learning_rate": 0.0006,
|
||
|
|
"loss": 7.6605,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6063618290258449,
|
||
|
|
"grad_norm": 1.59375,
|
||
|
|
"learning_rate": 0.00061,
|
||
|
|
"loss": 7.7188,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6163021868787276,
|
||
|
|
"grad_norm": 1.734375,
|
||
|
|
"learning_rate": 0.00062,
|
||
|
|
"loss": 7.6618,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6262425447316103,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.00063,
|
||
|
|
"loss": 7.7635,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6361829025844931,
|
||
|
|
"grad_norm": 2.265625,
|
||
|
|
"learning_rate": 0.00064,
|
||
|
|
"loss": 7.6721,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6461232604373758,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0006500000000000001,
|
||
|
|
"loss": 7.6399,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6560636182902585,
|
||
|
|
"grad_norm": 1.578125,
|
||
|
|
"learning_rate": 0.00066,
|
||
|
|
"loss": 7.5828,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6660039761431411,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.00067,
|
||
|
|
"loss": 7.6427,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6759443339960238,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.00068,
|
||
|
|
"loss": 7.4545,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6858846918489065,
|
||
|
|
"grad_norm": 1.59375,
|
||
|
|
"learning_rate": 0.00069,
|
||
|
|
"loss": 7.5338,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6958250497017893,
|
||
|
|
"grad_norm": 1.7265625,
|
||
|
|
"learning_rate": 0.0007,
|
||
|
|
"loss": 7.5311,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.705765407554672,
|
||
|
|
"grad_norm": 2.171875,
|
||
|
|
"learning_rate": 0.00071,
|
||
|
|
"loss": 7.5899,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7157057654075547,
|
||
|
|
"grad_norm": 1.4375,
|
||
|
|
"learning_rate": 0.0007199999999999999,
|
||
|
|
"loss": 7.5128,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7256461232604374,
|
||
|
|
"grad_norm": 1.671875,
|
||
|
|
"learning_rate": 0.00073,
|
||
|
|
"loss": 7.4893,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.73558648111332,
|
||
|
|
"grad_norm": 1.59375,
|
||
|
|
"learning_rate": 0.00074,
|
||
|
|
"loss": 7.5447,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7455268389662028,
|
||
|
|
"grad_norm": 1.609375,
|
||
|
|
"learning_rate": 0.00075,
|
||
|
|
"loss": 7.4271,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7554671968190855,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.00076,
|
||
|
|
"loss": 7.5216,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7654075546719682,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.0007700000000000001,
|
||
|
|
"loss": 7.4923,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7753479125248509,
|
||
|
|
"grad_norm": 1.4453125,
|
||
|
|
"learning_rate": 0.0007800000000000001,
|
||
|
|
"loss": 7.4305,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7852882703777336,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 0.00079,
|
||
|
|
"loss": 7.4223,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7952286282306164,
|
||
|
|
"grad_norm": 1.5703125,
|
||
|
|
"learning_rate": 0.0008,
|
||
|
|
"loss": 7.4875,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.805168986083499,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.0008100000000000001,
|
||
|
|
"loss": 7.4465,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8151093439363817,
|
||
|
|
"grad_norm": 1.5625,
|
||
|
|
"learning_rate": 0.00082,
|
||
|
|
"loss": 7.4046,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8250497017892644,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.00083,
|
||
|
|
"loss": 7.384,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8349900596421471,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 0.00084,
|
||
|
|
"loss": 7.3318,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8449304174950298,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.00085,
|
||
|
|
"loss": 7.4465,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8548707753479126,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.00086,
|
||
|
|
"loss": 7.3554,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8648111332007953,
|
||
|
|
"grad_norm": 1.5078125,
|
||
|
|
"learning_rate": 0.00087,
|
||
|
|
"loss": 7.348,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8747514910536779,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.00088,
|
||
|
|
"loss": 7.3536,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8846918489065606,
|
||
|
|
"grad_norm": 1.59375,
|
||
|
|
"learning_rate": 0.0008900000000000001,
|
||
|
|
"loss": 7.3074,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8946322067594433,
|
||
|
|
"grad_norm": 1.546875,
|
||
|
|
"learning_rate": 0.0009000000000000001,
|
||
|
|
"loss": 7.4301,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.904572564612326,
|
||
|
|
"grad_norm": 1.5625,
|
||
|
|
"learning_rate": 0.00091,
|
||
|
|
"loss": 7.2948,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9145129224652088,
|
||
|
|
"grad_norm": 1.6953125,
|
||
|
|
"learning_rate": 0.00092,
|
||
|
|
"loss": 7.4022,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9244532803180915,
|
||
|
|
"grad_norm": 1.78125,
|
||
|
|
"learning_rate": 0.00093,
|
||
|
|
"loss": 7.3491,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9343936381709742,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.00094,
|
||
|
|
"loss": 7.3304,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9443339960238568,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.00095,
|
||
|
|
"loss": 7.3213,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9542743538767395,
|
||
|
|
"grad_norm": 1.53125,
|
||
|
|
"learning_rate": 0.00096,
|
||
|
|
"loss": 7.3184,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9642147117296223,
|
||
|
|
"grad_norm": 1.4609375,
|
||
|
|
"learning_rate": 0.0009699999999999999,
|
||
|
|
"loss": 7.2904,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.974155069582505,
|
||
|
|
"grad_norm": 1.3671875,
|
||
|
|
"learning_rate": 0.00098,
|
||
|
|
"loss": 7.2904,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9840954274353877,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.00099,
|
||
|
|
"loss": 7.2536,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9940357852882704,
|
||
|
|
"grad_norm": 1.859375,
|
||
|
|
"learning_rate": 0.001,
|
||
|
|
"loss": 7.2545,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9940357852882704,
|
||
|
|
"eval_loss": 7.392611503601074,
|
||
|
|
"eval_runtime": 0.9938,
|
||
|
|
"eval_samples_per_second": 3486.498,
|
||
|
|
"eval_steps_per_second": 436.693,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0039761431411531,
|
||
|
|
"grad_norm": 1.546875,
|
||
|
|
"learning_rate": 0.0009999972946377045,
|
||
|
|
"loss": 7.1713,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0139165009940359,
|
||
|
|
"grad_norm": 1.5390625,
|
||
|
|
"learning_rate": 0.0009999891785833469,
|
||
|
|
"loss": 7.0401,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0238568588469186,
|
||
|
|
"grad_norm": 1.40625,
|
||
|
|
"learning_rate": 0.0009999756519345133,
|
||
|
|
"loss": 7.0191,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0337972166998013,
|
||
|
|
"grad_norm": 1.453125,
|
||
|
|
"learning_rate": 0.0009999567148538456,
|
||
|
|
"loss": 7.0774,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0437375745526838,
|
||
|
|
"grad_norm": 1.5703125,
|
||
|
|
"learning_rate": 0.0009999323675690406,
|
||
|
|
"loss": 7.1122,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0536779324055665,
|
||
|
|
"grad_norm": 1.65625,
|
||
|
|
"learning_rate": 0.0009999026103728454,
|
||
|
|
"loss": 7.0297,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0636182902584492,
|
||
|
|
"grad_norm": 1.6875,
|
||
|
|
"learning_rate": 0.0009998674436230558,
|
||
|
|
"loss": 7.0478,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.073558648111332,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 0.000999826867742511,
|
||
|
|
"loss": 7.0749,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0834990059642147,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009997808832190884,
|
||
|
|
"loss": 6.9982,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0934393638170974,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.0009997294906056982,
|
||
|
|
"loss": 7.0269,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.10337972166998,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.000999672690520277,
|
||
|
|
"loss": 7.0031,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1133200795228628,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.000999610483645779,
|
||
|
|
"loss": 6.9229,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1232604373757455,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.0009995428707301694,
|
||
|
|
"loss": 6.989,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1332007952286283,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.0009994698525864147,
|
||
|
|
"loss": 7.0723,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.143141153081511,
|
||
|
|
"grad_norm": 1.453125,
|
||
|
|
"learning_rate": 0.0009993914300924726,
|
||
|
|
"loss": 7.0914,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1530815109343937,
|
||
|
|
"grad_norm": 1.4609375,
|
||
|
|
"learning_rate": 0.000999307604191282,
|
||
|
|
"loss": 6.9886,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1630218687872764,
|
||
|
|
"grad_norm": 1.4609375,
|
||
|
|
"learning_rate": 0.0009992183758907518,
|
||
|
|
"loss": 6.993,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1729622266401591,
|
||
|
|
"grad_norm": 1.4453125,
|
||
|
|
"learning_rate": 0.0009991237462637478,
|
||
|
|
"loss": 6.9879,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1829025844930419,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.000999023716448081,
|
||
|
|
"loss": 7.034,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1928429423459244,
|
||
|
|
"grad_norm": 1.5390625,
|
||
|
|
"learning_rate": 0.0009989182876464931,
|
||
|
|
"loss": 6.9752,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.202783300198807,
|
||
|
|
"grad_norm": 1.546875,
|
||
|
|
"learning_rate": 0.0009988074611266423,
|
||
|
|
"loss": 6.8754,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2127236580516898,
|
||
|
|
"grad_norm": 1.4453125,
|
||
|
|
"learning_rate": 0.000998691238221088,
|
||
|
|
"loss": 6.9923,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2226640159045725,
|
||
|
|
"grad_norm": 1.375,
|
||
|
|
"learning_rate": 0.0009985696203272752,
|
||
|
|
"loss": 6.885,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2326043737574552,
|
||
|
|
"grad_norm": 1.53125,
|
||
|
|
"learning_rate": 0.0009984426089075168,
|
||
|
|
"loss": 6.9113,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.242544731610338,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.000998310205488977,
|
||
|
|
"loss": 6.9467,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2524850894632207,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.0009981724116636525,
|
||
|
|
"loss": 6.91,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2624254473161034,
|
||
|
|
"grad_norm": 1.390625,
|
||
|
|
"learning_rate": 0.0009980292290883526,
|
||
|
|
"loss": 6.9814,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2723658051689861,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.000997880659484681,
|
||
|
|
"loss": 6.9393,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2823061630218688,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 0.0009977267046390138,
|
||
|
|
"loss": 6.9344,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2922465208747516,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 0.000997567366402478,
|
||
|
|
"loss": 6.8575,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.302186878727634,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.0009974026466909299,
|
||
|
|
"loss": 6.85,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3121272365805168,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.000997232547484932,
|
||
|
|
"loss": 6.9196,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3220675944333995,
|
||
|
|
"grad_norm": 1.6875,
|
||
|
|
"learning_rate": 0.0009970570708297281,
|
||
|
|
"loss": 6.8259,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3320079522862822,
|
||
|
|
"grad_norm": 1.5390625,
|
||
|
|
"learning_rate": 0.0009968762188352208,
|
||
|
|
"loss": 6.8472,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.341948310139165,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.0009966899936759436,
|
||
|
|
"loss": 6.8573,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3518886679920477,
|
||
|
|
"grad_norm": 2.78125,
|
||
|
|
"learning_rate": 0.0009964983975910369,
|
||
|
|
"loss": 6.9833,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3618290258449304,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.0009963014328842196,
|
||
|
|
"loss": 6.9976,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.371769383697813,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.0009960991019237627,
|
||
|
|
"loss": 6.8598,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3817097415506958,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0009958914071424596,
|
||
|
|
"loss": 6.8171,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3916500994035785,
|
||
|
|
"grad_norm": 1.578125,
|
||
|
|
"learning_rate": 0.0009956783510375975,
|
||
|
|
"loss": 6.8734,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4015904572564613,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.0009954599361709276,
|
||
|
|
"loss": 6.8877,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.411530815109344,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.0009952361651686331,
|
||
|
|
"loss": 6.7897,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4214711729622267,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 0.0009950070407212996,
|
||
|
|
"loss": 6.9605,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4314115308151094,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.0009947725655838806,
|
||
|
|
"loss": 6.8834,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4413518886679921,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009945327425756661,
|
||
|
|
"loss": 6.8195,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4512922465208749,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.000994287574580248,
|
||
|
|
"loss": 6.8148,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4612326043737576,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.0009940370645454848,
|
||
|
|
"loss": 6.8626,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4711729622266403,
|
||
|
|
"grad_norm": 1.453125,
|
||
|
|
"learning_rate": 0.000993781215483467,
|
||
|
|
"loss": 6.8765,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4811133200795228,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.0009935200304704815,
|
||
|
|
"loss": 6.7831,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4910536779324055,
|
||
|
|
"grad_norm": 1.4765625,
|
||
|
|
"learning_rate": 0.0009932535126469725,
|
||
|
|
"loss": 6.8274,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5009940357852882,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.0009929816652175063,
|
||
|
|
"loss": 6.8189,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.510934393638171,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.00099270449145073,
|
||
|
|
"loss": 6.7934,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5208747514910537,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.0009924219946793353,
|
||
|
|
"loss": 6.6405,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5308151093439364,
|
||
|
|
"grad_norm": 1.71875,
|
||
|
|
"learning_rate": 0.0009921341783000158,
|
||
|
|
"loss": 6.6862,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.540755467196819,
|
||
|
|
"grad_norm": 1.3671875,
|
||
|
|
"learning_rate": 0.000991841045773427,
|
||
|
|
"loss": 6.7518,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5506958250497018,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.000991542600624146,
|
||
|
|
"loss": 6.7292,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5606361829025845,
|
||
|
|
"grad_norm": 1.546875,
|
||
|
|
"learning_rate": 0.0009912388464406265,
|
||
|
|
"loss": 6.7062,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.570576540755467,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.0009909297868751585,
|
||
|
|
"loss": 6.6082,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5805168986083498,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.0009906154256438223,
|
||
|
|
"loss": 6.7426,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5904572564612325,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 0.0009902957665264443,
|
||
|
|
"loss": 6.8086,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6003976143141152,
|
||
|
|
"grad_norm": 1.78125,
|
||
|
|
"learning_rate": 0.0009899708133665529,
|
||
|
|
"loss": 6.736,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.610337972166998,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.0009896405700713295,
|
||
|
|
"loss": 6.7488,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6202783300198806,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.000989305040611565,
|
||
|
|
"loss": 6.7246,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6302186878727634,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009889642290216085,
|
||
|
|
"loss": 6.7968,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.640159045725646,
|
||
|
|
"grad_norm": 1.4453125,
|
||
|
|
"learning_rate": 0.0009886181393993223,
|
||
|
|
"loss": 6.6922,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6500994035785288,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.0009882667759060298,
|
||
|
|
"loss": 6.6635,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6600397614314115,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.0009879101427664662,
|
||
|
|
"loss": 6.6233,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6699801192842942,
|
||
|
|
"grad_norm": 1.375,
|
||
|
|
"learning_rate": 0.0009875482442687294,
|
||
|
|
"loss": 6.7173,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.679920477137177,
|
||
|
|
"grad_norm": 2.03125,
|
||
|
|
"learning_rate": 0.0009871810847642258,
|
||
|
|
"loss": 6.7099,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6898608349900597,
|
||
|
|
"grad_norm": 1.453125,
|
||
|
|
"learning_rate": 0.00098680866866762,
|
||
|
|
"loss": 6.6863,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6998011928429424,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009864310004567807,
|
||
|
|
"loss": 6.728,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7097415506958251,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.000986048084672727,
|
||
|
|
"loss": 6.6503,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7196819085487078,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.0009856599259195741,
|
||
|
|
"loss": 6.6758,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7296222664015906,
|
||
|
|
"grad_norm": 1.4609375,
|
||
|
|
"learning_rate": 0.0009852665288644783,
|
||
|
|
"loss": 6.6894,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7395626242544733,
|
||
|
|
"grad_norm": 1.5078125,
|
||
|
|
"learning_rate": 0.000984867898237579,
|
||
|
|
"loss": 6.6299,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.749502982107356,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.000984464038831945,
|
||
|
|
"loss": 6.6652,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7594433399602387,
|
||
|
|
"grad_norm": 1.53125,
|
||
|
|
"learning_rate": 0.0009840549555035136,
|
||
|
|
"loss": 6.6375,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7693836978131214,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.0009836406531710342,
|
||
|
|
"loss": 6.6245,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.779324055666004,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0009832211368160087,
|
||
|
|
"loss": 6.6434,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7892644135188867,
|
||
|
|
"grad_norm": 1.4375,
|
||
|
|
"learning_rate": 0.0009827964114826314,
|
||
|
|
"loss": 6.5907,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7992047713717694,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.0009823664822777285,
|
||
|
|
"loss": 6.6743,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.809145129224652,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.000981931354370697,
|
||
|
|
"loss": 6.6238,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8190854870775348,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.0009814910329934414,
|
||
|
|
"loss": 6.5983,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8290258449304175,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009810455234403126,
|
||
|
|
"loss": 6.6457,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8389662027833003,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.000980594831068043,
|
||
|
|
"loss": 6.4873,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8489065606361827,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0009801389612956815,
|
||
|
|
"loss": 6.5629,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8588469184890655,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.0009796779196045303,
|
||
|
|
"loss": 6.6765,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8687872763419482,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 0.0009792117115380774,
|
||
|
|
"loss": 6.5999,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.878727634194831,
|
||
|
|
"grad_norm": 1.390625,
|
||
|
|
"learning_rate": 0.0009787403427019303,
|
||
|
|
"loss": 6.6639,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8886679920477136,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.000978263818763749,
|
||
|
|
"loss": 6.6352,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8986083499005963,
|
||
|
|
"grad_norm": 1.5859375,
|
||
|
|
"learning_rate": 0.0009777821454531775,
|
||
|
|
"loss": 6.6011,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.908548707753479,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.0009772953285617748,
|
||
|
|
"loss": 6.5817,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9184890656063618,
|
||
|
|
"grad_norm": 1.40625,
|
||
|
|
"learning_rate": 0.0009768033739429459,
|
||
|
|
"loss": 6.6113,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9284294234592445,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.0009763062875118706,
|
||
|
|
"loss": 6.5931,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9383697813121272,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.0009758040752454326,
|
||
|
|
"loss": 6.6421,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.94831013916501,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0009752967431821485,
|
||
|
|
"loss": 6.6209,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9582504970178927,
|
||
|
|
"grad_norm": 1.515625,
|
||
|
|
"learning_rate": 0.0009747842974220936,
|
||
|
|
"loss": 6.5526,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9681908548707754,
|
||
|
|
"grad_norm": 1.4609375,
|
||
|
|
"learning_rate": 0.00097426674412683,
|
||
|
|
"loss": 6.6085,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.978131212723658,
|
||
|
|
"grad_norm": 1.40625,
|
||
|
|
"learning_rate": 0.0009737440895193317,
|
||
|
|
"loss": 6.548,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9880715705765408,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.0009732163398839106,
|
||
|
|
"loss": 6.5648,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9880715705765408,
|
||
|
|
"eval_loss": 6.81672477722168,
|
||
|
|
"eval_runtime": 0.9933,
|
||
|
|
"eval_samples_per_second": 3488.505,
|
||
|
|
"eval_steps_per_second": 436.944,
|
||
|
|
"step": 1000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 5030,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 1000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1380849812951040.0,
|
||
|
|
"train_batch_size": 32,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|