7114 lines
161 KiB
JSON
7114 lines
161 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 9.940357852882704,
|
|
"eval_steps": 500,
|
|
"global_step": 5000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.009940357852882704,
|
|
"grad_norm": 8.1875,
|
|
"learning_rate": 1e-05,
|
|
"loss": 10.9381,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.019880715705765408,
|
|
"grad_norm": 7.4375,
|
|
"learning_rate": 2e-05,
|
|
"loss": 10.8784,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.02982107355864811,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 3e-05,
|
|
"loss": 10.6618,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.039761431411530816,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 4e-05,
|
|
"loss": 10.4488,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.04970178926441352,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 5e-05,
|
|
"loss": 10.3286,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.05964214711729622,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 6e-05,
|
|
"loss": 10.2218,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.06958250497017893,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 7.000000000000001e-05,
|
|
"loss": 10.0546,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.07952286282306163,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 8e-05,
|
|
"loss": 9.9396,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.08946322067594434,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 8.999999999999999e-05,
|
|
"loss": 9.7561,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.09940357852882704,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.0001,
|
|
"loss": 9.5594,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.10934393638170974,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00011,
|
|
"loss": 9.4019,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.11928429423459244,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00012,
|
|
"loss": 9.2211,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.12922465208747516,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00013000000000000002,
|
|
"loss": 9.0177,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.13916500994035785,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00014000000000000001,
|
|
"loss": 8.9211,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.14910536779324055,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00015,
|
|
"loss": 8.6806,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.15904572564612326,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00016,
|
|
"loss": 8.5648,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.16898608349900596,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00017,
|
|
"loss": 8.5182,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.17892644135188868,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00017999999999999998,
|
|
"loss": 8.4601,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.18886679920477137,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00019,
|
|
"loss": 8.4077,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.1988071570576541,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0002,
|
|
"loss": 8.3882,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.20874751491053678,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00021,
|
|
"loss": 8.3702,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.21868787276341947,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00022,
|
|
"loss": 8.3917,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.2286282306163022,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00023,
|
|
"loss": 8.3738,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.23856858846918488,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00024,
|
|
"loss": 8.453,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.2485089463220676,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00025,
|
|
"loss": 8.2373,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.2584493041749503,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00026000000000000003,
|
|
"loss": 8.2437,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.268389662027833,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00027,
|
|
"loss": 8.2662,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.2783300198807157,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00028000000000000003,
|
|
"loss": 8.2467,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.2882703777335984,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00029,
|
|
"loss": 8.1932,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.2982107355864811,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003,
|
|
"loss": 8.1642,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.3081510934393638,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00031,
|
|
"loss": 8.1923,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.31809145129224653,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00032,
|
|
"loss": 8.1708,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.32803180914512925,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00033,
|
|
"loss": 8.2046,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.3379721669980119,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00034,
|
|
"loss": 8.1551,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.34791252485089463,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00035,
|
|
"loss": 8.1342,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.35785288270377735,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00035999999999999997,
|
|
"loss": 8.0606,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.36779324055666,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00037,
|
|
"loss": 8.1021,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.37773359840954274,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00038,
|
|
"loss": 8.0288,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.38767395626242546,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00039000000000000005,
|
|
"loss": 7.9732,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.3976143141153082,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.0004,
|
|
"loss": 7.9827,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.40755467196819084,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.00041,
|
|
"loss": 8.052,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.41749502982107356,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00042,
|
|
"loss": 8.0189,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.4274353876739563,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 0.00043,
|
|
"loss": 8.0269,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.43737574552683894,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00044,
|
|
"loss": 8.0191,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.44731610337972166,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00045000000000000004,
|
|
"loss": 7.9266,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.4572564612326044,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00046,
|
|
"loss": 7.8376,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.4671968190854871,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047,
|
|
"loss": 7.9162,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.47713717693836977,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.00048,
|
|
"loss": 7.9084,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.4870775347912525,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00049,
|
|
"loss": 7.886,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.4970178926441352,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0005,
|
|
"loss": 7.8392,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.5069582504970179,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00051,
|
|
"loss": 7.8265,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.5168986083499006,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0005200000000000001,
|
|
"loss": 7.8021,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.5268389662027833,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0005300000000000001,
|
|
"loss": 7.7635,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.536779324055666,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00054,
|
|
"loss": 7.7854,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.5467196819085487,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00055,
|
|
"loss": 7.7803,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.5566600397614314,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0005600000000000001,
|
|
"loss": 7.745,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.5666003976143141,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00057,
|
|
"loss": 7.7378,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.5765407554671969,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00058,
|
|
"loss": 7.7688,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.5864811133200796,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00059,
|
|
"loss": 7.6103,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.5964214711729622,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0006,
|
|
"loss": 7.6106,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.6063618290258449,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00061,
|
|
"loss": 7.7767,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.6163021868787276,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00062,
|
|
"loss": 7.705,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.6262425447316103,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00063,
|
|
"loss": 7.681,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.6361829025844931,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00064,
|
|
"loss": 7.6629,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.6461232604373758,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0006500000000000001,
|
|
"loss": 7.6692,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.6560636182902585,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00066,
|
|
"loss": 7.5971,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.6660039761431411,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00067,
|
|
"loss": 7.6274,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.6759443339960238,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00068,
|
|
"loss": 7.518,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.6858846918489065,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00069,
|
|
"loss": 7.6237,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.6958250497017893,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0007,
|
|
"loss": 7.6114,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.705765407554672,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00071,
|
|
"loss": 7.6391,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.7157057654075547,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0007199999999999999,
|
|
"loss": 7.5411,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.7256461232604374,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00073,
|
|
"loss": 7.6202,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.73558648111332,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00074,
|
|
"loss": 7.5403,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.7455268389662028,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00075,
|
|
"loss": 7.5464,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.7554671968190855,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00076,
|
|
"loss": 7.5237,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.7654075546719682,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0007700000000000001,
|
|
"loss": 7.4257,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.7753479125248509,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0007800000000000001,
|
|
"loss": 7.458,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.7852882703777336,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00079,
|
|
"loss": 7.3689,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.7952286282306164,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0008,
|
|
"loss": 7.4212,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.805168986083499,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0008100000000000001,
|
|
"loss": 7.4605,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.8151093439363817,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00082,
|
|
"loss": 7.4011,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.8250497017892644,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00083,
|
|
"loss": 7.4494,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.8349900596421471,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00084,
|
|
"loss": 7.3044,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.8449304174950298,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00085,
|
|
"loss": 7.4261,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.8548707753479126,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00086,
|
|
"loss": 7.3506,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.8648111332007953,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00087,
|
|
"loss": 7.3579,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.8747514910536779,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00088,
|
|
"loss": 7.3574,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.8846918489065606,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0008900000000000001,
|
|
"loss": 7.3697,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.8946322067594433,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0009000000000000001,
|
|
"loss": 7.359,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.904572564612326,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00091,
|
|
"loss": 7.3429,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.9145129224652088,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00092,
|
|
"loss": 7.3306,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.9244532803180915,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00093,
|
|
"loss": 7.3062,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.9343936381709742,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00094,
|
|
"loss": 7.3062,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.9443339960238568,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00095,
|
|
"loss": 7.3399,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.9542743538767395,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00096,
|
|
"loss": 7.3344,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.9642147117296223,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0009699999999999999,
|
|
"loss": 7.3416,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.974155069582505,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00098,
|
|
"loss": 7.2912,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.9840954274353877,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00099,
|
|
"loss": 7.1231,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.9940357852882704,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.001,
|
|
"loss": 7.2644,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.9940357852882704,
|
|
"eval_loss": 7.379269123077393,
|
|
"eval_runtime": 1.0043,
|
|
"eval_samples_per_second": 3450.268,
|
|
"eval_steps_per_second": 432.155,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.0039761431411531,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009999972946377045,
|
|
"loss": 7.1732,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 1.0139165009940359,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0009999891785833469,
|
|
"loss": 7.1168,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 1.0238568588469186,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0009999756519345133,
|
|
"loss": 7.078,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 1.0337972166998013,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009999567148538456,
|
|
"loss": 7.0055,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.0437375745526838,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0009999323675690406,
|
|
"loss": 7.0395,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 1.0536779324055665,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0009999026103728454,
|
|
"loss": 7.0092,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.0636182902584492,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0009998674436230558,
|
|
"loss": 7.098,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.073558648111332,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000999826867742511,
|
|
"loss": 7.0573,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.0834990059642147,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0009997808832190884,
|
|
"loss": 7.0814,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.0934393638170974,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009997294906056982,
|
|
"loss": 7.0954,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.10337972166998,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000999672690520277,
|
|
"loss": 7.0448,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.1133200795228628,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000999610483645779,
|
|
"loss": 7.011,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.1232604373757455,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0009995428707301694,
|
|
"loss": 7.0098,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.1332007952286283,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0009994698525864147,
|
|
"loss": 7.0345,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.143141153081511,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0009993914300924726,
|
|
"loss": 6.9947,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.1530815109343937,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000999307604191282,
|
|
"loss": 6.9929,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.1630218687872764,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0009992183758907518,
|
|
"loss": 7.0264,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.1729622266401591,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009991237462637478,
|
|
"loss": 7.0081,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.1829025844930419,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000999023716448081,
|
|
"loss": 6.9429,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.1928429423459244,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009989182876464931,
|
|
"loss": 6.9688,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.202783300198807,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009988074611266423,
|
|
"loss": 6.9687,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.2127236580516898,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000998691238221088,
|
|
"loss": 7.0143,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 1.2226640159045725,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0009985696203272752,
|
|
"loss": 6.9389,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 1.2326043737574552,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0009984426089075168,
|
|
"loss": 6.9601,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.242544731610338,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000998310205488977,
|
|
"loss": 7.0213,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.2524850894632207,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0009981724116636525,
|
|
"loss": 6.9925,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 1.2624254473161034,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0009980292290883526,
|
|
"loss": 6.9383,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 1.2723658051689861,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000997880659484681,
|
|
"loss": 6.9755,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.2823061630218688,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009977267046390138,
|
|
"loss": 6.9515,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 1.2922465208747516,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000997567366402478,
|
|
"loss": 6.9614,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 1.302186878727634,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009974026466909299,
|
|
"loss": 6.8975,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 1.3121272365805168,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000997232547484932,
|
|
"loss": 6.9208,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.3220675944333995,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009970570708297281,
|
|
"loss": 6.9917,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 1.3320079522862822,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009968762188352208,
|
|
"loss": 6.8237,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 1.341948310139165,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009966899936759436,
|
|
"loss": 6.9117,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 1.3518886679920477,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009964983975910369,
|
|
"loss": 6.8536,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.3618290258449304,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009963014328842196,
|
|
"loss": 6.9004,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 1.371769383697813,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0009960991019237627,
|
|
"loss": 6.8176,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 1.3817097415506958,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009958914071424596,
|
|
"loss": 6.8095,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 1.3916500994035785,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009956783510375975,
|
|
"loss": 6.8506,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.4015904572564613,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009954599361709276,
|
|
"loss": 6.8532,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 1.411530815109344,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009952361651686331,
|
|
"loss": 6.8656,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 1.4214711729622267,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009950070407212996,
|
|
"loss": 6.8219,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 1.4314115308151094,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009947725655838806,
|
|
"loss": 6.8193,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.4413518886679921,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009945327425756661,
|
|
"loss": 6.7235,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 1.4512922465208749,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000994287574580248,
|
|
"loss": 6.822,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 1.4612326043737576,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009940370645454848,
|
|
"loss": 6.7462,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 1.4711729622266403,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000993781215483467,
|
|
"loss": 6.7522,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.4811133200795228,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009935200304704815,
|
|
"loss": 6.7767,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 1.4910536779324055,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009932535126469725,
|
|
"loss": 6.7556,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 1.5009940357852882,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009929816652175063,
|
|
"loss": 6.7793,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 1.510934393638171,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00099270449145073,
|
|
"loss": 6.7808,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.5208747514910537,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009924219946793353,
|
|
"loss": 6.7323,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 1.5308151093439364,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009921341783000158,
|
|
"loss": 6.7711,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 1.540755467196819,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000991841045773427,
|
|
"loss": 6.8143,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 1.5506958250497018,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000991542600624146,
|
|
"loss": 6.7932,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.5606361829025845,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009912388464406265,
|
|
"loss": 6.715,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 1.570576540755467,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009909297868751585,
|
|
"loss": 6.6676,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 1.5805168986083498,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009906154256438223,
|
|
"loss": 6.7629,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 1.5904572564612325,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009902957665264443,
|
|
"loss": 6.6631,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.6003976143141152,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009899708133665529,
|
|
"loss": 6.7854,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 1.610337972166998,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009896405700713295,
|
|
"loss": 6.683,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 1.6202783300198806,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000989305040611565,
|
|
"loss": 6.7869,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 1.6302186878727634,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0009889642290216085,
|
|
"loss": 6.6871,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.640159045725646,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009886181393993223,
|
|
"loss": 6.6766,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 1.6500994035785288,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0009882667759060298,
|
|
"loss": 6.7339,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 1.6600397614314115,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0009879101427664662,
|
|
"loss": 6.7283,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 1.6699801192842942,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009875482442687294,
|
|
"loss": 6.7421,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.679920477137177,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0009871810847642258,
|
|
"loss": 6.6533,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 1.6898608349900597,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00098680866866762,
|
|
"loss": 6.6482,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 1.6998011928429424,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009864310004567807,
|
|
"loss": 6.5192,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 1.7097415506958251,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000986048084672727,
|
|
"loss": 6.5793,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.7196819085487078,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0009856599259195741,
|
|
"loss": 6.7184,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 1.7296222664015906,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009852665288644783,
|
|
"loss": 6.5879,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 1.7395626242544733,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000984867898237579,
|
|
"loss": 6.6798,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 1.749502982107356,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000984464038831945,
|
|
"loss": 6.7027,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.7594433399602387,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0009840549555035136,
|
|
"loss": 6.7056,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 1.7693836978131214,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009836406531710342,
|
|
"loss": 6.6514,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 1.779324055666004,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0009832211368160087,
|
|
"loss": 6.5464,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 1.7892644135188867,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009827964114826314,
|
|
"loss": 6.5945,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.7992047713717694,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009823664822777285,
|
|
"loss": 6.7718,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 1.809145129224652,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000981931354370697,
|
|
"loss": 6.6851,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 1.8190854870775348,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009814910329934414,
|
|
"loss": 6.5936,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 1.8290258449304175,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009810455234403126,
|
|
"loss": 6.6248,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.8389662027833003,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000980594831068043,
|
|
"loss": 6.5634,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 1.8489065606361827,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009801389612956815,
|
|
"loss": 6.5672,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 1.8588469184890655,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009796779196045303,
|
|
"loss": 6.5675,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 1.8687872763419482,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009792117115380774,
|
|
"loss": 6.5819,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.878727634194831,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0009787403427019303,
|
|
"loss": 6.6632,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 1.8886679920477136,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000978263818763749,
|
|
"loss": 6.6098,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 1.8986083499005963,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0009777821454531775,
|
|
"loss": 6.506,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 1.908548707753479,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009772953285617748,
|
|
"loss": 6.623,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.9184890656063618,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009768033739429459,
|
|
"loss": 6.5092,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 1.9284294234592445,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0009763062875118706,
|
|
"loss": 6.4015,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 1.9383697813121272,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0009758040752454326,
|
|
"loss": 6.5968,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 1.94831013916501,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009752967431821485,
|
|
"loss": 6.624,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.9582504970178927,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009747842974220936,
|
|
"loss": 6.6067,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.9681908548707754,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00097426674412683,
|
|
"loss": 6.5329,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.978131212723658,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009737440895193317,
|
|
"loss": 6.5692,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.9880715705765408,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009732163398839106,
|
|
"loss": 6.5427,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.9880715705765408,
|
|
"eval_loss": 6.80261754989624,
|
|
"eval_runtime": 0.9948,
|
|
"eval_samples_per_second": 3483.087,
|
|
"eval_steps_per_second": 436.265,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.9980119284294235,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.0009726835015661391,
|
|
"loss": 6.5276,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 2.0079522862823063,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009721455809727765,
|
|
"loss": 6.3445,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 2.017892644135189,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009716025845716894,
|
|
"loss": 6.2872,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 2.0278330019880717,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0009710545188917757,
|
|
"loss": 6.0278,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 2.0377733598409544,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0009705013905228854,
|
|
"loss": 6.155,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 2.047713717693837,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0009699432061157414,
|
|
"loss": 6.0783,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 2.05765407554672,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009693799723818591,
|
|
"loss": 6.0622,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 2.0675944333996026,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0009688116960934669,
|
|
"loss": 6.1243,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 2.0775347912524853,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009682383840834234,
|
|
"loss": 6.2396,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 2.0874751491053676,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009676600432451364,
|
|
"loss": 6.1982,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 2.0974155069582503,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009670766805324789,
|
|
"loss": 6.1659,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 2.107355864811133,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0009664883029597066,
|
|
"loss": 6.2021,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 2.1172962226640157,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0009658949176013729,
|
|
"loss": 6.2161,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 2.1272365805168985,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009652965315922438,
|
|
"loss": 6.124,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 2.137176938369781,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009646931521272123,
|
|
"loss": 6.2051,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 2.147117296222664,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009640847864612124,
|
|
"loss": 6.244,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 2.1570576540755466,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009634714419091302,
|
|
"loss": 6.1595,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 2.1669980119284293,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009628531258457185,
|
|
"loss": 6.2621,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 2.176938369781312,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009622298457055056,
|
|
"loss": 6.1667,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 2.1868787276341948,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009616016089827078,
|
|
"loss": 6.1677,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.1968190854870775,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009609684232311378,
|
|
"loss": 6.0832,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 2.20675944333996,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009603302960641154,
|
|
"loss": 6.1629,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 2.216699801192843,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009596872351543742,
|
|
"loss": 6.1327,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 2.2266401590457257,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0009590392482339713,
|
|
"loss": 5.9253,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.2365805168986084,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009583863430941926,
|
|
"loss": 6.1576,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 2.246520874751491,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0009577285275854602,
|
|
"loss": 6.2573,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 2.256461232604374,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009570658096172374,
|
|
"loss": 6.1597,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 2.2664015904572565,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009563981971579342,
|
|
"loss": 6.1474,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.2763419483101393,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009557256982348107,
|
|
"loss": 6.1435,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 2.286282306163022,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009550483209338814,
|
|
"loss": 6.2277,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 2.2962226640159047,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009543660733998174,
|
|
"loss": 6.1146,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 2.3061630218687874,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009536789638358488,
|
|
"loss": 5.9877,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.31610337972167,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000952987000503666,
|
|
"loss": 6.0654,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 2.326043737574553,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009522901917233196,
|
|
"loss": 6.2146,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 2.3359840954274356,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000951588545873122,
|
|
"loss": 6.3132,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 2.3459244532803183,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009508820713895454,
|
|
"loss": 6.1984,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.355864811133201,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009501707767671204,
|
|
"loss": 6.2137,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 2.3658051689860837,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009494546705583344,
|
|
"loss": 6.1788,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 2.3757455268389664,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009487337613735288,
|
|
"loss": 6.1691,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 2.3856858846918487,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009480080578807941,
|
|
"loss": 6.065,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 2.3956262425447314,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009472775688058681,
|
|
"loss": 6.1897,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 2.405566600397614,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009465423029320288,
|
|
"loss": 6.0957,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 2.415506958250497,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0009458022690999899,
|
|
"loss": 6.1618,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 2.4254473161033796,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000945057476207794,
|
|
"loss": 6.221,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 2.4353876739562623,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0009443079332107064,
|
|
"loss": 6.1258,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 2.445328031809145,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009435536491211062,
|
|
"loss": 6.1954,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 2.4552683896620278,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0009427946330083791,
|
|
"loss": 6.1142,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 2.4652087475149105,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009420308939988073,
|
|
"loss": 6.1708,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 2.475149105367793,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000941262441275461,
|
|
"loss": 6.1363,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 2.485089463220676,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009404892840780868,
|
|
"loss": 6.1852,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 2.4950298210735586,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0009397114317029974,
|
|
"loss": 5.9939,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 2.5049701789264414,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0009389288935029595,
|
|
"loss": 6.155,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 2.514910536779324,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009381416788870807,
|
|
"loss": 6.1827,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 2.524850894632207,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009373497973206984,
|
|
"loss": 6.1103,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 2.5347912524850895,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009365532583252634,
|
|
"loss": 6.1874,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 2.5447316103379722,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009357520714782273,
|
|
"loss": 6.1845,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 2.554671968190855,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009349462464129264,
|
|
"loss": 6.1129,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 2.5646123260437377,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000934135792818466,
|
|
"loss": 6.1862,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 2.5745526838966204,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0009333207204396049,
|
|
"loss": 6.1098,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 2.584493041749503,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009325010390766362,
|
|
"loss": 6.1314,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 2.594433399602386,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0009316767585852716,
|
|
"loss": 6.0922,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 2.604373757455268,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009308478888765214,
|
|
"loss": 6.0555,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 2.614314115308151,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009300144399165763,
|
|
"loss": 6.202,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 2.6242544731610336,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0009291764217266869,
|
|
"loss": 6.2056,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 2.6341948310139163,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009283338443830432,
|
|
"loss": 6.1862,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 2.644135188866799,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009274867180166542,
|
|
"loss": 6.1514,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 2.6540755467196817,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009266350528132253,
|
|
"loss": 6.1258,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 2.6640159045725644,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009257788590130365,
|
|
"loss": 6.1579,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 2.673956262425447,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0009249181469108181,
|
|
"loss": 6.0662,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 2.68389662027833,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009240529268556283,
|
|
"loss": 6.1693,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 2.6938369781312126,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009231832092507283,
|
|
"loss": 6.1294,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 2.7037773359840953,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009223090045534567,
|
|
"loss": 6.1247,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 2.713717693836978,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009214303232751044,
|
|
"loss": 6.0484,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 2.7236580516898607,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009205471759807874,
|
|
"loss": 6.0757,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 2.7335984095427435,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0009196595732893213,
|
|
"loss": 6.1256,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 2.743538767395626,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009187675258730918,
|
|
"loss": 6.1258,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 2.753479125248509,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009178710444579277,
|
|
"loss": 6.1197,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 2.7634194831013916,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0009169701398229713,
|
|
"loss": 5.9847,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 2.7733598409542743,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000916064822800549,
|
|
"loss": 6.1352,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 2.783300198807157,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009151551042760408,
|
|
"loss": 6.0972,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 2.79324055666004,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009142409951877497,
|
|
"loss": 6.2098,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 2.8031809145129225,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009133225065267707,
|
|
"loss": 6.0333,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 2.8131212723658052,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000912399649336857,
|
|
"loss": 6.0754,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 2.823061630218688,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0009114724347142892,
|
|
"loss": 6.1589,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 2.8330019880715707,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009105408738077402,
|
|
"loss": 6.1881,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 2.8429423459244534,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009096049778181426,
|
|
"loss": 6.1274,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 2.852882703777336,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0009086647579985526,
|
|
"loss": 6.1397,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 2.862823061630219,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0009077202256540159,
|
|
"loss": 6.1388,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 2.8727634194831015,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009067713921414313,
|
|
"loss": 5.9705,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 2.8827037773359843,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009058182688694137,
|
|
"loss": 6.076,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 2.892644135188867,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009048608672981576,
|
|
"loss": 6.0662,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 2.9025844930417497,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009038991989392992,
|
|
"loss": 6.058,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 2.9125248508946324,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009029332753557776,
|
|
"loss": 6.0491,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 2.922465208747515,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009019631081616963,
|
|
"loss": 6.1085,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 2.932405566600398,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0009009887090221828,
|
|
"loss": 6.1568,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 2.9423459244532806,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0009000100896532492,
|
|
"loss": 6.02,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 2.952286282306163,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0008990272618216508,
|
|
"loss": 6.1198,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 2.9622266401590456,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0008980402373447446,
|
|
"loss": 6.1041,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 2.9721669980119283,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0008970490280903477,
|
|
"loss": 6.0385,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 2.982107355864811,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000896053645976594,
|
|
"loss": 6.1723,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.982107355864811,
|
|
"eval_loss": 6.617855072021484,
|
|
"eval_runtime": 0.9901,
|
|
"eval_samples_per_second": 3499.671,
|
|
"eval_steps_per_second": 438.343,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.9920477137176937,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008950541029717912,
|
|
"loss": 6.1183,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 3.0019880715705765,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0008940504110942771,
|
|
"loss": 5.9013,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 3.011928429423459,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0008930425824122744,
|
|
"loss": 5.6088,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 3.021868787276342,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0008920306290437462,
|
|
"loss": 5.6395,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 3.0318091451292246,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008910145631562507,
|
|
"loss": 5.707,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 3.0417495029821073,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008899943969667932,
|
|
"loss": 5.56,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 3.05168986083499,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0008889701427416815,
|
|
"loss": 5.6505,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 3.0616302186878728,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008879418127963767,
|
|
"loss": 5.5553,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 3.0715705765407555,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008869094194953455,
|
|
"loss": 5.4907,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 3.081510934393638,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008858729752519121,
|
|
"loss": 5.7229,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 3.091451292246521,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008848324925281085,
|
|
"loss": 5.5297,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 3.1013916500994037,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0008837879838345245,
|
|
"loss": 5.6875,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 3.1113320079522864,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008827394617301576,
|
|
"loss": 5.6974,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 3.121272365805169,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008816869388222618,
|
|
"loss": 5.6248,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 3.131212723658052,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0008806304277661964,
|
|
"loss": 5.7482,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 3.1411530815109345,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0008795699412652732,
|
|
"loss": 5.7131,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 3.1510934393638173,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008785054920706039,
|
|
"loss": 5.6582,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 3.1610337972167,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0008774370929809475,
|
|
"loss": 5.5989,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 3.1709741550695827,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008763647568425557,
|
|
"loss": 5.7373,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 3.1809145129224654,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0008752884965490185,
|
|
"loss": 5.708,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 3.1908548707753477,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008742083250411091,
|
|
"loss": 5.5961,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 3.2007952286282304,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008731242553066287,
|
|
"loss": 5.7179,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 3.210735586481113,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008720363003802503,
|
|
"loss": 5.6874,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 3.220675944333996,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0008709444733433617,
|
|
"loss": 5.6388,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 3.2306163021868786,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0008698487873239079,
|
|
"loss": 5.5839,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 3.2405566600397613,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008687492554962345,
|
|
"loss": 5.6572,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 3.250497017892644,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008676458910809273,
|
|
"loss": 5.7369,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 3.2604373757455267,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008665387073446556,
|
|
"loss": 5.7047,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 3.2703777335984094,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000865427717600011,
|
|
"loss": 5.6605,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 3.280318091451292,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008643129352053478,
|
|
"loss": 5.7413,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 3.290258449304175,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008631943735646231,
|
|
"loss": 5.5826,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 3.3001988071570576,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008620720461272344,
|
|
"loss": 5.7436,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 3.3101391650099403,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008609459663878586,
|
|
"loss": 5.767,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 3.320079522862823,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00085981614788629,
|
|
"loss": 5.6447,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 3.3300198807157058,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008586826042072768,
|
|
"loss": 5.7648,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 3.3399602385685885,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008575453489803583,
|
|
"loss": 5.6635,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 3.349900596421471,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008564043958797008,
|
|
"loss": 5.7262,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 3.359840954274354,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008552597586239333,
|
|
"loss": 5.597,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 3.3697813121272366,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008541114509759821,
|
|
"loss": 5.6742,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 3.3797216699801194,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0008529594867429059,
|
|
"loss": 5.7195,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 3.389662027833002,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008518038797757299,
|
|
"loss": 5.6343,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 3.399602385685885,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008506446439692784,
|
|
"loss": 5.5716,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 3.4095427435387675,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008494817932620086,
|
|
"loss": 5.7378,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 3.4194831013916502,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008483153416358423,
|
|
"loss": 5.7251,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 3.429423459244533,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008471453031159987,
|
|
"loss": 5.7797,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 3.4393638170974157,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008459716917708248,
|
|
"loss": 5.6589,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 3.4493041749502984,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008447945217116265,
|
|
"loss": 5.7069,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 3.459244532803181,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008436138070924997,
|
|
"loss": 5.6907,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 3.469184890656064,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000842429562110159,
|
|
"loss": 5.6668,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 3.4791252485089466,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0008412418010037673,
|
|
"loss": 5.8247,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 3.4890656063618293,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0008400505380547655,
|
|
"loss": 5.6747,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 3.4990059642147116,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008388557875866995,
|
|
"loss": 5.8498,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 3.5089463220675943,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008376575639650489,
|
|
"loss": 5.7285,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 3.518886679920477,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008364558815970536,
|
|
"loss": 5.6371,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 3.5288270377733597,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008352507549315407,
|
|
"loss": 5.7659,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 3.5387673956262424,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008340421984587517,
|
|
"loss": 5.7699,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 3.548707753479125,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000832830226710167,
|
|
"loss": 5.6951,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 3.558648111332008,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008316148542583319,
|
|
"loss": 5.6054,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 3.5685884691848906,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000830396095716681,
|
|
"loss": 5.735,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 3.5785288270377733,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008291739657393626,
|
|
"loss": 5.7304,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 3.588469184890656,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008279484790210632,
|
|
"loss": 5.8595,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 3.5984095427435387,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000826719650296829,
|
|
"loss": 5.8058,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 3.6083499005964215,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008254874943418914,
|
|
"loss": 5.6195,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 3.618290258449304,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008242520259714868,
|
|
"loss": 5.776,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 3.628230616302187,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00082301326004068,
|
|
"loss": 5.7275,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 3.6381709741550696,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0008217712114441846,
|
|
"loss": 5.7364,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 3.6481113320079523,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0008205258951161852,
|
|
"loss": 5.7252,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 3.658051689860835,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008192773260301564,
|
|
"loss": 5.662,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 3.667992047713718,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008180255191986837,
|
|
"loss": 5.797,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 3.6779324055666005,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0008167704896732828,
|
|
"loss": 5.7634,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 3.6878727634194832,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008155122525442182,
|
|
"loss": 5.5614,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 3.697813121272366,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008142508229403225,
|
|
"loss": 5.7841,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 3.7077534791252487,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0008129862160288137,
|
|
"loss": 5.6563,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 3.717693836978131,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008117184470151134,
|
|
"loss": 5.7122,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 3.7276341948310137,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000810447531142664,
|
|
"loss": 5.7254,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 3.7375745526838964,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0008091734836927447,
|
|
"loss": 5.7376,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 3.747514910536779,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0008078963199842886,
|
|
"loss": 5.7528,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 3.757455268389662,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000806616055373698,
|
|
"loss": 5.7318,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 3.7673956262425445,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008053327052546605,
|
|
"loss": 5.6969,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 3.7773359840954273,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008040462850579625,
|
|
"loss": 5.7362,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 3.78727634194831,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000802756810251305,
|
|
"loss": 5.7475,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 3.7972166998011927,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008014642963391168,
|
|
"loss": 5.7347,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 3.8071570576540754,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0008001687588623686,
|
|
"loss": 5.8059,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 3.817097415506958,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0007988702133983861,
|
|
"loss": 5.6374,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 3.827037773359841,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0007975686755606623,
|
|
"loss": 5.6819,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 3.8369781312127236,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0007962641609986703,
|
|
"loss": 5.5957,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 3.8469184890656063,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0007949566853976738,
|
|
"loss": 5.7282,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 3.856858846918489,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0007936462644785413,
|
|
"loss": 5.7273,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 3.8667992047713717,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0007923329139975537,
|
|
"loss": 5.6673,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 3.8767395626242545,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0007910166497462173,
|
|
"loss": 5.7586,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 3.886679920477137,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0007896974875510731,
|
|
"loss": 5.6486,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 3.89662027833002,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0007883754432735058,
|
|
"loss": 5.7101,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 3.9065606361829026,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0007870505328095545,
|
|
"loss": 5.5657,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 3.9165009940357853,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0007857227720897207,
|
|
"loss": 5.7389,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 3.926441351888668,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0007843921770787765,
|
|
"loss": 5.5861,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 3.9363817097415508,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0007830587637755736,
|
|
"loss": 5.745,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 3.9463220675944335,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00078172254821285,
|
|
"loss": 5.6622,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 3.956262425447316,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0007803835464570379,
|
|
"loss": 5.7632,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 3.966202783300199,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0007790417746080698,
|
|
"loss": 5.7247,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 3.9761431411530817,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0007776972487991857,
|
|
"loss": 5.7307,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 3.9761431411530817,
|
|
"eval_loss": 6.569971561431885,
|
|
"eval_runtime": 0.9949,
|
|
"eval_samples_per_second": 3482.812,
|
|
"eval_steps_per_second": 436.231,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 3.9860834990059644,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0007763499851967385,
|
|
"loss": 5.7296,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 3.996023856858847,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0007750000000000001,
|
|
"loss": 5.7099,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 4.00596421471173,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000773647309440966,
|
|
"loss": 5.409,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 4.0159045725646125,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0007722919297841613,
|
|
"loss": 5.2489,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 4.025844930417495,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0007709338773264435,
|
|
"loss": 5.3349,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 4.035785288270378,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0007695731683968077,
|
|
"loss": 5.2368,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 4.045725646123261,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0007682098193561904,
|
|
"loss": 5.4091,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 4.055666003976143,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0007668438465972717,
|
|
"loss": 5.2548,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 4.065606361829026,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0007654752665442794,
|
|
"loss": 5.2552,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 4.075546719681909,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0007641040956527904,
|
|
"loss": 5.3086,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 4.085487077534792,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0007627303504095341,
|
|
"loss": 5.1799,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 4.095427435387674,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0007613540473321927,
|
|
"loss": 5.3074,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 4.105367793240557,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0007599752029692041,
|
|
"loss": 5.1494,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 4.11530815109344,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0007585938338995616,
|
|
"loss": 5.3291,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 4.1252485089463224,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0007572099567326158,
|
|
"loss": 5.2711,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 4.135188866799205,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0007558235881078734,
|
|
"loss": 5.2345,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 4.145129224652088,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0007544347446947986,
|
|
"loss": 5.3347,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 4.155069582504971,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0007530434431926118,
|
|
"loss": 5.4939,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 4.165009940357853,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 0.0007516497003300892,
|
|
"loss": 5.0571,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 4.174950298210735,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0007502535328653615,
|
|
"loss": 5.343,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 4.184890656063618,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0007488549575857124,
|
|
"loss": 5.385,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 4.194831013916501,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0007474539913073764,
|
|
"loss": 5.3096,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 4.204771371769383,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0007460506508753373,
|
|
"loss": 5.3324,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 4.214711729622266,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0007446449531631255,
|
|
"loss": 5.4102,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 4.224652087475149,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0007432369150726146,
|
|
"loss": 5.2792,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 4.2345924453280315,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0007418265535338187,
|
|
"loss": 5.3,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 4.244532803180914,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0007404138855046884,
|
|
"loss": 5.296,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 4.254473161033797,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0007389989279709077,
|
|
"loss": 5.3927,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 4.26441351888668,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0007375816979456887,
|
|
"loss": 5.2995,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 4.274353876739562,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0007361622124695677,
|
|
"loss": 5.3132,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 4.284294234592445,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0007347404886102002,
|
|
"loss": 5.1792,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 4.294234592445328,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0007333165434621556,
|
|
"loss": 5.2816,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 4.3041749502982105,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0007318903941467119,
|
|
"loss": 5.2905,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 4.314115308151093,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0007304620578116493,
|
|
"loss": 5.4091,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 4.324055666003976,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0007290315516310445,
|
|
"loss": 5.3938,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 4.333996023856859,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0007275988928050645,
|
|
"loss": 5.2725,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 4.343936381709741,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0007261640985597584,
|
|
"loss": 5.2745,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 4.353876739562624,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0007247271861468522,
|
|
"loss": 5.2106,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 4.363817097415507,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0007232881728435397,
|
|
"loss": 5.347,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 4.3737574552683895,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0007218470759522759,
|
|
"loss": 5.3879,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 4.383697813121272,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0007204039128005682,
|
|
"loss": 5.4032,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 4.393638170974155,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0007189587007407686,
|
|
"loss": 5.27,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 4.403578528827038,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0007175114571498644,
|
|
"loss": 5.3247,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 4.41351888667992,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0007160621994292706,
|
|
"loss": 5.3636,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 4.423459244532803,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0007146109450046187,
|
|
"loss": 5.3791,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 4.433399602385686,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0007131577113255489,
|
|
"loss": 5.3862,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 4.443339960238569,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0007117025158654991,
|
|
"loss": 5.3419,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 4.453280318091451,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0007102453761214961,
|
|
"loss": 5.3771,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 4.463220675944334,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0007087863096139438,
|
|
"loss": 5.4289,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 4.473161033797217,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0007073253338864137,
|
|
"loss": 5.352,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 4.4831013916500995,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0007058624665054326,
|
|
"loss": 5.3599,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 4.493041749502982,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0007043977250602732,
|
|
"loss": 5.3989,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 4.502982107355865,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0007029311271627408,
|
|
"loss": 5.191,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 4.512922465208748,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0007014626904469629,
|
|
"loss": 5.3663,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 4.52286282306163,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0006999924325691765,
|
|
"loss": 5.4312,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 4.532803180914513,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0006985203712075161,
|
|
"loss": 5.3819,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 4.542743538767396,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0006970465240618006,
|
|
"loss": 5.3549,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 4.5526838966202785,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0006955709088533212,
|
|
"loss": 5.407,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 4.562624254473161,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0006940935433246279,
|
|
"loss": 5.364,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 4.572564612326044,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0006926144452393163,
|
|
"loss": 5.4723,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 4.582504970178927,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0006911336323818137,
|
|
"loss": 5.4365,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 4.592445328031809,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000689651122557166,
|
|
"loss": 5.312,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 4.602385685884692,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0006881669335908229,
|
|
"loss": 5.3521,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 4.612326043737575,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0006866810833284234,
|
|
"loss": 5.3956,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 4.6222664015904575,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0006851935896355827,
|
|
"loss": 5.3256,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 4.63220675944334,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0006837044703976754,
|
|
"loss": 5.374,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 4.642147117296223,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0006822137435196214,
|
|
"loss": 5.3469,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 4.652087475149106,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0006807214269256713,
|
|
"loss": 5.3048,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 4.662027833001988,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0006792275385591895,
|
|
"loss": 5.3451,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 4.671968190854871,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0006777320963824396,
|
|
"loss": 5.2193,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 4.681908548707754,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0006762351183763674,
|
|
"loss": 5.465,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 4.691848906560637,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0006747366225403858,
|
|
"loss": 5.24,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 4.701789264413518,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0006732366268921576,
|
|
"loss": 5.4872,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 4.711729622266402,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0006717351494673791,
|
|
"loss": 5.3933,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 4.721669980119284,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0006702322083195633,
|
|
"loss": 5.3808,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 4.7316103379721675,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0006687278215198226,
|
|
"loss": 5.4217,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 4.741550695825049,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000667222007156652,
|
|
"loss": 5.4018,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 4.751491053677933,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0006657147833357107,
|
|
"loss": 5.3794,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 4.761431411530815,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0006642061681796056,
|
|
"loss": 5.4367,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 4.7713717693836974,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0006626961798276726,
|
|
"loss": 5.4065,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 4.78131212723658,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0006611848364357584,
|
|
"loss": 5.3891,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 4.791252485089463,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0006596721561760028,
|
|
"loss": 5.4017,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 4.801192842942346,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0006581581572366196,
|
|
"loss": 5.3121,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 4.811133200795228,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0006566428578216785,
|
|
"loss": 5.3899,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 4.821073558648111,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0006551262761508857,
|
|
"loss": 5.3972,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 4.831013916500994,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0006536084304593652,
|
|
"loss": 5.326,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 4.8409542743538765,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000652089338997439,
|
|
"loss": 5.2832,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 4.850894632206759,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0006505690200304083,
|
|
"loss": 5.4461,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 4.860834990059642,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0006490474918383339,
|
|
"loss": 5.4626,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 4.870775347912525,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0006475247727158154,
|
|
"loss": 5.409,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 4.880715705765407,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0006460008809717727,
|
|
"loss": 5.3163,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 4.89065606361829,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0006444758349292244,
|
|
"loss": 5.4431,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 4.900596421471173,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0006429496529250689,
|
|
"loss": 5.4066,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 4.9105367793240555,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0006414223533098627,
|
|
"loss": 5.4518,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 4.920477137176938,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0006398939544476005,
|
|
"loss": 5.3899,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 4.930417495029821,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000638364474715494,
|
|
"loss": 5.5096,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 4.940357852882704,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0006368339325037513,
|
|
"loss": 5.3423,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 4.950298210735586,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0006353023462153552,
|
|
"loss": 5.4424,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 4.960238568588469,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0006337697342658431,
|
|
"loss": 5.4042,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 4.970178926441352,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0006322361150830839,
|
|
"loss": 5.3938,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 4.970178926441352,
|
|
"eval_loss": 6.55920934677124,
|
|
"eval_runtime": 0.9897,
|
|
"eval_samples_per_second": 3501.041,
|
|
"eval_steps_per_second": 438.514,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 4.980119284294235,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0006307015071070575,
|
|
"loss": 5.3588,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 4.990059642147117,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0006291659287896334,
|
|
"loss": 5.2532,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0006276293985943478,
|
|
"loss": 5.3366,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 5.009940357852883,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0006260919349961824,
|
|
"loss": 4.9571,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 5.019880715705765,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0006245535564813417,
|
|
"loss": 4.8936,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 5.029821073558648,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0006230142815470312,
|
|
"loss": 5.0264,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 5.039761431411531,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0006214741287012348,
|
|
"loss": 5.0319,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 5.049701789264414,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0006199331164624922,
|
|
"loss": 5.0509,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 5.059642147117296,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0006183912633596763,
|
|
"loss": 4.921,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 5.069582504970179,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0006168485879317707,
|
|
"loss": 5.0029,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 5.079522862823062,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0006153051087276458,
|
|
"loss": 5.0056,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 5.0894632206759445,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0006137608443058371,
|
|
"loss": 5.0813,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 5.099403578528827,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0006122158132343213,
|
|
"loss": 4.9781,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 5.10934393638171,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000610670034090293,
|
|
"loss": 4.9813,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 5.119284294234593,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0006091235254599417,
|
|
"loss": 4.9986,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 5.129224652087475,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0006075763059382278,
|
|
"loss": 4.9165,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 5.139165009940358,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0006060283941286597,
|
|
"loss": 4.995,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 5.149105367793241,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0006044798086430697,
|
|
"loss": 5.1471,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 5.1590457256461235,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00060293056810139,
|
|
"loss": 5.0747,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 5.168986083499006,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0006013806911314293,
|
|
"loss": 5.0625,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 5.178926441351889,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0005998301963686485,
|
|
"loss": 4.9767,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 5.188866799204772,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0005982791024559371,
|
|
"loss": 5.0446,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 5.198807157057654,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0005967274280433881,
|
|
"loss": 5.109,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 5.208747514910537,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0005951751917880747,
|
|
"loss": 5.0925,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 5.21868787276342,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005936224123538254,
|
|
"loss": 5.0823,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 5.2286282306163026,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000592069108411,
|
|
"loss": 5.0158,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 5.238568588469185,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0005905152986362649,
|
|
"loss": 5.0814,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 5.248508946322068,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0005889610017123685,
|
|
"loss": 5.0124,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 5.258449304174951,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0005874062363279164,
|
|
"loss": 4.9618,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 5.2683896620278325,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0005858510211771469,
|
|
"loss": 5.0623,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 5.278330019880716,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005842953749597065,
|
|
"loss": 5.0798,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 5.288270377733598,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005827393163804249,
|
|
"loss": 5.1155,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 5.298210735586481,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0005811828641490892,
|
|
"loss": 5.1405,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 5.308151093439363,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0005796260369802205,
|
|
"loss": 5.1275,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 5.318091451292246,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0005780688535928478,
|
|
"loss": 5.1139,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 5.328031809145129,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0005765113327102831,
|
|
"loss": 5.0838,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 5.337972166998012,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0005749534930598966,
|
|
"loss": 5.1417,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 5.347912524850894,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0005733953533728912,
|
|
"loss": 5.1345,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 5.357852882703777,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0005718369323840773,
|
|
"loss": 5.0213,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 5.36779324055666,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0005702782488316478,
|
|
"loss": 5.1443,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 5.3777335984095425,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0005687193214569524,
|
|
"loss": 5.1212,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 5.387673956262425,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0005671601690042727,
|
|
"loss": 5.0729,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 5.397614314115308,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0005656008102205966,
|
|
"loss": 5.0507,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 5.407554671968191,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0005640412638553927,
|
|
"loss": 5.063,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 5.417495029821073,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000562481548660385,
|
|
"loss": 4.8997,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 5.427435387673956,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000560921683389328,
|
|
"loss": 5.1052,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 5.437375745526839,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0005593616867977801,
|
|
"loss": 5.0741,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 5.4473161033797215,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000557801577642879,
|
|
"loss": 5.1363,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 5.457256461232604,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0005562413746831156,
|
|
"loss": 5.085,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 5.467196819085487,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000554681096678109,
|
|
"loss": 5.0483,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 5.47713717693837,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005531207623883801,
|
|
"loss": 5.131,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 5.487077534791252,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0005515603905751276,
|
|
"loss": 5.1046,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 5.497017892644135,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00055,
|
|
"loss": 5.0007,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 5.506958250497018,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0005484396094248726,
|
|
"loss": 5.1351,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 5.5168986083499005,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0005468792376116198,
|
|
"loss": 5.1641,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 5.526838966202783,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005453189033218912,
|
|
"loss": 5.0468,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 5.536779324055666,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0005437586253168845,
|
|
"loss": 4.9815,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 5.546719681908549,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0005421984223571211,
|
|
"loss": 5.1722,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 5.556660039761431,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0005406383132022199,
|
|
"loss": 5.1806,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 5.566600397614314,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000539078316610672,
|
|
"loss": 5.0021,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 5.576540755467197,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000537518451339615,
|
|
"loss": 5.1392,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 5.58648111332008,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0005359587361446073,
|
|
"loss": 4.9547,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 5.596421471172962,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0005343991897794036,
|
|
"loss": 5.0659,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 5.606361829025845,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0005328398309957274,
|
|
"loss": 5.2004,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 5.616302186878728,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0005312806785430478,
|
|
"loss": 5.0584,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 5.6262425447316105,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0005297217511683524,
|
|
"loss": 5.0177,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 5.636182902584493,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0005281630676159228,
|
|
"loss": 5.0376,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 5.646123260437376,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0005266046466271089,
|
|
"loss": 4.961,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 5.656063618290259,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0005250465069401034,
|
|
"loss": 5.1326,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 5.666003976143141,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000523488667289717,
|
|
"loss": 5.0847,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 5.675944333996024,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0005219311464071524,
|
|
"loss": 5.1852,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 5.685884691848907,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0005203739630197796,
|
|
"loss": 4.9042,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 5.6958250497017895,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0005188171358509109,
|
|
"loss": 5.059,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 5.705765407554672,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0005172606836195753,
|
|
"loss": 5.1375,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 5.715705765407555,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0005157046250402936,
|
|
"loss": 5.1251,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 5.725646123260438,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0005141489788228533,
|
|
"loss": 5.0766,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 5.73558648111332,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0005125937636720838,
|
|
"loss": 5.1601,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 5.745526838966203,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0005110389982876316,
|
|
"loss": 5.0683,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 5.755467196819086,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000509484701363735,
|
|
"loss": 4.9532,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 5.7654075546719685,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000507930891589,
|
|
"loss": 5.1401,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 5.775347912524851,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0005063775876461746,
|
|
"loss": 5.1512,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 5.785288270377734,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0005048248082119253,
|
|
"loss": 5.0868,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 5.795228628230617,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000503272571956612,
|
|
"loss": 5.2094,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 5.805168986083499,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000501720897544063,
|
|
"loss": 5.0475,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 5.815109343936381,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0005001698036313514,
|
|
"loss": 5.1375,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 5.825049701789265,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004986193088685708,
|
|
"loss": 5.0319,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 5.834990059642147,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004970694318986101,
|
|
"loss": 5.0563,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 5.84493041749503,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004955201913569304,
|
|
"loss": 5.0266,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 5.854870775347912,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004939716058713404,
|
|
"loss": 5.1218,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 5.864811133200796,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004924236940617722,
|
|
"loss": 5.0916,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 5.8747514910536776,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004908764745400584,
|
|
"loss": 4.9822,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"epoch": 5.88469184890656,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000489329965909707,
|
|
"loss": 5.1834,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 5.894632206759443,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004877841867656788,
|
|
"loss": 5.1634,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"epoch": 5.904572564612326,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000486239155694163,
|
|
"loss": 5.2446,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 5.914512922465208,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048469489127235424,
|
|
"loss": 5.074,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"epoch": 5.924453280318091,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048315141206822944,
|
|
"loss": 5.121,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 5.934393638170974,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004816087366403237,
|
|
"loss": 4.9793,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"epoch": 5.944333996023857,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004800668835375078,
|
|
"loss": 5.1341,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 5.954274353876739,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004785258712987651,
|
|
"loss": 4.9571,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"epoch": 5.964214711729622,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004769857184529688,
|
|
"loss": 5.1579,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 5.964214711729622,
|
|
"eval_loss": 6.571132183074951,
|
|
"eval_runtime": 0.989,
|
|
"eval_samples_per_second": 3503.588,
|
|
"eval_steps_per_second": 438.833,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 5.974155069582505,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004754464435186583,
|
|
"loss": 5.0747,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"epoch": 5.9840954274353875,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047390806500381753,
|
|
"loss": 5.0098,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 5.99403578528827,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004723706014056522,
|
|
"loss": 5.1607,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"epoch": 6.003976143141153,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004708340712103667,
|
|
"loss": 5.0051,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 6.013916500994036,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004692984928929426,
|
|
"loss": 4.8317,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"epoch": 6.023856858846918,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00046776388491691633,
|
|
"loss": 4.8178,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 6.033797216699801,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00046623026573415716,
|
|
"loss": 4.7927,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"epoch": 6.043737574552684,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004646976537846449,
|
|
"loss": 4.8697,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 6.0536779324055665,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004631660674962489,
|
|
"loss": 4.6803,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"epoch": 6.063618290258449,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00046163552528450617,
|
|
"loss": 4.7957,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 6.073558648111332,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046010604555239964,
|
|
"loss": 4.8543,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"epoch": 6.083499005964215,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00045857764669013736,
|
|
"loss": 4.8197,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 6.093439363817097,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004570503470749312,
|
|
"loss": 4.8259,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"epoch": 6.10337972166998,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00045552416507077564,
|
|
"loss": 4.7964,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 6.113320079522863,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00045399911902822745,
|
|
"loss": 4.842,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"epoch": 6.1232604373757455,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00045247522728418467,
|
|
"loss": 4.7331,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 6.133200795228628,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00045095250816166624,
|
|
"loss": 4.8636,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"epoch": 6.143141153081511,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004494309799695916,
|
|
"loss": 4.8616,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 6.153081510934394,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00044791066100256105,
|
|
"loss": 4.8586,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"epoch": 6.163021868787276,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00044639156954063484,
|
|
"loss": 4.7848,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 6.172962226640159,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004448737238491143,
|
|
"loss": 4.8573,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"epoch": 6.182902584493042,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004433571421783216,
|
|
"loss": 4.8224,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 6.192842942345925,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00044184184276338046,
|
|
"loss": 4.8686,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"epoch": 6.202783300198807,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004403278438239975,
|
|
"loss": 4.7262,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 6.21272365805169,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004388151635642418,
|
|
"loss": 4.8178,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"epoch": 6.222664015904573,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004373038201723276,
|
|
"loss": 4.724,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 6.2326043737574555,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00043579383182039443,
|
|
"loss": 4.9154,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"epoch": 6.242544731610338,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00043428521666428945,
|
|
"loss": 4.8501,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 6.252485089463221,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004327779928433482,
|
|
"loss": 4.8747,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"epoch": 6.262425447316104,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00043127217848017743,
|
|
"loss": 4.8295,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 6.272365805168986,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00042976779168043676,
|
|
"loss": 4.844,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"epoch": 6.282306163021869,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000428264850532621,
|
|
"loss": 4.892,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 6.292246520874752,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004267633731078425,
|
|
"loss": 4.8446,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"epoch": 6.3021868787276345,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004252633774596143,
|
|
"loss": 4.7874,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 6.312127236580517,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004237648816236328,
|
|
"loss": 4.8716,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"epoch": 6.3220675944334,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004222679036175605,
|
|
"loss": 4.7429,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 6.332007952286283,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004207724614408105,
|
|
"loss": 4.9711,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"epoch": 6.341948310139165,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004192785730743287,
|
|
"loss": 4.8918,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 6.351888667992048,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004177862564803785,
|
|
"loss": 4.7297,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"epoch": 6.361829025844931,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004162955296023246,
|
|
"loss": 4.9112,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 6.3717693836978135,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00041480641036441724,
|
|
"loss": 4.8235,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"epoch": 6.381709741550695,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004133189166715766,
|
|
"loss": 4.8517,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 6.391650099403579,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00041183306640917727,
|
|
"loss": 4.7988,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"epoch": 6.401590457256461,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004103488774428341,
|
|
"loss": 4.9309,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 6.4115308151093435,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004088663676181864,
|
|
"loss": 4.826,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"epoch": 6.421471172962226,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00040738555476068386,
|
|
"loss": 4.9,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 6.431411530815109,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004059064566753722,
|
|
"loss": 4.8835,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"epoch": 6.441351888667992,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004044290911466789,
|
|
"loss": 4.8503,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 6.451292246520874,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00040295347593819955,
|
|
"loss": 4.8019,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"epoch": 6.461232604373757,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000401479628792484,
|
|
"loss": 4.9857,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 6.47117296222664,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00040000756743082354,
|
|
"loss": 4.8754,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"epoch": 6.481113320079523,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00039853730955303725,
|
|
"loss": 4.9805,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 6.491053677932405,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00039706887283725943,
|
|
"loss": 4.9007,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"epoch": 6.500994035785288,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000395602274939727,
|
|
"loss": 4.6004,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 6.510934393638171,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0003941375334945675,
|
|
"loss": 4.8384,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"epoch": 6.5208747514910534,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00039267466611358636,
|
|
"loss": 4.8327,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 6.530815109343936,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000391213690386056,
|
|
"loss": 4.8514,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"epoch": 6.540755467196819,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0003897546238785039,
|
|
"loss": 4.965,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 6.550695825049702,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00038829748413450095,
|
|
"loss": 4.8576,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"epoch": 6.560636182902584,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00038684228867445135,
|
|
"loss": 4.9319,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 6.570576540755467,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00038538905499538144,
|
|
"loss": 4.8882,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"epoch": 6.58051689860835,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0003839378005707297,
|
|
"loss": 4.8057,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 6.5904572564612325,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00038248854285013567,
|
|
"loss": 4.8606,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 6.600397614314115,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0003810412992592317,
|
|
"loss": 4.8841,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 6.610337972166998,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000379596087199432,
|
|
"loss": 4.8573,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"epoch": 6.620278330019881,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0003781529240477243,
|
|
"loss": 4.9168,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 6.630218687872763,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00037671182715646036,
|
|
"loss": 4.8127,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"epoch": 6.640159045725646,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0003752728138531479,
|
|
"loss": 4.8465,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 6.650099403578529,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0003738359014402417,
|
|
"loss": 4.7892,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"epoch": 6.6600397614314115,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0003724011071949357,
|
|
"loss": 4.9296,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 6.669980119284294,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00037096844836895546,
|
|
"loss": 4.7325,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"epoch": 6.679920477137177,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0003695379421883509,
|
|
"loss": 4.914,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 6.68986083499006,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00036810960585328836,
|
|
"loss": 4.8846,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"epoch": 6.699801192842942,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0003666834565378444,
|
|
"loss": 4.9634,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 6.709741550695825,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00036525951138979986,
|
|
"loss": 4.7747,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"epoch": 6.719681908548708,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0003638377875304324,
|
|
"loss": 4.8854,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 6.729622266401591,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00036241830205431134,
|
|
"loss": 4.8437,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"epoch": 6.739562624254473,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0003610010720290923,
|
|
"loss": 4.9233,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 6.749502982107356,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0003595861144953115,
|
|
"loss": 4.9041,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"epoch": 6.759443339960239,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00035817344646618134,
|
|
"loss": 4.8625,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 6.769383697813121,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0003567630849273854,
|
|
"loss": 4.8721,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"epoch": 6.779324055666004,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00035535504683687467,
|
|
"loss": 4.9343,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 6.789264413518887,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0003539493491246628,
|
|
"loss": 4.9468,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"epoch": 6.79920477137177,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0003525460086926239,
|
|
"loss": 4.8996,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 6.809145129224652,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0003511450424142878,
|
|
"loss": 4.851,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"epoch": 6.819085487077535,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00034974646713463854,
|
|
"loss": 4.8986,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 6.829025844930418,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00034835029966991083,
|
|
"loss": 4.8128,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"epoch": 6.8389662027833005,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0003469565568073884,
|
|
"loss": 4.9424,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 6.848906560636183,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00034556525530520166,
|
|
"loss": 4.9103,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"epoch": 6.858846918489066,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0003441764118921268,
|
|
"loss": 4.9301,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 6.868787276341949,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00034279004326738446,
|
|
"loss": 4.7545,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"epoch": 6.878727634194831,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0003414061661004383,
|
|
"loss": 4.8724,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 6.888667992047714,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00034002479703079593,
|
|
"loss": 4.9124,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"epoch": 6.898608349900597,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00033864595266780727,
|
|
"loss": 4.7911,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 6.9085487077534795,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00033726964959046596,
|
|
"loss": 4.9172,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"epoch": 6.918489065606362,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0003358959043472096,
|
|
"loss": 4.7924,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 6.928429423459244,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00033452473345572064,
|
|
"loss": 4.8789,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"epoch": 6.938369781312128,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00033315615340272827,
|
|
"loss": 4.8461,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 6.9483101391650095,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00033179018064380964,
|
|
"loss": 4.8758,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"epoch": 6.958250497017893,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0003304268316031922,
|
|
"loss": 4.907,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 6.958250497017893,
|
|
"eval_loss": 6.587252140045166,
|
|
"eval_runtime": 0.9889,
|
|
"eval_samples_per_second": 3503.757,
|
|
"eval_steps_per_second": 438.854,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 6.968190854870775,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00032906612267355673,
|
|
"loss": 4.9143,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"epoch": 6.9781312127236585,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0003277080702158389,
|
|
"loss": 4.9012,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 6.98807157057654,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000326352690559034,
|
|
"loss": 4.9948,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"epoch": 6.998011928429423,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00032500000000000015,
|
|
"loss": 4.9512,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 7.007952286282306,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0003236500148032616,
|
|
"loss": 4.8295,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"epoch": 7.0178926441351885,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00032230275120081445,
|
|
"loss": 4.7205,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 7.027833001988071,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0003209582253919302,
|
|
"loss": 4.6125,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"epoch": 7.037773359840954,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00031961645354296214,
|
|
"loss": 4.6474,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 7.047713717693837,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00031827745178714996,
|
|
"loss": 4.705,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"epoch": 7.057654075546719,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00031694123622442647,
|
|
"loss": 4.7174,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 7.067594433399602,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0003156078229212236,
|
|
"loss": 4.6565,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"epoch": 7.077534791252485,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00031427722791027953,
|
|
"loss": 4.6824,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 7.087475149105368,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0003129494671904457,
|
|
"loss": 4.794,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"epoch": 7.09741550695825,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0003116245567264944,
|
|
"loss": 4.6447,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 7.107355864811133,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00031030251244892714,
|
|
"loss": 4.6191,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"epoch": 7.117296222664016,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00030898335025378277,
|
|
"loss": 4.7078,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 7.1272365805168985,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0003076670860024464,
|
|
"loss": 4.6622,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"epoch": 7.137176938369781,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0003063537355214588,
|
|
"loss": 4.7675,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 7.147117296222664,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0003050433146023259,
|
|
"loss": 4.7234,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"epoch": 7.157057654075547,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00030373583900132975,
|
|
"loss": 4.8054,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 7.166998011928429,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0003024313244393377,
|
|
"loss": 4.6191,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"epoch": 7.176938369781312,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00030112978660161395,
|
|
"loss": 4.6637,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 7.186878727634195,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002998312411376315,
|
|
"loss": 4.7553,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"epoch": 7.1968190854870775,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00029853570366088336,
|
|
"loss": 4.6949,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 7.20675944333996,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002972431897486952,
|
|
"loss": 4.7082,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"epoch": 7.216699801192843,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00029595371494203754,
|
|
"loss": 4.7411,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 7.226640159045726,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0002946672947453395,
|
|
"loss": 4.7645,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"epoch": 7.236580516898608,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0002933839446263019,
|
|
"loss": 4.7813,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 7.246520874751491,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0002921036800157115,
|
|
"loss": 4.6833,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"epoch": 7.256461232604374,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002908265163072554,
|
|
"loss": 4.7884,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 7.2664015904572565,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0002895524688573361,
|
|
"loss": 4.6495,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"epoch": 7.276341948310139,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00028828155298488655,
|
|
"loss": 4.7162,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 7.286282306163022,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0002870137839711864,
|
|
"loss": 4.7134,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"epoch": 7.296222664015905,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00028574917705967765,
|
|
"loss": 4.7246,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 7.306163021868787,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002844877474557819,
|
|
"loss": 4.761,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"epoch": 7.31610337972167,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00028322951032671727,
|
|
"loss": 4.6278,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 7.326043737574553,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00028197448080131634,
|
|
"loss": 4.7252,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"epoch": 7.335984095427436,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0002807226739698437,
|
|
"loss": 4.7248,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 7.345924453280318,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0002794741048838149,
|
|
"loss": 4.6949,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"epoch": 7.355864811133201,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002782287885558155,
|
|
"loss": 4.7877,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 7.365805168986084,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0002769867399593201,
|
|
"loss": 4.6872,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"epoch": 7.3757455268389664,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00027574797402851313,
|
|
"loss": 4.7275,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 7.385685884691849,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00027451250565810857,
|
|
"loss": 4.6347,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"epoch": 7.395626242544732,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000273280349703171,
|
|
"loss": 4.7225,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 7.405566600397615,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00027205152097893695,
|
|
"loss": 4.5634,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"epoch": 7.415506958250497,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00027082603426063735,
|
|
"loss": 4.7793,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 7.42544731610338,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00026960390428331906,
|
|
"loss": 4.7748,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"epoch": 7.435387673956263,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00026838514574166814,
|
|
"loss": 4.5358,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 7.4453280318091455,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0002671697732898329,
|
|
"loss": 4.636,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"epoch": 7.455268389662028,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0002659578015412483,
|
|
"loss": 4.7551,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 7.465208747514911,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00026474924506845934,
|
|
"loss": 4.8286,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"epoch": 7.475149105367794,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0002635441184029466,
|
|
"loss": 4.7009,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 7.485089463220676,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00026234243603495125,
|
|
"loss": 4.6918,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"epoch": 7.495029821073558,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0002611442124133005,
|
|
"loss": 4.8079,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 7.504970178926442,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002599494619452345,
|
|
"loss": 4.4822,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"epoch": 7.514910536779324,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0002587581989962328,
|
|
"loss": 4.7089,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 7.524850894632207,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00025757043788984113,
|
|
"loss": 4.7166,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"epoch": 7.534791252485089,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0002563861929075003,
|
|
"loss": 4.7756,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 7.544731610337972,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00025520547828837347,
|
|
"loss": 4.772,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"epoch": 7.5546719681908545,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0002540283082291754,
|
|
"loss": 4.665,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 7.564612326043737,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002528546968840014,
|
|
"loss": 4.7564,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"epoch": 7.57455268389662,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00025168465836415785,
|
|
"loss": 4.6575,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 7.584493041749503,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00025051820673799166,
|
|
"loss": 4.786,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"epoch": 7.594433399602385,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00024935535603072176,
|
|
"loss": 4.7156,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 7.604373757455268,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00024819612022427027,
|
|
"loss": 4.7455,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"epoch": 7.614314115308151,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00024704051325709407,
|
|
"loss": 4.6002,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 7.6242544731610336,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00024588854902401797,
|
|
"loss": 4.6842,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"epoch": 7.634194831013916,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002447402413760668,
|
|
"loss": 4.8286,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 7.644135188866799,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00024359560412029913,
|
|
"loss": 4.7759,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"epoch": 7.654075546719682,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00024245465101964164,
|
|
"loss": 4.6149,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 7.664015904572564,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00024131739579272317,
|
|
"loss": 4.8054,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"epoch": 7.673956262425447,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00024018385211371,
|
|
"loss": 4.6753,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 7.68389662027833,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00023905403361214144,
|
|
"loss": 4.6202,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"epoch": 7.693836978131213,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0002379279538727657,
|
|
"loss": 4.6213,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 7.703777335984095,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00023680562643537689,
|
|
"loss": 4.7484,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"epoch": 7.713717693836978,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00023568706479465214,
|
|
"loss": 4.7135,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 7.723658051689861,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00023457228239998906,
|
|
"loss": 4.7813,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"epoch": 7.7335984095427435,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00023346129265534442,
|
|
"loss": 4.7033,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 7.743538767395626,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0002323541089190727,
|
|
"loss": 4.634,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"epoch": 7.753479125248509,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0002312507445037658,
|
|
"loss": 4.8234,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 7.763419483101392,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00023015121267609218,
|
|
"loss": 4.6978,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"epoch": 7.773359840954274,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0002290555266566385,
|
|
"loss": 4.7563,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 7.783300198807157,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00022796369961974977,
|
|
"loss": 4.7945,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"epoch": 7.79324055666004,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00022687574469337145,
|
|
"loss": 4.648,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 7.8031809145129225,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00022579167495889114,
|
|
"loss": 4.7676,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"epoch": 7.813121272365805,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00022471150345098175,
|
|
"loss": 4.7234,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 7.823061630218688,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00022363524315744437,
|
|
"loss": 4.7666,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"epoch": 7.833001988071571,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00022256290701905254,
|
|
"loss": 4.6092,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 7.842942345924453,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002214945079293962,
|
|
"loss": 4.6325,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"epoch": 7.852882703777336,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000220430058734727,
|
|
"loss": 4.7461,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 7.862823061630219,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00021936957223380368,
|
|
"loss": 4.6685,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"epoch": 7.8727634194831015,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0002183130611777382,
|
|
"loss": 4.7269,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 7.882703777335984,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00021726053826984248,
|
|
"loss": 4.7855,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"epoch": 7.892644135188867,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00021621201616547548,
|
|
"loss": 4.7606,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 7.90258449304175,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00021516750747189146,
|
|
"loss": 4.6604,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"epoch": 7.912524850894632,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00021412702474808782,
|
|
"loss": 4.5741,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 7.922465208747515,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00021309058050465447,
|
|
"loss": 4.7476,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"epoch": 7.932405566600398,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002120581872036233,
|
|
"loss": 4.7695,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 7.942345924453281,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00021102985725831848,
|
|
"loss": 4.5975,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"epoch": 7.952286282306163,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00021000560303320687,
|
|
"loss": 4.7038,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 7.952286282306163,
|
|
"eval_loss": 6.598598480224609,
|
|
"eval_runtime": 0.9871,
|
|
"eval_samples_per_second": 3510.425,
|
|
"eval_steps_per_second": 439.69,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 7.962226640159046,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00020898543684374953,
|
|
"loss": 4.7847,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"epoch": 7.972166998011929,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00020796937095625382,
|
|
"loss": 4.7834,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 7.9821073558648115,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00020695741758772576,
|
|
"loss": 4.6518,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"epoch": 7.992047713717694,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00020594958890572302,
|
|
"loss": 4.7082,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 8.001988071570576,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002049458970282088,
|
|
"loss": 4.7762,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"epoch": 8.01192842942346,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00020394635402340607,
|
|
"loss": 4.672,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 8.021868787276341,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0002029509719096524,
|
|
"loss": 4.6403,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"epoch": 8.031809145129225,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0002019597626552555,
|
|
"loss": 4.612,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 8.041749502982107,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0002009727381783494,
|
|
"loss": 4.5758,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"epoch": 8.05168986083499,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00019998991034675092,
|
|
"loss": 4.6523,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 8.061630218687872,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0001990112909778173,
|
|
"loss": 4.5683,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"epoch": 8.071570576540756,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00019803689183830385,
|
|
"loss": 4.6635,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 8.081510934393638,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0001970667246442224,
|
|
"loss": 4.5682,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"epoch": 8.091451292246521,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00019610080106070087,
|
|
"loss": 4.6741,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 8.101391650099403,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00019513913270184248,
|
|
"loss": 4.6582,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"epoch": 8.111332007952287,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00019418173113058644,
|
|
"loss": 4.4836,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 8.121272365805169,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00019322860785856878,
|
|
"loss": 4.4907,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"epoch": 8.131212723658052,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00019227977434598412,
|
|
"loss": 4.6087,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 8.141153081510934,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00019133524200144742,
|
|
"loss": 4.6471,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"epoch": 8.151093439363818,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0001903950221818575,
|
|
"loss": 4.6317,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 8.1610337972167,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0001894591261922598,
|
|
"loss": 4.6445,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"epoch": 8.170974155069583,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00018852756528571087,
|
|
"loss": 4.4262,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 8.180914512922465,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00018760035066314295,
|
|
"loss": 4.7058,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"epoch": 8.190854870775349,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00018667749347322938,
|
|
"loss": 4.5877,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 8.20079522862823,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00018575900481225028,
|
|
"loss": 4.6046,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"epoch": 8.210735586481114,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00018484489572395934,
|
|
"loss": 4.6345,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 8.220675944333996,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00018393517719945114,
|
|
"loss": 4.5759,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"epoch": 8.23061630218688,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00018302986017702875,
|
|
"loss": 4.5343,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 8.240556660039761,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00018212895554207232,
|
|
"loss": 4.7279,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"epoch": 8.250497017892645,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00018123247412690826,
|
|
"loss": 4.6596,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 8.260437375745527,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0001803404267106788,
|
|
"loss": 4.6755,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"epoch": 8.27037773359841,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0001794528240192126,
|
|
"loss": 4.52,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 8.280318091451292,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00017856967672489577,
|
|
"loss": 4.6399,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"epoch": 8.290258449304176,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00017769099544654332,
|
|
"loss": 4.6558,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 8.300198807157058,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00017681679074927164,
|
|
"loss": 4.6274,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"epoch": 8.310139165009941,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00017594707314437166,
|
|
"loss": 4.5461,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 8.320079522862823,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00017508185308918195,
|
|
"loss": 4.6264,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"epoch": 8.330019880715707,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0001742211409869636,
|
|
"loss": 4.6886,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 8.339960238568588,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00017336494718677472,
|
|
"loss": 4.6175,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"epoch": 8.34990059642147,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00017251328198334591,
|
|
"loss": 4.5912,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 8.359840954274354,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00017166615561695685,
|
|
"loss": 4.6233,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"epoch": 8.369781312127236,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00017082357827331323,
|
|
"loss": 4.7118,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 8.37972166998012,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0001699855600834237,
|
|
"loss": 4.4303,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"epoch": 8.389662027833001,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00016915211112347855,
|
|
"loss": 4.7001,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 8.399602385685885,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00016832324141472844,
|
|
"loss": 4.6361,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"epoch": 8.409542743538767,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00016749896092336383,
|
|
"loss": 4.6182,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 8.41948310139165,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0001666792795603952,
|
|
"loss": 4.5153,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"epoch": 8.429423459244532,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00016586420718153394,
|
|
"loss": 4.575,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 8.439363817097416,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00016505375358707373,
|
|
"loss": 4.6754,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"epoch": 8.449304174950298,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00016424792852177278,
|
|
"loss": 4.6452,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 8.459244532803181,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00016344674167473655,
|
|
"loss": 4.6821,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"epoch": 8.469184890656063,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00016265020267930161,
|
|
"loss": 4.6335,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 8.479125248508947,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00016185832111291924,
|
|
"loss": 4.6286,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"epoch": 8.489065606361828,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00016107110649704064,
|
|
"loss": 4.6015,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 8.499005964214712,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00016028856829700258,
|
|
"loss": 4.6084,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"epoch": 8.508946322067594,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0001595107159219132,
|
|
"loss": 4.6903,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 8.518886679920477,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00015873755872453904,
|
|
"loss": 4.7057,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"epoch": 8.52882703777336,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0001579691060011927,
|
|
"loss": 4.6179,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 8.538767395626243,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00015720536699162112,
|
|
"loss": 4.5584,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"epoch": 8.548707753479125,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0001564463508788939,
|
|
"loss": 4.694,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 8.558648111332008,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0001556920667892937,
|
|
"loss": 4.6101,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"epoch": 8.56858846918489,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000154942523792206,
|
|
"loss": 4.6478,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 8.578528827037774,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0001541977309000102,
|
|
"loss": 4.5603,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"epoch": 8.588469184890656,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00015345769706797132,
|
|
"loss": 4.6485,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 8.59840954274354,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000152722431194132,
|
|
"loss": 4.5807,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"epoch": 8.608349900596421,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00015199194211920595,
|
|
"loss": 4.6023,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 8.618290258449305,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00015126623862647142,
|
|
"loss": 4.5958,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"epoch": 8.628230616302186,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0001505453294416655,
|
|
"loss": 4.607,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 8.63817097415507,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0001498292232328796,
|
|
"loss": 4.6336,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"epoch": 8.648111332007952,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0001491179286104546,
|
|
"loss": 4.6872,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 8.658051689860836,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00014841145412687796,
|
|
"loss": 4.653,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"epoch": 8.667992047713717,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00014770980827668047,
|
|
"loss": 4.6053,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 8.677932405566601,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0001470129994963342,
|
|
"loss": 4.6167,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"epoch": 8.687872763419483,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0001463210361641512,
|
|
"loss": 4.7008,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 8.697813121272366,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00014563392660018253,
|
|
"loss": 4.6053,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"epoch": 8.707753479125248,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00014495167906611856,
|
|
"loss": 4.7543,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 8.717693836978132,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00014427430176518926,
|
|
"loss": 4.6379,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"epoch": 8.727634194831014,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00014360180284206592,
|
|
"loss": 4.7382,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 8.737574552683897,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0001429341903827626,
|
|
"loss": 4.7364,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"epoch": 8.747514910536779,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00014227147241453985,
|
|
"loss": 4.7337,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 8.757455268389663,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0001416136569058074,
|
|
"loss": 4.6204,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"epoch": 8.767395626242545,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00014096075176602875,
|
|
"loss": 4.5669,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 8.777335984095428,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0001403127648456258,
|
|
"loss": 4.6668,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"epoch": 8.78727634194831,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00013966970393588474,
|
|
"loss": 4.6719,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 8.797216699801194,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00013903157676886223,
|
|
"loss": 4.5721,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"epoch": 8.807157057654075,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00013839839101729233,
|
|
"loss": 4.6441,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 8.817097415506959,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00013777015429449445,
|
|
"loss": 4.7086,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"epoch": 8.82703777335984,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0001371468741542816,
|
|
"loss": 4.6569,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 8.836978131212724,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00013652855809086984,
|
|
"loss": 4.6813,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"epoch": 8.846918489065606,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00013591521353878782,
|
|
"loss": 4.7261,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 8.856858846918488,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0001353068478727877,
|
|
"loss": 4.5968,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"epoch": 8.866799204771372,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00013470346840775634,
|
|
"loss": 4.6258,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 8.876739562624255,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00013410508239862718,
|
|
"loss": 4.6303,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"epoch": 8.886679920477137,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00013351169704029343,
|
|
"loss": 4.6891,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 8.896620278330019,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0001329233194675211,
|
|
"loss": 4.5401,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"epoch": 8.906560636182903,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00013233995675486368,
|
|
"loss": 4.6145,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 8.916500994035786,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00013176161591657654,
|
|
"loss": 4.5558,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"epoch": 8.926441351888668,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0001311883039065332,
|
|
"loss": 4.6623,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 8.93638170974155,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0001306200276181409,
|
|
"loss": 4.631,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"epoch": 8.946322067594433,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0001300567938842587,
|
|
"loss": 4.6715,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 8.946322067594433,
|
|
"eval_loss": 6.609087944030762,
|
|
"eval_runtime": 0.9934,
|
|
"eval_samples_per_second": 3487.851,
|
|
"eval_steps_per_second": 436.862,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 8.956262425447315,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00012949860947711458,
|
|
"loss": 4.5913,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"epoch": 8.966202783300199,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0001289454811082243,
|
|
"loss": 4.6589,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 8.97614314115308,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00012839741542831069,
|
|
"loss": 4.5529,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"epoch": 8.986083499005964,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00012785441902722364,
|
|
"loss": 4.7133,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 8.996023856858846,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0001273164984338609,
|
|
"loss": 4.595,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"epoch": 9.00596421471173,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00012678366011608955,
|
|
"loss": 4.5176,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 9.015904572564612,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00012625591048066816,
|
|
"loss": 4.634,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"epoch": 9.025844930417495,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00012573325587317003,
|
|
"loss": 4.6876,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 9.035785288270377,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00012521570257790644,
|
|
"loss": 4.649,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"epoch": 9.04572564612326,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00012470325681785161,
|
|
"loss": 4.4619,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 9.055666003976143,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00012419592475456737,
|
|
"loss": 4.6049,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"epoch": 9.065606361829026,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00012369371248812955,
|
|
"loss": 4.4383,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 9.075546719681908,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00012319662605705418,
|
|
"loss": 4.6178,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"epoch": 9.085487077534792,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00012270467143822523,
|
|
"loss": 4.5343,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 9.095427435387673,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00012221785454682267,
|
|
"loss": 4.4714,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"epoch": 9.105367793240557,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00012173618123625113,
|
|
"loss": 4.5174,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 9.115308151093439,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00012125965729806981,
|
|
"loss": 4.6209,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"epoch": 9.125248508946322,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00012078828846192275,
|
|
"loss": 4.6087,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 9.135188866799204,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00012032208039546979,
|
|
"loss": 4.4441,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"epoch": 9.145129224652088,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00011986103870431853,
|
|
"loss": 4.6316,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 9.15506958250497,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00011940516893195715,
|
|
"loss": 4.4901,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"epoch": 9.165009940357853,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00011895447655968729,
|
|
"loss": 4.6695,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 9.174950298210735,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00011850896700655852,
|
|
"loss": 4.6445,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"epoch": 9.184890656063619,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00011806864562930313,
|
|
"loss": 4.5255,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 9.1948310139165,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00011763351772227151,
|
|
"loss": 4.6115,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"epoch": 9.204771371769384,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00011720358851736869,
|
|
"loss": 4.6291,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 9.214711729622266,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00011677886318399139,
|
|
"loss": 4.5363,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"epoch": 9.22465208747515,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00011635934682896586,
|
|
"loss": 4.5902,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 9.234592445328031,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00011594504449648645,
|
|
"loss": 4.509,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"epoch": 9.244532803180915,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00011553596116805495,
|
|
"loss": 4.6383,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 9.254473161033797,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0001151321017624208,
|
|
"loss": 4.6914,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"epoch": 9.26441351888668,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00011473347113552179,
|
|
"loss": 4.6307,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 9.274353876739562,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00011434007408042576,
|
|
"loss": 4.6423,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"epoch": 9.284294234592446,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00011395191532727306,
|
|
"loss": 4.6707,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 9.294234592445328,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00011356899954321936,
|
|
"loss": 4.5052,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"epoch": 9.304174950298211,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00011319133133237998,
|
|
"loss": 4.6264,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 9.314115308151093,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00011281891523577412,
|
|
"loss": 4.6456,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"epoch": 9.324055666003977,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00011245175573127065,
|
|
"loss": 4.6149,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 9.333996023856859,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00011208985723353378,
|
|
"loss": 4.525,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"epoch": 9.343936381709742,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00011173322409397041,
|
|
"loss": 4.5987,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 9.353876739562624,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00011138186060067774,
|
|
"loss": 4.5696,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"epoch": 9.363817097415508,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00011103577097839149,
|
|
"loss": 4.6074,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 9.37375745526839,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00011069495938843527,
|
|
"loss": 4.4032,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"epoch": 9.383697813121273,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00011035942992867054,
|
|
"loss": 4.4702,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 9.393638170974155,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00011002918663344732,
|
|
"loss": 4.5273,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"epoch": 9.403578528827039,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00010970423347355563,
|
|
"loss": 4.611,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 9.41351888667992,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00010938457435617775,
|
|
"loss": 4.6382,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"epoch": 9.423459244532804,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00010907021312484145,
|
|
"loss": 4.4596,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 9.433399602385686,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00010876115355937341,
|
|
"loss": 4.6563,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"epoch": 9.443339960238568,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00010845739937585413,
|
|
"loss": 4.6698,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 9.453280318091451,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00010815895422657296,
|
|
"loss": 4.6652,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"epoch": 9.463220675944333,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00010786582169998433,
|
|
"loss": 4.63,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 9.473161033797217,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00010757800532066469,
|
|
"loss": 4.6623,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"epoch": 9.483101391650099,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00010729550854926994,
|
|
"loss": 4.6685,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 9.493041749502982,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00010701833478249384,
|
|
"loss": 4.6494,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"epoch": 9.502982107355864,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00010674648735302741,
|
|
"loss": 4.6705,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 9.512922465208748,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00010647996952951861,
|
|
"loss": 4.6408,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"epoch": 9.52286282306163,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00010621878451653301,
|
|
"loss": 4.6534,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 9.532803180914513,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00010596293545451544,
|
|
"loss": 4.5444,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"epoch": 9.542743538767395,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00010571242541975214,
|
|
"loss": 4.5749,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 9.552683896620279,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00010546725742433386,
|
|
"loss": 4.4601,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"epoch": 9.56262425447316,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0001052274344161194,
|
|
"loss": 4.5162,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 9.572564612326044,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0001049929592787005,
|
|
"loss": 4.6273,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"epoch": 9.582504970178926,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00010476383483136686,
|
|
"loss": 4.5508,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 9.59244532803181,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00010454006382907255,
|
|
"loss": 4.6645,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"epoch": 9.602385685884691,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00010432164896240246,
|
|
"loss": 4.6665,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 9.612326043737575,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0001041085928575404,
|
|
"loss": 4.5439,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"epoch": 9.622266401590457,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0001039008980762373,
|
|
"loss": 4.6502,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 9.63220675944334,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0001036985671157804,
|
|
"loss": 4.5608,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"epoch": 9.642147117296222,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00010350160240896322,
|
|
"loss": 4.6016,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 9.652087475149106,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00010331000632405643,
|
|
"loss": 4.5933,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"epoch": 9.662027833001988,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00010312378116477925,
|
|
"loss": 4.6101,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 9.671968190854871,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00010294292917027178,
|
|
"loss": 4.6727,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"epoch": 9.681908548707753,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00010276745251506802,
|
|
"loss": 4.6492,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 9.691848906560637,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00010259735330906998,
|
|
"loss": 4.5769,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"epoch": 9.701789264413518,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00010243263359752201,
|
|
"loss": 4.5999,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 9.711729622266402,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00010227329536098623,
|
|
"loss": 4.6217,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"epoch": 9.721669980119284,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00010211934051531895,
|
|
"loss": 4.5847,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 9.731610337972167,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00010197077091164744,
|
|
"loss": 4.6056,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"epoch": 9.74155069582505,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00010182758833634767,
|
|
"loss": 4.5723,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 9.751491053677933,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00010168979451102295,
|
|
"loss": 4.6376,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"epoch": 9.761431411530815,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00010155739109248322,
|
|
"loss": 4.6147,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 9.771371769383698,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0001014303796727249,
|
|
"loss": 4.6909,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"epoch": 9.78131212723658,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00010130876177891203,
|
|
"loss": 4.6076,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 9.791252485089464,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0001011925388733578,
|
|
"loss": 4.6118,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"epoch": 9.801192842942346,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00010108171235350698,
|
|
"loss": 4.616,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 9.81113320079523,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00010097628355191905,
|
|
"loss": 4.6284,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"epoch": 9.821073558648111,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00010087625373625225,
|
|
"loss": 4.5675,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 9.831013916500995,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00010078162410924831,
|
|
"loss": 4.5976,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"epoch": 9.840954274353876,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00010069239580871804,
|
|
"loss": 4.4535,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 9.85089463220676,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00010060856990752757,
|
|
"loss": 4.4907,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"epoch": 9.860834990059642,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00010053014741358548,
|
|
"loss": 4.6067,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 9.870775347912526,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00010045712926983064,
|
|
"loss": 4.5656,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"epoch": 9.880715705765407,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00010038951635422104,
|
|
"loss": 4.6429,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 9.890656063618291,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00010032730947972305,
|
|
"loss": 4.6575,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"epoch": 9.900596421471173,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00010027050939430167,
|
|
"loss": 4.6605,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 9.910536779324056,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00010021911678091158,
|
|
"loss": 4.5164,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"epoch": 9.920477137176938,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00010017313225748906,
|
|
"loss": 4.536,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 9.930417495029822,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00010013255637694424,
|
|
"loss": 4.5897,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"epoch": 9.940357852882704,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00010009738962715469,
|
|
"loss": 4.5739,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 9.940357852882704,
|
|
"eval_loss": 6.611749172210693,
|
|
"eval_runtime": 1.0018,
|
|
"eval_samples_per_second": 3458.669,
|
|
"eval_steps_per_second": 433.207,
|
|
"step": 5000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 5030,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6944724800870400.0,
|
|
"train_batch_size": 32,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|