2866 lines
64 KiB
JSON
2866 lines
64 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.9761431411530817,
|
|
"eval_steps": 500,
|
|
"global_step": 2000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.009940357852882704,
|
|
"grad_norm": 7.9375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 10.9268,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.019880715705765408,
|
|
"grad_norm": 7.125,
|
|
"learning_rate": 2e-05,
|
|
"loss": 10.8658,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.02982107355864811,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 3e-05,
|
|
"loss": 10.6834,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.039761431411530816,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 4e-05,
|
|
"loss": 10.4582,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.04970178926441352,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 5e-05,
|
|
"loss": 10.2988,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.05964214711729622,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 6e-05,
|
|
"loss": 10.2331,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.06958250497017893,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 7.000000000000001e-05,
|
|
"loss": 10.0708,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.07952286282306163,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 8e-05,
|
|
"loss": 9.92,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.08946322067594434,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 8.999999999999999e-05,
|
|
"loss": 9.7669,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.09940357852882704,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0001,
|
|
"loss": 9.5628,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.10934393638170974,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00011,
|
|
"loss": 9.4269,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.11928429423459244,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00012,
|
|
"loss": 9.1929,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.12922465208747516,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00013000000000000002,
|
|
"loss": 8.9863,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.13916500994035785,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00014000000000000001,
|
|
"loss": 8.8215,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.14910536779324055,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00015,
|
|
"loss": 8.6842,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.15904572564612326,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00016,
|
|
"loss": 8.5892,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.16898608349900596,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00017,
|
|
"loss": 8.4767,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.17892644135188868,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00017999999999999998,
|
|
"loss": 8.4466,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.18886679920477137,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00019,
|
|
"loss": 8.4066,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.1988071570576541,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002,
|
|
"loss": 8.4231,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.20874751491053678,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00021,
|
|
"loss": 8.3899,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.21868787276341947,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00022,
|
|
"loss": 8.4064,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.2286282306163022,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00023,
|
|
"loss": 8.3748,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.23856858846918488,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00024,
|
|
"loss": 8.3608,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.2485089463220676,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00025,
|
|
"loss": 8.297,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.2584493041749503,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00026000000000000003,
|
|
"loss": 8.3535,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.268389662027833,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00027,
|
|
"loss": 8.2688,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.2783300198807157,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.00028000000000000003,
|
|
"loss": 8.2854,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.2882703777335984,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.00029,
|
|
"loss": 8.1947,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.2982107355864811,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0003,
|
|
"loss": 8.1805,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.3081510934393638,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00031,
|
|
"loss": 8.2251,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.31809145129224653,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00032,
|
|
"loss": 8.1777,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.32803180914512925,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00033,
|
|
"loss": 8.1526,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.3379721669980119,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.00034,
|
|
"loss": 8.1298,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.34791252485089463,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00035,
|
|
"loss": 8.0809,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.35785288270377735,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00035999999999999997,
|
|
"loss": 8.0967,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.36779324055666,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00037,
|
|
"loss": 8.0563,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.37773359840954274,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00038,
|
|
"loss": 8.0444,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.38767395626242546,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00039000000000000005,
|
|
"loss": 8.08,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.3976143141153082,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004,
|
|
"loss": 8.0104,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.40755467196819084,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00041,
|
|
"loss": 7.9939,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.41749502982107356,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00042,
|
|
"loss": 7.9853,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.4274353876739563,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00043,
|
|
"loss": 7.9695,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.43737574552683894,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00044,
|
|
"loss": 7.9762,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.44731610337972166,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00045000000000000004,
|
|
"loss": 7.9428,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.4572564612326044,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046,
|
|
"loss": 7.8805,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.4671968190854871,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00047,
|
|
"loss": 7.8542,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.47713717693836977,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048,
|
|
"loss": 7.9372,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.4870775347912525,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00049,
|
|
"loss": 7.8429,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.4970178926441352,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0005,
|
|
"loss": 7.797,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.5069582504970179,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00051,
|
|
"loss": 7.8132,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.5168986083499006,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0005200000000000001,
|
|
"loss": 7.8295,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.5268389662027833,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0005300000000000001,
|
|
"loss": 7.777,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.536779324055666,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00054,
|
|
"loss": 7.7526,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.5467196819085487,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00055,
|
|
"loss": 7.7913,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.5566600397614314,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0005600000000000001,
|
|
"loss": 7.7696,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.5666003976143141,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00057,
|
|
"loss": 7.7029,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.5765407554671969,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00058,
|
|
"loss": 7.7941,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.5864811133200796,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00059,
|
|
"loss": 7.7566,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.5964214711729622,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0006,
|
|
"loss": 7.6959,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.6063618290258449,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00061,
|
|
"loss": 7.6751,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.6163021868787276,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00062,
|
|
"loss": 7.6124,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.6262425447316103,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00063,
|
|
"loss": 7.6487,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.6361829025844931,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00064,
|
|
"loss": 7.5655,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.6461232604373758,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0006500000000000001,
|
|
"loss": 7.673,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.6560636182902585,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00066,
|
|
"loss": 7.6439,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.6660039761431411,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00067,
|
|
"loss": 7.5623,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.6759443339960238,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00068,
|
|
"loss": 7.611,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.6858846918489065,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00069,
|
|
"loss": 7.5456,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.6958250497017893,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0007,
|
|
"loss": 7.6076,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.705765407554672,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00071,
|
|
"loss": 7.5633,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.7157057654075547,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0007199999999999999,
|
|
"loss": 7.5467,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.7256461232604374,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00073,
|
|
"loss": 7.524,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.73558648111332,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00074,
|
|
"loss": 7.5077,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.7455268389662028,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00075,
|
|
"loss": 7.418,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.7554671968190855,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00076,
|
|
"loss": 7.4668,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.7654075546719682,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0007700000000000001,
|
|
"loss": 7.5067,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.7753479125248509,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0007800000000000001,
|
|
"loss": 7.4381,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.7852882703777336,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00079,
|
|
"loss": 7.4506,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.7952286282306164,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0008,
|
|
"loss": 7.414,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.805168986083499,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0008100000000000001,
|
|
"loss": 7.4373,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.8151093439363817,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00082,
|
|
"loss": 7.4198,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.8250497017892644,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00083,
|
|
"loss": 7.432,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.8349900596421471,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00084,
|
|
"loss": 7.3658,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.8449304174950298,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00085,
|
|
"loss": 7.4176,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.8548707753479126,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00086,
|
|
"loss": 7.3672,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.8648111332007953,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00087,
|
|
"loss": 7.192,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.8747514910536779,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00088,
|
|
"loss": 7.3876,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.8846918489065606,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0008900000000000001,
|
|
"loss": 7.3364,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.8946322067594433,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009000000000000001,
|
|
"loss": 7.4146,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.904572564612326,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00091,
|
|
"loss": 7.3658,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.9145129224652088,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00092,
|
|
"loss": 7.4027,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.9244532803180915,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00093,
|
|
"loss": 7.389,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.9343936381709742,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00094,
|
|
"loss": 7.2114,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.9443339960238568,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00095,
|
|
"loss": 7.2808,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.9542743538767395,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00096,
|
|
"loss": 7.2878,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.9642147117296223,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0009699999999999999,
|
|
"loss": 7.2404,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.974155069582505,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00098,
|
|
"loss": 7.1975,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.9840954274353877,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00099,
|
|
"loss": 7.3406,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.9940357852882704,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.001,
|
|
"loss": 7.2716,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.9940357852882704,
|
|
"eval_loss": 7.370969295501709,
|
|
"eval_runtime": 1.0019,
|
|
"eval_samples_per_second": 3458.582,
|
|
"eval_steps_per_second": 433.196,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.0039761431411531,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0009999972946377045,
|
|
"loss": 7.1343,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 1.0139165009940359,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009999891785833469,
|
|
"loss": 6.9134,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 1.0238568588469186,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0009999756519345133,
|
|
"loss": 7.1282,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 1.0337972166998013,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009999567148538456,
|
|
"loss": 7.0434,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.0437375745526838,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0009999323675690406,
|
|
"loss": 6.9941,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 1.0536779324055665,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0009999026103728454,
|
|
"loss": 7.0054,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.0636182902584492,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0009998674436230558,
|
|
"loss": 7.066,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.073558648111332,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.000999826867742511,
|
|
"loss": 7.0517,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.0834990059642147,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0009997808832190884,
|
|
"loss": 7.0508,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.0934393638170974,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0009997294906056982,
|
|
"loss": 7.0144,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.10337972166998,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000999672690520277,
|
|
"loss": 7.0313,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.1133200795228628,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000999610483645779,
|
|
"loss": 6.9706,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.1232604373757455,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009995428707301694,
|
|
"loss": 6.8887,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.1332007952286283,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009994698525864147,
|
|
"loss": 7.0384,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.143141153081511,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0009993914300924726,
|
|
"loss": 7.0223,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.1530815109343937,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000999307604191282,
|
|
"loss": 6.9796,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.1630218687872764,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009992183758907518,
|
|
"loss": 6.9398,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.1729622266401591,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0009991237462637478,
|
|
"loss": 7.0023,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.1829025844930419,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000999023716448081,
|
|
"loss": 6.9849,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.1928429423459244,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009989182876464931,
|
|
"loss": 6.7996,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.202783300198807,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0009988074611266423,
|
|
"loss": 6.9645,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.2127236580516898,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000998691238221088,
|
|
"loss": 7.0538,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 1.2226640159045725,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009985696203272752,
|
|
"loss": 6.9212,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 1.2326043737574552,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009984426089075168,
|
|
"loss": 6.9601,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.242544731610338,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000998310205488977,
|
|
"loss": 7.0175,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.2524850894632207,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009981724116636525,
|
|
"loss": 6.9285,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 1.2624254473161034,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0009980292290883526,
|
|
"loss": 6.9276,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 1.2723658051689861,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000997880659484681,
|
|
"loss": 6.898,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.2823061630218688,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009977267046390138,
|
|
"loss": 6.9623,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 1.2922465208747516,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000997567366402478,
|
|
"loss": 6.9184,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 1.302186878727634,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009974026466909299,
|
|
"loss": 6.8785,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 1.3121272365805168,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000997232547484932,
|
|
"loss": 6.9804,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.3220675944333995,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0009970570708297281,
|
|
"loss": 6.9272,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 1.3320079522862822,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0009968762188352208,
|
|
"loss": 6.8377,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 1.341948310139165,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0009966899936759436,
|
|
"loss": 6.8753,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 1.3518886679920477,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009964983975910369,
|
|
"loss": 6.8766,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.3618290258449304,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0009963014328842196,
|
|
"loss": 6.7019,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 1.371769383697813,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009960991019237627,
|
|
"loss": 6.8576,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 1.3817097415506958,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009958914071424596,
|
|
"loss": 6.8327,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 1.3916500994035785,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009956783510375975,
|
|
"loss": 6.8854,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.4015904572564613,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009954599361709276,
|
|
"loss": 6.8141,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 1.411530815109344,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009952361651686331,
|
|
"loss": 6.8819,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 1.4214711729622267,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009950070407212996,
|
|
"loss": 6.7342,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 1.4314115308151094,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009947725655838806,
|
|
"loss": 6.9538,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.4413518886679921,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009945327425756661,
|
|
"loss": 6.7637,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 1.4512922465208749,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000994287574580248,
|
|
"loss": 6.8231,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 1.4612326043737576,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009940370645454848,
|
|
"loss": 6.7927,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 1.4711729622266403,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000993781215483467,
|
|
"loss": 6.7573,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.4811133200795228,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0009935200304704815,
|
|
"loss": 6.8108,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 1.4910536779324055,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009932535126469725,
|
|
"loss": 6.7694,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 1.5009940357852882,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0009929816652175063,
|
|
"loss": 6.8232,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 1.510934393638171,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00099270449145073,
|
|
"loss": 6.7734,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.5208747514910537,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0009924219946793353,
|
|
"loss": 6.7136,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 1.5308151093439364,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009921341783000158,
|
|
"loss": 6.7783,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 1.540755467196819,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000991841045773427,
|
|
"loss": 6.7885,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 1.5506958250497018,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000991542600624146,
|
|
"loss": 6.8011,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.5606361829025845,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009912388464406265,
|
|
"loss": 6.7833,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 1.570576540755467,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009909297868751585,
|
|
"loss": 6.8117,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 1.5805168986083498,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009906154256438223,
|
|
"loss": 6.7266,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 1.5904572564612325,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009902957665264443,
|
|
"loss": 6.7726,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.6003976143141152,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009899708133665529,
|
|
"loss": 6.6905,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 1.610337972166998,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009896405700713295,
|
|
"loss": 6.7339,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 1.6202783300198806,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000989305040611565,
|
|
"loss": 6.7015,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 1.6302186878727634,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009889642290216085,
|
|
"loss": 6.8003,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.640159045725646,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009886181393993223,
|
|
"loss": 6.5606,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 1.6500994035785288,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009882667759060298,
|
|
"loss": 6.6673,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 1.6600397614314115,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009879101427664662,
|
|
"loss": 6.756,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 1.6699801192842942,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009875482442687294,
|
|
"loss": 6.7176,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.679920477137177,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009871810847642258,
|
|
"loss": 6.7124,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 1.6898608349900597,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00098680866866762,
|
|
"loss": 6.6817,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 1.6998011928429424,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009864310004567807,
|
|
"loss": 6.6662,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 1.7097415506958251,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000986048084672727,
|
|
"loss": 6.7013,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.7196819085487078,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009856599259195741,
|
|
"loss": 6.7312,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 1.7296222664015906,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009852665288644783,
|
|
"loss": 6.733,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 1.7395626242544733,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000984867898237579,
|
|
"loss": 6.7149,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 1.749502982107356,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000984464038831945,
|
|
"loss": 6.6369,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.7594433399602387,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0009840549555035136,
|
|
"loss": 6.6816,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 1.7693836978131214,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009836406531710342,
|
|
"loss": 6.6142,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 1.779324055666004,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009832211368160087,
|
|
"loss": 6.661,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 1.7892644135188867,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009827964114826314,
|
|
"loss": 6.7227,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.7992047713717694,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009823664822777285,
|
|
"loss": 6.6658,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 1.809145129224652,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000981931354370697,
|
|
"loss": 6.6343,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 1.8190854870775348,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009814910329934414,
|
|
"loss": 6.6246,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 1.8290258449304175,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0009810455234403126,
|
|
"loss": 6.6103,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.8389662027833003,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000980594831068043,
|
|
"loss": 6.6444,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 1.8489065606361827,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009801389612956815,
|
|
"loss": 6.58,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 1.8588469184890655,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0009796779196045303,
|
|
"loss": 6.4323,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 1.8687872763419482,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009792117115380774,
|
|
"loss": 6.5837,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.878727634194831,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009787403427019303,
|
|
"loss": 6.5917,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 1.8886679920477136,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000978263818763749,
|
|
"loss": 6.5789,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 1.8986083499005963,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009777821454531775,
|
|
"loss": 6.6152,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 1.908548707753479,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0009772953285617748,
|
|
"loss": 6.5594,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.9184890656063618,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009768033739429459,
|
|
"loss": 6.5918,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 1.9284294234592445,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009763062875118706,
|
|
"loss": 6.5099,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 1.9383697813121272,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0009758040752454326,
|
|
"loss": 6.5962,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 1.94831013916501,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009752967431821485,
|
|
"loss": 6.6482,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.9582504970178927,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009747842974220936,
|
|
"loss": 6.5015,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.9681908548707754,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00097426674412683,
|
|
"loss": 6.5312,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.978131212723658,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009737440895193317,
|
|
"loss": 6.7128,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.9880715705765408,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009732163398839106,
|
|
"loss": 6.4696,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.9880715705765408,
|
|
"eval_loss": 6.7939839363098145,
|
|
"eval_runtime": 0.998,
|
|
"eval_samples_per_second": 3471.994,
|
|
"eval_steps_per_second": 434.876,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.9980119284294235,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0009726835015661391,
|
|
"loss": 6.5883,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 2.0079522862823063,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0009721455809727765,
|
|
"loss": 6.1925,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 2.017892644135189,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009716025845716894,
|
|
"loss": 5.9461,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 2.0278330019880717,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009710545188917757,
|
|
"loss": 6.1817,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 2.0377733598409544,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009705013905228854,
|
|
"loss": 6.1649,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 2.047713717693837,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009699432061157414,
|
|
"loss": 6.1353,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 2.05765407554672,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0009693799723818591,
|
|
"loss": 6.259,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 2.0675944333996026,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009688116960934669,
|
|
"loss": 6.2553,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 2.0775347912524853,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0009682383840834234,
|
|
"loss": 6.1891,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 2.0874751491053676,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0009676600432451364,
|
|
"loss": 6.1401,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 2.0974155069582503,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009670766805324789,
|
|
"loss": 6.2547,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 2.107355864811133,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009664883029597066,
|
|
"loss": 6.1731,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 2.1172962226640157,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009658949176013729,
|
|
"loss": 6.1449,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 2.1272365805168985,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009652965315922438,
|
|
"loss": 6.1843,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 2.137176938369781,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009646931521272123,
|
|
"loss": 6.1649,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 2.147117296222664,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009640847864612124,
|
|
"loss": 6.0978,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 2.1570576540755466,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009634714419091302,
|
|
"loss": 6.1817,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 2.1669980119284293,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009628531258457185,
|
|
"loss": 6.2633,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 2.176938369781312,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009622298457055056,
|
|
"loss": 6.0898,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 2.1868787276341948,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0009616016089827078,
|
|
"loss": 6.1527,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.1968190854870775,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0009609684232311378,
|
|
"loss": 6.0889,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 2.20675944333996,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009603302960641154,
|
|
"loss": 6.0709,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 2.216699801192843,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009596872351543742,
|
|
"loss": 6.1364,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 2.2266401590457257,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009590392482339713,
|
|
"loss": 6.0234,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.2365805168986084,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009583863430941926,
|
|
"loss": 6.0574,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 2.246520874751491,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009577285275854602,
|
|
"loss": 6.0683,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 2.256461232604374,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009570658096172374,
|
|
"loss": 6.1882,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 2.2664015904572565,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009563981971579342,
|
|
"loss": 6.209,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.2763419483101393,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009557256982348107,
|
|
"loss": 6.1383,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 2.286282306163022,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009550483209338814,
|
|
"loss": 6.2263,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 2.2962226640159047,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0009543660733998174,
|
|
"loss": 6.1201,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 2.3061630218687874,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009536789638358488,
|
|
"loss": 6.1434,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.31610337972167,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000952987000503666,
|
|
"loss": 6.1207,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 2.326043737574553,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009522901917233196,
|
|
"loss": 5.9989,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 2.3359840954274356,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000951588545873122,
|
|
"loss": 6.0769,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 2.3459244532803183,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0009508820713895454,
|
|
"loss": 6.0889,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.355864811133201,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009501707767671204,
|
|
"loss": 6.207,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 2.3658051689860837,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009494546705583344,
|
|
"loss": 6.2066,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 2.3757455268389664,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009487337613735288,
|
|
"loss": 6.1389,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 2.3856858846918487,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009480080578807941,
|
|
"loss": 6.0482,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 2.3956262425447314,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009472775688058681,
|
|
"loss": 6.157,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 2.405566600397614,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009465423029320288,
|
|
"loss": 6.1028,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 2.415506958250497,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009458022690999899,
|
|
"loss": 6.091,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 2.4254473161033796,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000945057476207794,
|
|
"loss": 6.2019,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 2.4353876739562623,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0009443079332107064,
|
|
"loss": 6.0989,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 2.445328031809145,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0009435536491211062,
|
|
"loss": 6.127,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 2.4552683896620278,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0009427946330083791,
|
|
"loss": 6.2099,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 2.4652087475149105,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009420308939988073,
|
|
"loss": 6.1458,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 2.475149105367793,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000941262441275461,
|
|
"loss": 6.2149,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 2.485089463220676,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0009404892840780868,
|
|
"loss": 6.1122,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 2.4950298210735586,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009397114317029974,
|
|
"loss": 6.0771,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 2.5049701789264414,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0009389288935029595,
|
|
"loss": 6.1618,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 2.514910536779324,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0009381416788870807,
|
|
"loss": 6.2248,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 2.524850894632207,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0009373497973206984,
|
|
"loss": 6.1093,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 2.5347912524850895,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009365532583252634,
|
|
"loss": 6.148,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 2.5447316103379722,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009357520714782273,
|
|
"loss": 6.1547,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 2.554671968190855,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009349462464129264,
|
|
"loss": 6.1477,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 2.5646123260437377,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000934135792818466,
|
|
"loss": 6.0481,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 2.5745526838966204,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0009333207204396049,
|
|
"loss": 6.1616,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 2.584493041749503,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009325010390766362,
|
|
"loss": 6.0867,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 2.594433399602386,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0009316767585852716,
|
|
"loss": 6.1963,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 2.604373757455268,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009308478888765214,
|
|
"loss": 6.0798,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 2.614314115308151,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009300144399165763,
|
|
"loss": 6.0295,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 2.6242544731610336,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0009291764217266869,
|
|
"loss": 6.1378,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 2.6341948310139163,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0009283338443830432,
|
|
"loss": 6.2136,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 2.644135188866799,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009274867180166542,
|
|
"loss": 6.0917,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 2.6540755467196817,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0009266350528132253,
|
|
"loss": 6.1464,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 2.6640159045725644,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009257788590130365,
|
|
"loss": 6.1729,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 2.673956262425447,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0009249181469108181,
|
|
"loss": 6.1581,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 2.68389662027833,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009240529268556283,
|
|
"loss": 6.1723,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 2.6938369781312126,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009231832092507283,
|
|
"loss": 6.0568,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 2.7037773359840953,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0009223090045534567,
|
|
"loss": 6.1985,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 2.713717693836978,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0009214303232751044,
|
|
"loss": 6.0739,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 2.7236580516898607,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0009205471759807874,
|
|
"loss": 5.9764,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 2.7335984095427435,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009196595732893213,
|
|
"loss": 6.1165,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 2.743538767395626,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009187675258730918,
|
|
"loss": 6.1814,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 2.753479125248509,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0009178710444579277,
|
|
"loss": 6.1311,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 2.7634194831013916,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0009169701398229713,
|
|
"loss": 6.1689,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 2.7733598409542743,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000916064822800549,
|
|
"loss": 6.1812,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 2.783300198807157,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0009151551042760408,
|
|
"loss": 6.0078,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 2.79324055666004,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0009142409951877497,
|
|
"loss": 6.1,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 2.8031809145129225,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0009133225065267707,
|
|
"loss": 6.1481,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 2.8131212723658052,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000912399649336857,
|
|
"loss": 6.1574,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 2.823061630218688,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009114724347142892,
|
|
"loss": 6.0991,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 2.8330019880715707,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0009105408738077402,
|
|
"loss": 6.1241,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 2.8429423459244534,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0009096049778181426,
|
|
"loss": 6.0701,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 2.852882703777336,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0009086647579985526,
|
|
"loss": 5.9656,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 2.862823061630219,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.0009077202256540159,
|
|
"loss": 5.9564,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 2.8727634194831015,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0009067713921414313,
|
|
"loss": 6.1301,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 2.8827037773359843,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0009058182688694137,
|
|
"loss": 6.1673,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 2.892644135188867,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0009048608672981576,
|
|
"loss": 6.0005,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 2.9025844930417497,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0009038991989392992,
|
|
"loss": 6.1207,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 2.9125248508946324,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0009029332753557776,
|
|
"loss": 6.0328,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 2.922465208747515,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0009019631081616963,
|
|
"loss": 6.1406,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 2.932405566600398,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0009009887090221828,
|
|
"loss": 6.117,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 2.9423459244532806,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0009000100896532492,
|
|
"loss": 6.164,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 2.952286282306163,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008990272618216508,
|
|
"loss": 6.1009,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 2.9622266401590456,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0008980402373447446,
|
|
"loss": 5.9912,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 2.9721669980119283,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008970490280903477,
|
|
"loss": 6.1235,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 2.982107355864811,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000896053645976594,
|
|
"loss": 6.06,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.982107355864811,
|
|
"eval_loss": 6.611093044281006,
|
|
"eval_runtime": 0.9899,
|
|
"eval_samples_per_second": 3500.511,
|
|
"eval_steps_per_second": 438.448,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.9920477137176937,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008950541029717912,
|
|
"loss": 6.1042,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 3.0019880715705765,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0008940504110942771,
|
|
"loss": 6.0011,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 3.011928429423459,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008930425824122744,
|
|
"loss": 5.6549,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 3.021868787276342,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0008920306290437462,
|
|
"loss": 5.6925,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 3.0318091451292246,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008910145631562507,
|
|
"loss": 5.671,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 3.0417495029821073,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008899943969667932,
|
|
"loss": 5.6636,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 3.05168986083499,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008889701427416815,
|
|
"loss": 5.6678,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 3.0616302186878728,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0008879418127963767,
|
|
"loss": 5.6669,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 3.0715705765407555,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008869094194953455,
|
|
"loss": 5.6231,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 3.081510934393638,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008858729752519121,
|
|
"loss": 5.6009,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 3.091451292246521,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0008848324925281085,
|
|
"loss": 5.6402,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 3.1013916500994037,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008837879838345245,
|
|
"loss": 5.53,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 3.1113320079522864,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008827394617301576,
|
|
"loss": 5.6391,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 3.121272365805169,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0008816869388222618,
|
|
"loss": 5.6437,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 3.131212723658052,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0008806304277661964,
|
|
"loss": 5.6641,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 3.1411530815109345,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0008795699412652732,
|
|
"loss": 5.7165,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 3.1510934393638173,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0008785054920706039,
|
|
"loss": 5.5469,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 3.1610337972167,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0008774370929809475,
|
|
"loss": 5.6662,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 3.1709741550695827,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008763647568425557,
|
|
"loss": 5.6708,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 3.1809145129224654,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0008752884965490185,
|
|
"loss": 5.7795,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 3.1908548707753477,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0008742083250411091,
|
|
"loss": 5.7075,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 3.2007952286282304,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008731242553066287,
|
|
"loss": 5.7067,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 3.210735586481113,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008720363003802503,
|
|
"loss": 5.6134,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 3.220675944333996,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008709444733433617,
|
|
"loss": 5.7087,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 3.2306163021868786,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0008698487873239079,
|
|
"loss": 5.6931,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 3.2405566600397613,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008687492554962345,
|
|
"loss": 5.6945,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 3.250497017892644,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0008676458910809273,
|
|
"loss": 5.551,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 3.2604373757455267,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0008665387073446556,
|
|
"loss": 5.6124,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 3.2703777335984094,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000865427717600011,
|
|
"loss": 5.7552,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 3.280318091451292,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008643129352053478,
|
|
"loss": 5.5644,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 3.290258449304175,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0008631943735646231,
|
|
"loss": 5.5357,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 3.3001988071570576,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0008620720461272344,
|
|
"loss": 5.64,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 3.3101391650099403,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0008609459663878586,
|
|
"loss": 5.6495,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 3.320079522862823,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00085981614788629,
|
|
"loss": 5.6324,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 3.3300198807157058,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008586826042072768,
|
|
"loss": 5.7362,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 3.3399602385685885,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0008575453489803583,
|
|
"loss": 5.6626,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 3.349900596421471,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0008564043958797008,
|
|
"loss": 5.6344,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 3.359840954274354,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0008552597586239333,
|
|
"loss": 5.7596,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 3.3697813121272366,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008541114509759821,
|
|
"loss": 5.7374,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 3.3797216699801194,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0008529594867429059,
|
|
"loss": 5.7173,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 3.389662027833002,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0008518038797757299,
|
|
"loss": 5.7058,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 3.399602385685885,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0008506446439692784,
|
|
"loss": 5.6639,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 3.4095427435387675,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0008494817932620086,
|
|
"loss": 5.524,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 3.4194831013916502,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0008483153416358423,
|
|
"loss": 5.7222,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 3.429423459244533,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008471453031159987,
|
|
"loss": 5.7485,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 3.4393638170974157,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0008459716917708248,
|
|
"loss": 5.6887,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 3.4493041749502984,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008447945217116265,
|
|
"loss": 5.674,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 3.459244532803181,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0008436138070924997,
|
|
"loss": 5.7669,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 3.469184890656064,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000842429562110159,
|
|
"loss": 5.7269,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 3.4791252485089466,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0008412418010037673,
|
|
"loss": 5.6269,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 3.4890656063618293,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0008400505380547655,
|
|
"loss": 5.6623,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 3.4990059642147116,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0008388557875866995,
|
|
"loss": 5.6936,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 3.5089463220675943,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0008376575639650489,
|
|
"loss": 5.6208,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 3.518886679920477,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008364558815970536,
|
|
"loss": 5.7239,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 3.5288270377733597,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0008352507549315407,
|
|
"loss": 5.5989,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 3.5387673956262424,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0008340421984587517,
|
|
"loss": 5.7711,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 3.548707753479125,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000832830226710167,
|
|
"loss": 5.6808,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 3.558648111332008,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0008316148542583319,
|
|
"loss": 5.7268,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 3.5685884691848906,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.000830396095716681,
|
|
"loss": 5.4095,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 3.5785288270377733,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0008291739657393626,
|
|
"loss": 5.4744,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 3.588469184890656,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0008279484790210632,
|
|
"loss": 5.7428,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 3.5984095427435387,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000826719650296829,
|
|
"loss": 5.6654,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 3.6083499005964215,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0008254874943418914,
|
|
"loss": 5.4806,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 3.618290258449304,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0008242520259714868,
|
|
"loss": 5.6981,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 3.628230616302187,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00082301326004068,
|
|
"loss": 5.7074,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 3.6381709741550696,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0008217712114441846,
|
|
"loss": 5.6836,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 3.6481113320079523,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008205258951161852,
|
|
"loss": 5.6227,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 3.658051689860835,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0008192773260301564,
|
|
"loss": 5.7191,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 3.667992047713718,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0008180255191986837,
|
|
"loss": 5.718,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 3.6779324055666005,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0008167704896732828,
|
|
"loss": 5.7583,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 3.6878727634194832,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0008155122525442182,
|
|
"loss": 5.6963,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 3.697813121272366,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0008142508229403225,
|
|
"loss": 5.7905,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 3.7077534791252487,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0008129862160288137,
|
|
"loss": 5.7075,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 3.717693836978131,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0008117184470151134,
|
|
"loss": 5.7883,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 3.7276341948310137,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000810447531142664,
|
|
"loss": 5.7276,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 3.7375745526838964,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0008091734836927447,
|
|
"loss": 5.6329,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 3.747514910536779,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0008078963199842886,
|
|
"loss": 5.7745,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 3.757455268389662,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000806616055373698,
|
|
"loss": 5.7704,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 3.7673956262425445,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0008053327052546605,
|
|
"loss": 5.7274,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 3.7773359840954273,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008040462850579625,
|
|
"loss": 5.7801,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 3.78727634194831,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000802756810251305,
|
|
"loss": 5.6725,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 3.7972166998011927,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0008014642963391168,
|
|
"loss": 5.6716,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 3.8071570576540754,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0008001687588623686,
|
|
"loss": 5.6628,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 3.817097415506958,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0007988702133983861,
|
|
"loss": 5.7119,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 3.827037773359841,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0007975686755606623,
|
|
"loss": 5.6313,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 3.8369781312127236,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0007962641609986703,
|
|
"loss": 5.6761,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 3.8469184890656063,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0007949566853976738,
|
|
"loss": 5.625,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 3.856858846918489,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0007936462644785413,
|
|
"loss": 5.7207,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 3.8667992047713717,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0007923329139975537,
|
|
"loss": 5.7018,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 3.8767395626242545,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0007910166497462173,
|
|
"loss": 5.7197,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 3.886679920477137,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0007896974875510731,
|
|
"loss": 5.7637,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 3.89662027833002,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0007883754432735058,
|
|
"loss": 5.7297,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 3.9065606361829026,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0007870505328095545,
|
|
"loss": 5.6832,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 3.9165009940357853,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0007857227720897207,
|
|
"loss": 5.7105,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 3.926441351888668,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0007843921770787765,
|
|
"loss": 5.7642,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 3.9363817097415508,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0007830587637755736,
|
|
"loss": 5.7092,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 3.9463220675944335,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00078172254821285,
|
|
"loss": 5.6186,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 3.956262425447316,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0007803835464570379,
|
|
"loss": 5.8184,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 3.966202783300199,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0007790417746080698,
|
|
"loss": 5.7464,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 3.9761431411530817,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0007776972487991857,
|
|
"loss": 5.7122,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 3.9761431411530817,
|
|
"eval_loss": 6.55804967880249,
|
|
"eval_runtime": 0.997,
|
|
"eval_samples_per_second": 3475.347,
|
|
"eval_steps_per_second": 435.296,
|
|
"step": 2000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 5030,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2772559966279680.0,
|
|
"train_batch_size": 32,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|