Files

1450 lines
32 KiB
JSON
Raw Permalink Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9880715705765408,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009940357852882704,
"grad_norm": 8.1875,
"learning_rate": 1e-05,
"loss": 10.9381,
"step": 5
},
{
"epoch": 0.019880715705765408,
"grad_norm": 7.4375,
"learning_rate": 2e-05,
"loss": 10.8784,
"step": 10
},
{
"epoch": 0.02982107355864811,
"grad_norm": 5.34375,
"learning_rate": 3e-05,
"loss": 10.6618,
"step": 15
},
{
"epoch": 0.039761431411530816,
"grad_norm": 3.828125,
"learning_rate": 4e-05,
"loss": 10.4488,
"step": 20
},
{
"epoch": 0.04970178926441352,
"grad_norm": 3.046875,
"learning_rate": 5e-05,
"loss": 10.3286,
"step": 25
},
{
"epoch": 0.05964214711729622,
"grad_norm": 3.09375,
"learning_rate": 6e-05,
"loss": 10.2218,
"step": 30
},
{
"epoch": 0.06958250497017893,
"grad_norm": 2.90625,
"learning_rate": 7.000000000000001e-05,
"loss": 10.0546,
"step": 35
},
{
"epoch": 0.07952286282306163,
"grad_norm": 2.578125,
"learning_rate": 8e-05,
"loss": 9.9396,
"step": 40
},
{
"epoch": 0.08946322067594434,
"grad_norm": 2.703125,
"learning_rate": 8.999999999999999e-05,
"loss": 9.7561,
"step": 45
},
{
"epoch": 0.09940357852882704,
"grad_norm": 2.71875,
"learning_rate": 0.0001,
"loss": 9.5594,
"step": 50
},
{
"epoch": 0.10934393638170974,
"grad_norm": 2.203125,
"learning_rate": 0.00011,
"loss": 9.4019,
"step": 55
},
{
"epoch": 0.11928429423459244,
"grad_norm": 2.234375,
"learning_rate": 0.00012,
"loss": 9.2211,
"step": 60
},
{
"epoch": 0.12922465208747516,
"grad_norm": 1.7109375,
"learning_rate": 0.00013000000000000002,
"loss": 9.0177,
"step": 65
},
{
"epoch": 0.13916500994035785,
"grad_norm": 1.671875,
"learning_rate": 0.00014000000000000001,
"loss": 8.9211,
"step": 70
},
{
"epoch": 0.14910536779324055,
"grad_norm": 1.4453125,
"learning_rate": 0.00015,
"loss": 8.6806,
"step": 75
},
{
"epoch": 0.15904572564612326,
"grad_norm": 1.3203125,
"learning_rate": 0.00016,
"loss": 8.5648,
"step": 80
},
{
"epoch": 0.16898608349900596,
"grad_norm": 1.0625,
"learning_rate": 0.00017,
"loss": 8.5182,
"step": 85
},
{
"epoch": 0.17892644135188868,
"grad_norm": 1.1484375,
"learning_rate": 0.00017999999999999998,
"loss": 8.4601,
"step": 90
},
{
"epoch": 0.18886679920477137,
"grad_norm": 1.0390625,
"learning_rate": 0.00019,
"loss": 8.4077,
"step": 95
},
{
"epoch": 0.1988071570576541,
"grad_norm": 1.390625,
"learning_rate": 0.0002,
"loss": 8.3882,
"step": 100
},
{
"epoch": 0.20874751491053678,
"grad_norm": 1.5390625,
"learning_rate": 0.00021,
"loss": 8.3702,
"step": 105
},
{
"epoch": 0.21868787276341947,
"grad_norm": 1.1875,
"learning_rate": 0.00022,
"loss": 8.3917,
"step": 110
},
{
"epoch": 0.2286282306163022,
"grad_norm": 1.484375,
"learning_rate": 0.00023,
"loss": 8.3738,
"step": 115
},
{
"epoch": 0.23856858846918488,
"grad_norm": 1.625,
"learning_rate": 0.00024,
"loss": 8.453,
"step": 120
},
{
"epoch": 0.2485089463220676,
"grad_norm": 1.828125,
"learning_rate": 0.00025,
"loss": 8.2373,
"step": 125
},
{
"epoch": 0.2584493041749503,
"grad_norm": 1.7734375,
"learning_rate": 0.00026000000000000003,
"loss": 8.2437,
"step": 130
},
{
"epoch": 0.268389662027833,
"grad_norm": 1.21875,
"learning_rate": 0.00027,
"loss": 8.2662,
"step": 135
},
{
"epoch": 0.2783300198807157,
"grad_norm": 1.515625,
"learning_rate": 0.00028000000000000003,
"loss": 8.2467,
"step": 140
},
{
"epoch": 0.2882703777335984,
"grad_norm": 1.3359375,
"learning_rate": 0.00029,
"loss": 8.1932,
"step": 145
},
{
"epoch": 0.2982107355864811,
"grad_norm": 1.3828125,
"learning_rate": 0.0003,
"loss": 8.1642,
"step": 150
},
{
"epoch": 0.3081510934393638,
"grad_norm": 1.453125,
"learning_rate": 0.00031,
"loss": 8.1923,
"step": 155
},
{
"epoch": 0.31809145129224653,
"grad_norm": 1.5546875,
"learning_rate": 0.00032,
"loss": 8.1708,
"step": 160
},
{
"epoch": 0.32803180914512925,
"grad_norm": 1.5078125,
"learning_rate": 0.00033,
"loss": 8.2046,
"step": 165
},
{
"epoch": 0.3379721669980119,
"grad_norm": 1.4140625,
"learning_rate": 0.00034,
"loss": 8.1551,
"step": 170
},
{
"epoch": 0.34791252485089463,
"grad_norm": 1.359375,
"learning_rate": 0.00035,
"loss": 8.1342,
"step": 175
},
{
"epoch": 0.35785288270377735,
"grad_norm": 1.2734375,
"learning_rate": 0.00035999999999999997,
"loss": 8.0606,
"step": 180
},
{
"epoch": 0.36779324055666,
"grad_norm": 1.5234375,
"learning_rate": 0.00037,
"loss": 8.1021,
"step": 185
},
{
"epoch": 0.37773359840954274,
"grad_norm": 1.5234375,
"learning_rate": 0.00038,
"loss": 8.0288,
"step": 190
},
{
"epoch": 0.38767395626242546,
"grad_norm": 1.515625,
"learning_rate": 0.00039000000000000005,
"loss": 7.9732,
"step": 195
},
{
"epoch": 0.3976143141153082,
"grad_norm": 4.0625,
"learning_rate": 0.0004,
"loss": 7.9827,
"step": 200
},
{
"epoch": 0.40755467196819084,
"grad_norm": 2.421875,
"learning_rate": 0.00041,
"loss": 8.052,
"step": 205
},
{
"epoch": 0.41749502982107356,
"grad_norm": 1.40625,
"learning_rate": 0.00042,
"loss": 8.0189,
"step": 210
},
{
"epoch": 0.4274353876739563,
"grad_norm": 4.15625,
"learning_rate": 0.00043,
"loss": 8.0269,
"step": 215
},
{
"epoch": 0.43737574552683894,
"grad_norm": 1.6640625,
"learning_rate": 0.00044,
"loss": 8.0191,
"step": 220
},
{
"epoch": 0.44731610337972166,
"grad_norm": 1.71875,
"learning_rate": 0.00045000000000000004,
"loss": 7.9266,
"step": 225
},
{
"epoch": 0.4572564612326044,
"grad_norm": 1.515625,
"learning_rate": 0.00046,
"loss": 7.8376,
"step": 230
},
{
"epoch": 0.4671968190854871,
"grad_norm": 1.4609375,
"learning_rate": 0.00047,
"loss": 7.9162,
"step": 235
},
{
"epoch": 0.47713717693836977,
"grad_norm": 2.859375,
"learning_rate": 0.00048,
"loss": 7.9084,
"step": 240
},
{
"epoch": 0.4870775347912525,
"grad_norm": 1.40625,
"learning_rate": 0.00049,
"loss": 7.886,
"step": 245
},
{
"epoch": 0.4970178926441352,
"grad_norm": 1.6171875,
"learning_rate": 0.0005,
"loss": 7.8392,
"step": 250
},
{
"epoch": 0.5069582504970179,
"grad_norm": 1.5390625,
"learning_rate": 0.00051,
"loss": 7.8265,
"step": 255
},
{
"epoch": 0.5168986083499006,
"grad_norm": 1.5625,
"learning_rate": 0.0005200000000000001,
"loss": 7.8021,
"step": 260
},
{
"epoch": 0.5268389662027833,
"grad_norm": 1.59375,
"learning_rate": 0.0005300000000000001,
"loss": 7.7635,
"step": 265
},
{
"epoch": 0.536779324055666,
"grad_norm": 1.640625,
"learning_rate": 0.00054,
"loss": 7.7854,
"step": 270
},
{
"epoch": 0.5467196819085487,
"grad_norm": 1.484375,
"learning_rate": 0.00055,
"loss": 7.7803,
"step": 275
},
{
"epoch": 0.5566600397614314,
"grad_norm": 1.546875,
"learning_rate": 0.0005600000000000001,
"loss": 7.745,
"step": 280
},
{
"epoch": 0.5666003976143141,
"grad_norm": 1.53125,
"learning_rate": 0.00057,
"loss": 7.7378,
"step": 285
},
{
"epoch": 0.5765407554671969,
"grad_norm": 1.734375,
"learning_rate": 0.00058,
"loss": 7.7688,
"step": 290
},
{
"epoch": 0.5864811133200796,
"grad_norm": 1.6953125,
"learning_rate": 0.00059,
"loss": 7.6103,
"step": 295
},
{
"epoch": 0.5964214711729622,
"grad_norm": 1.6875,
"learning_rate": 0.0006,
"loss": 7.6106,
"step": 300
},
{
"epoch": 0.6063618290258449,
"grad_norm": 1.8671875,
"learning_rate": 0.00061,
"loss": 7.7767,
"step": 305
},
{
"epoch": 0.6163021868787276,
"grad_norm": 1.671875,
"learning_rate": 0.00062,
"loss": 7.705,
"step": 310
},
{
"epoch": 0.6262425447316103,
"grad_norm": 1.3828125,
"learning_rate": 0.00063,
"loss": 7.681,
"step": 315
},
{
"epoch": 0.6361829025844931,
"grad_norm": 1.453125,
"learning_rate": 0.00064,
"loss": 7.6629,
"step": 320
},
{
"epoch": 0.6461232604373758,
"grad_norm": 1.8203125,
"learning_rate": 0.0006500000000000001,
"loss": 7.6692,
"step": 325
},
{
"epoch": 0.6560636182902585,
"grad_norm": 1.4140625,
"learning_rate": 0.00066,
"loss": 7.5971,
"step": 330
},
{
"epoch": 0.6660039761431411,
"grad_norm": 1.6015625,
"learning_rate": 0.00067,
"loss": 7.6274,
"step": 335
},
{
"epoch": 0.6759443339960238,
"grad_norm": 1.59375,
"learning_rate": 0.00068,
"loss": 7.518,
"step": 340
},
{
"epoch": 0.6858846918489065,
"grad_norm": 1.5625,
"learning_rate": 0.00069,
"loss": 7.6237,
"step": 345
},
{
"epoch": 0.6958250497017893,
"grad_norm": 1.78125,
"learning_rate": 0.0007,
"loss": 7.6114,
"step": 350
},
{
"epoch": 0.705765407554672,
"grad_norm": 1.484375,
"learning_rate": 0.00071,
"loss": 7.6391,
"step": 355
},
{
"epoch": 0.7157057654075547,
"grad_norm": 1.6875,
"learning_rate": 0.0007199999999999999,
"loss": 7.5411,
"step": 360
},
{
"epoch": 0.7256461232604374,
"grad_norm": 1.6875,
"learning_rate": 0.00073,
"loss": 7.6202,
"step": 365
},
{
"epoch": 0.73558648111332,
"grad_norm": 1.46875,
"learning_rate": 0.00074,
"loss": 7.5403,
"step": 370
},
{
"epoch": 0.7455268389662028,
"grad_norm": 1.515625,
"learning_rate": 0.00075,
"loss": 7.5464,
"step": 375
},
{
"epoch": 0.7554671968190855,
"grad_norm": 1.5,
"learning_rate": 0.00076,
"loss": 7.5237,
"step": 380
},
{
"epoch": 0.7654075546719682,
"grad_norm": 1.5390625,
"learning_rate": 0.0007700000000000001,
"loss": 7.4257,
"step": 385
},
{
"epoch": 0.7753479125248509,
"grad_norm": 1.578125,
"learning_rate": 0.0007800000000000001,
"loss": 7.458,
"step": 390
},
{
"epoch": 0.7852882703777336,
"grad_norm": 1.578125,
"learning_rate": 0.00079,
"loss": 7.3689,
"step": 395
},
{
"epoch": 0.7952286282306164,
"grad_norm": 1.515625,
"learning_rate": 0.0008,
"loss": 7.4212,
"step": 400
},
{
"epoch": 0.805168986083499,
"grad_norm": 1.828125,
"learning_rate": 0.0008100000000000001,
"loss": 7.4605,
"step": 405
},
{
"epoch": 0.8151093439363817,
"grad_norm": 1.7109375,
"learning_rate": 0.00082,
"loss": 7.4011,
"step": 410
},
{
"epoch": 0.8250497017892644,
"grad_norm": 1.5390625,
"learning_rate": 0.00083,
"loss": 7.4494,
"step": 415
},
{
"epoch": 0.8349900596421471,
"grad_norm": 1.8203125,
"learning_rate": 0.00084,
"loss": 7.3044,
"step": 420
},
{
"epoch": 0.8449304174950298,
"grad_norm": 1.515625,
"learning_rate": 0.00085,
"loss": 7.4261,
"step": 425
},
{
"epoch": 0.8548707753479126,
"grad_norm": 1.6171875,
"learning_rate": 0.00086,
"loss": 7.3506,
"step": 430
},
{
"epoch": 0.8648111332007953,
"grad_norm": 1.46875,
"learning_rate": 0.00087,
"loss": 7.3579,
"step": 435
},
{
"epoch": 0.8747514910536779,
"grad_norm": 1.5546875,
"learning_rate": 0.00088,
"loss": 7.3574,
"step": 440
},
{
"epoch": 0.8846918489065606,
"grad_norm": 1.7265625,
"learning_rate": 0.0008900000000000001,
"loss": 7.3697,
"step": 445
},
{
"epoch": 0.8946322067594433,
"grad_norm": 1.546875,
"learning_rate": 0.0009000000000000001,
"loss": 7.359,
"step": 450
},
{
"epoch": 0.904572564612326,
"grad_norm": 1.390625,
"learning_rate": 0.00091,
"loss": 7.3429,
"step": 455
},
{
"epoch": 0.9145129224652088,
"grad_norm": 1.4609375,
"learning_rate": 0.00092,
"loss": 7.3306,
"step": 460
},
{
"epoch": 0.9244532803180915,
"grad_norm": 1.5078125,
"learning_rate": 0.00093,
"loss": 7.3062,
"step": 465
},
{
"epoch": 0.9343936381709742,
"grad_norm": 2.28125,
"learning_rate": 0.00094,
"loss": 7.3062,
"step": 470
},
{
"epoch": 0.9443339960238568,
"grad_norm": 1.5234375,
"learning_rate": 0.00095,
"loss": 7.3399,
"step": 475
},
{
"epoch": 0.9542743538767395,
"grad_norm": 1.359375,
"learning_rate": 0.00096,
"loss": 7.3344,
"step": 480
},
{
"epoch": 0.9642147117296223,
"grad_norm": 1.4765625,
"learning_rate": 0.0009699999999999999,
"loss": 7.3416,
"step": 485
},
{
"epoch": 0.974155069582505,
"grad_norm": 1.46875,
"learning_rate": 0.00098,
"loss": 7.2912,
"step": 490
},
{
"epoch": 0.9840954274353877,
"grad_norm": 1.6015625,
"learning_rate": 0.00099,
"loss": 7.1231,
"step": 495
},
{
"epoch": 0.9940357852882704,
"grad_norm": 1.296875,
"learning_rate": 0.001,
"loss": 7.2644,
"step": 500
},
{
"epoch": 0.9940357852882704,
"eval_loss": 7.379269123077393,
"eval_runtime": 1.0043,
"eval_samples_per_second": 3450.268,
"eval_steps_per_second": 432.155,
"step": 500
},
{
"epoch": 1.0039761431411531,
"grad_norm": 1.3125,
"learning_rate": 0.0009999972946377045,
"loss": 7.1732,
"step": 505
},
{
"epoch": 1.0139165009940359,
"grad_norm": 1.984375,
"learning_rate": 0.0009999891785833469,
"loss": 7.1168,
"step": 510
},
{
"epoch": 1.0238568588469186,
"grad_norm": 1.6640625,
"learning_rate": 0.0009999756519345133,
"loss": 7.078,
"step": 515
},
{
"epoch": 1.0337972166998013,
"grad_norm": 1.421875,
"learning_rate": 0.0009999567148538456,
"loss": 7.0055,
"step": 520
},
{
"epoch": 1.0437375745526838,
"grad_norm": 1.546875,
"learning_rate": 0.0009999323675690406,
"loss": 7.0395,
"step": 525
},
{
"epoch": 1.0536779324055665,
"grad_norm": 1.625,
"learning_rate": 0.0009999026103728454,
"loss": 7.0092,
"step": 530
},
{
"epoch": 1.0636182902584492,
"grad_norm": 1.53125,
"learning_rate": 0.0009998674436230558,
"loss": 7.098,
"step": 535
},
{
"epoch": 1.073558648111332,
"grad_norm": 1.4296875,
"learning_rate": 0.000999826867742511,
"loss": 7.0573,
"step": 540
},
{
"epoch": 1.0834990059642147,
"grad_norm": 1.515625,
"learning_rate": 0.0009997808832190884,
"loss": 7.0814,
"step": 545
},
{
"epoch": 1.0934393638170974,
"grad_norm": 1.3046875,
"learning_rate": 0.0009997294906056982,
"loss": 7.0954,
"step": 550
},
{
"epoch": 1.10337972166998,
"grad_norm": 1.3828125,
"learning_rate": 0.000999672690520277,
"loss": 7.0448,
"step": 555
},
{
"epoch": 1.1133200795228628,
"grad_norm": 1.390625,
"learning_rate": 0.000999610483645779,
"loss": 7.011,
"step": 560
},
{
"epoch": 1.1232604373757455,
"grad_norm": 1.5,
"learning_rate": 0.0009995428707301694,
"loss": 7.0098,
"step": 565
},
{
"epoch": 1.1332007952286283,
"grad_norm": 1.5078125,
"learning_rate": 0.0009994698525864147,
"loss": 7.0345,
"step": 570
},
{
"epoch": 1.143141153081511,
"grad_norm": 1.5234375,
"learning_rate": 0.0009993914300924726,
"loss": 6.9947,
"step": 575
},
{
"epoch": 1.1530815109343937,
"grad_norm": 1.5625,
"learning_rate": 0.000999307604191282,
"loss": 6.9929,
"step": 580
},
{
"epoch": 1.1630218687872764,
"grad_norm": 1.5078125,
"learning_rate": 0.0009992183758907518,
"loss": 7.0264,
"step": 585
},
{
"epoch": 1.1729622266401591,
"grad_norm": 1.359375,
"learning_rate": 0.0009991237462637478,
"loss": 7.0081,
"step": 590
},
{
"epoch": 1.1829025844930419,
"grad_norm": 1.4765625,
"learning_rate": 0.000999023716448081,
"loss": 6.9429,
"step": 595
},
{
"epoch": 1.1928429423459244,
"grad_norm": 1.3984375,
"learning_rate": 0.0009989182876464931,
"loss": 6.9688,
"step": 600
},
{
"epoch": 1.202783300198807,
"grad_norm": 1.359375,
"learning_rate": 0.0009988074611266423,
"loss": 6.9687,
"step": 605
},
{
"epoch": 1.2127236580516898,
"grad_norm": 1.34375,
"learning_rate": 0.000998691238221088,
"loss": 7.0143,
"step": 610
},
{
"epoch": 1.2226640159045725,
"grad_norm": 1.5703125,
"learning_rate": 0.0009985696203272752,
"loss": 6.9389,
"step": 615
},
{
"epoch": 1.2326043737574552,
"grad_norm": 1.515625,
"learning_rate": 0.0009984426089075168,
"loss": 6.9601,
"step": 620
},
{
"epoch": 1.242544731610338,
"grad_norm": 1.4765625,
"learning_rate": 0.000998310205488977,
"loss": 7.0213,
"step": 625
},
{
"epoch": 1.2524850894632207,
"grad_norm": 1.5078125,
"learning_rate": 0.0009981724116636525,
"loss": 6.9925,
"step": 630
},
{
"epoch": 1.2624254473161034,
"grad_norm": 2.125,
"learning_rate": 0.0009980292290883526,
"loss": 6.9383,
"step": 635
},
{
"epoch": 1.2723658051689861,
"grad_norm": 1.484375,
"learning_rate": 0.000997880659484681,
"loss": 6.9755,
"step": 640
},
{
"epoch": 1.2823061630218688,
"grad_norm": 1.390625,
"learning_rate": 0.0009977267046390138,
"loss": 6.9515,
"step": 645
},
{
"epoch": 1.2922465208747516,
"grad_norm": 1.4375,
"learning_rate": 0.000997567366402478,
"loss": 6.9614,
"step": 650
},
{
"epoch": 1.302186878727634,
"grad_norm": 1.4296875,
"learning_rate": 0.0009974026466909299,
"loss": 6.8975,
"step": 655
},
{
"epoch": 1.3121272365805168,
"grad_norm": 1.3671875,
"learning_rate": 0.000997232547484932,
"loss": 6.9208,
"step": 660
},
{
"epoch": 1.3220675944333995,
"grad_norm": 1.3203125,
"learning_rate": 0.0009970570708297281,
"loss": 6.9917,
"step": 665
},
{
"epoch": 1.3320079522862822,
"grad_norm": 1.3984375,
"learning_rate": 0.0009968762188352208,
"loss": 6.8237,
"step": 670
},
{
"epoch": 1.341948310139165,
"grad_norm": 1.3125,
"learning_rate": 0.0009966899936759436,
"loss": 6.9117,
"step": 675
},
{
"epoch": 1.3518886679920477,
"grad_norm": 1.4296875,
"learning_rate": 0.0009964983975910369,
"loss": 6.8536,
"step": 680
},
{
"epoch": 1.3618290258449304,
"grad_norm": 1.3046875,
"learning_rate": 0.0009963014328842196,
"loss": 6.9004,
"step": 685
},
{
"epoch": 1.371769383697813,
"grad_norm": 1.53125,
"learning_rate": 0.0009960991019237627,
"loss": 6.8176,
"step": 690
},
{
"epoch": 1.3817097415506958,
"grad_norm": 1.328125,
"learning_rate": 0.0009958914071424596,
"loss": 6.8095,
"step": 695
},
{
"epoch": 1.3916500994035785,
"grad_norm": 1.4140625,
"learning_rate": 0.0009956783510375975,
"loss": 6.8506,
"step": 700
},
{
"epoch": 1.4015904572564613,
"grad_norm": 1.34375,
"learning_rate": 0.0009954599361709276,
"loss": 6.8532,
"step": 705
},
{
"epoch": 1.411530815109344,
"grad_norm": 1.3671875,
"learning_rate": 0.0009952361651686331,
"loss": 6.8656,
"step": 710
},
{
"epoch": 1.4214711729622267,
"grad_norm": 1.3515625,
"learning_rate": 0.0009950070407212996,
"loss": 6.8219,
"step": 715
},
{
"epoch": 1.4314115308151094,
"grad_norm": 1.34375,
"learning_rate": 0.0009947725655838806,
"loss": 6.8193,
"step": 720
},
{
"epoch": 1.4413518886679921,
"grad_norm": 1.421875,
"learning_rate": 0.0009945327425756661,
"loss": 6.7235,
"step": 725
},
{
"epoch": 1.4512922465208749,
"grad_norm": 1.3046875,
"learning_rate": 0.000994287574580248,
"loss": 6.822,
"step": 730
},
{
"epoch": 1.4612326043737576,
"grad_norm": 1.40625,
"learning_rate": 0.0009940370645454848,
"loss": 6.7462,
"step": 735
},
{
"epoch": 1.4711729622266403,
"grad_norm": 1.3046875,
"learning_rate": 0.000993781215483467,
"loss": 6.7522,
"step": 740
},
{
"epoch": 1.4811133200795228,
"grad_norm": 1.4453125,
"learning_rate": 0.0009935200304704815,
"loss": 6.7767,
"step": 745
},
{
"epoch": 1.4910536779324055,
"grad_norm": 1.3203125,
"learning_rate": 0.0009932535126469725,
"loss": 6.7556,
"step": 750
},
{
"epoch": 1.5009940357852882,
"grad_norm": 1.4140625,
"learning_rate": 0.0009929816652175063,
"loss": 6.7793,
"step": 755
},
{
"epoch": 1.510934393638171,
"grad_norm": 1.375,
"learning_rate": 0.00099270449145073,
"loss": 6.7808,
"step": 760
},
{
"epoch": 1.5208747514910537,
"grad_norm": 1.421875,
"learning_rate": 0.0009924219946793353,
"loss": 6.7323,
"step": 765
},
{
"epoch": 1.5308151093439364,
"grad_norm": 1.34375,
"learning_rate": 0.0009921341783000158,
"loss": 6.7711,
"step": 770
},
{
"epoch": 1.540755467196819,
"grad_norm": 1.4296875,
"learning_rate": 0.000991841045773427,
"loss": 6.8143,
"step": 775
},
{
"epoch": 1.5506958250497018,
"grad_norm": 1.3984375,
"learning_rate": 0.000991542600624146,
"loss": 6.7932,
"step": 780
},
{
"epoch": 1.5606361829025845,
"grad_norm": 1.4609375,
"learning_rate": 0.0009912388464406265,
"loss": 6.715,
"step": 785
},
{
"epoch": 1.570576540755467,
"grad_norm": 1.3046875,
"learning_rate": 0.0009909297868751585,
"loss": 6.6676,
"step": 790
},
{
"epoch": 1.5805168986083498,
"grad_norm": 1.328125,
"learning_rate": 0.0009906154256438223,
"loss": 6.7629,
"step": 795
},
{
"epoch": 1.5904572564612325,
"grad_norm": 1.3671875,
"learning_rate": 0.0009902957665264443,
"loss": 6.6631,
"step": 800
},
{
"epoch": 1.6003976143141152,
"grad_norm": 1.453125,
"learning_rate": 0.0009899708133665529,
"loss": 6.7854,
"step": 805
},
{
"epoch": 1.610337972166998,
"grad_norm": 1.3203125,
"learning_rate": 0.0009896405700713295,
"loss": 6.683,
"step": 810
},
{
"epoch": 1.6202783300198806,
"grad_norm": 1.4375,
"learning_rate": 0.000989305040611565,
"loss": 6.7869,
"step": 815
},
{
"epoch": 1.6302186878727634,
"grad_norm": 1.484375,
"learning_rate": 0.0009889642290216085,
"loss": 6.6871,
"step": 820
},
{
"epoch": 1.640159045725646,
"grad_norm": 1.34375,
"learning_rate": 0.0009886181393993223,
"loss": 6.6766,
"step": 825
},
{
"epoch": 1.6500994035785288,
"grad_norm": 1.7734375,
"learning_rate": 0.0009882667759060298,
"loss": 6.7339,
"step": 830
},
{
"epoch": 1.6600397614314115,
"grad_norm": 1.5625,
"learning_rate": 0.0009879101427664662,
"loss": 6.7283,
"step": 835
},
{
"epoch": 1.6699801192842942,
"grad_norm": 1.3671875,
"learning_rate": 0.0009875482442687294,
"loss": 6.7421,
"step": 840
},
{
"epoch": 1.679920477137177,
"grad_norm": 1.5078125,
"learning_rate": 0.0009871810847642258,
"loss": 6.6533,
"step": 845
},
{
"epoch": 1.6898608349900597,
"grad_norm": 1.3359375,
"learning_rate": 0.00098680866866762,
"loss": 6.6482,
"step": 850
},
{
"epoch": 1.6998011928429424,
"grad_norm": 1.3515625,
"learning_rate": 0.0009864310004567807,
"loss": 6.5192,
"step": 855
},
{
"epoch": 1.7097415506958251,
"grad_norm": 1.328125,
"learning_rate": 0.000986048084672727,
"loss": 6.5793,
"step": 860
},
{
"epoch": 1.7196819085487078,
"grad_norm": 1.2578125,
"learning_rate": 0.0009856599259195741,
"loss": 6.7184,
"step": 865
},
{
"epoch": 1.7296222664015906,
"grad_norm": 1.375,
"learning_rate": 0.0009852665288644783,
"loss": 6.5879,
"step": 870
},
{
"epoch": 1.7395626242544733,
"grad_norm": 1.5625,
"learning_rate": 0.000984867898237579,
"loss": 6.6798,
"step": 875
},
{
"epoch": 1.749502982107356,
"grad_norm": 1.28125,
"learning_rate": 0.000984464038831945,
"loss": 6.7027,
"step": 880
},
{
"epoch": 1.7594433399602387,
"grad_norm": 2.140625,
"learning_rate": 0.0009840549555035136,
"loss": 6.7056,
"step": 885
},
{
"epoch": 1.7693836978131214,
"grad_norm": 1.296875,
"learning_rate": 0.0009836406531710342,
"loss": 6.6514,
"step": 890
},
{
"epoch": 1.779324055666004,
"grad_norm": 1.2578125,
"learning_rate": 0.0009832211368160087,
"loss": 6.5464,
"step": 895
},
{
"epoch": 1.7892644135188867,
"grad_norm": 1.328125,
"learning_rate": 0.0009827964114826314,
"loss": 6.5945,
"step": 900
},
{
"epoch": 1.7992047713717694,
"grad_norm": 1.4453125,
"learning_rate": 0.0009823664822777285,
"loss": 6.7718,
"step": 905
},
{
"epoch": 1.809145129224652,
"grad_norm": 1.34375,
"learning_rate": 0.000981931354370697,
"loss": 6.6851,
"step": 910
},
{
"epoch": 1.8190854870775348,
"grad_norm": 1.4375,
"learning_rate": 0.0009814910329934414,
"loss": 6.5936,
"step": 915
},
{
"epoch": 1.8290258449304175,
"grad_norm": 1.3359375,
"learning_rate": 0.0009810455234403126,
"loss": 6.6248,
"step": 920
},
{
"epoch": 1.8389662027833003,
"grad_norm": 1.296875,
"learning_rate": 0.000980594831068043,
"loss": 6.5634,
"step": 925
},
{
"epoch": 1.8489065606361827,
"grad_norm": 1.421875,
"learning_rate": 0.0009801389612956815,
"loss": 6.5672,
"step": 930
},
{
"epoch": 1.8588469184890655,
"grad_norm": 1.28125,
"learning_rate": 0.0009796779196045303,
"loss": 6.5675,
"step": 935
},
{
"epoch": 1.8687872763419482,
"grad_norm": 1.3828125,
"learning_rate": 0.0009792117115380774,
"loss": 6.5819,
"step": 940
},
{
"epoch": 1.878727634194831,
"grad_norm": 1.25,
"learning_rate": 0.0009787403427019303,
"loss": 6.6632,
"step": 945
},
{
"epoch": 1.8886679920477136,
"grad_norm": 1.28125,
"learning_rate": 0.000978263818763749,
"loss": 6.6098,
"step": 950
},
{
"epoch": 1.8986083499005963,
"grad_norm": 1.609375,
"learning_rate": 0.0009777821454531775,
"loss": 6.506,
"step": 955
},
{
"epoch": 1.908548707753479,
"grad_norm": 1.3203125,
"learning_rate": 0.0009772953285617748,
"loss": 6.623,
"step": 960
},
{
"epoch": 1.9184890656063618,
"grad_norm": 1.296875,
"learning_rate": 0.0009768033739429459,
"loss": 6.5092,
"step": 965
},
{
"epoch": 1.9284294234592445,
"grad_norm": 1.484375,
"learning_rate": 0.0009763062875118706,
"loss": 6.4015,
"step": 970
},
{
"epoch": 1.9383697813121272,
"grad_norm": 1.265625,
"learning_rate": 0.0009758040752454326,
"loss": 6.5968,
"step": 975
},
{
"epoch": 1.94831013916501,
"grad_norm": 1.3515625,
"learning_rate": 0.0009752967431821485,
"loss": 6.624,
"step": 980
},
{
"epoch": 1.9582504970178927,
"grad_norm": 1.3359375,
"learning_rate": 0.0009747842974220936,
"loss": 6.6067,
"step": 985
},
{
"epoch": 1.9681908548707754,
"grad_norm": 1.21875,
"learning_rate": 0.00097426674412683,
"loss": 6.5329,
"step": 990
},
{
"epoch": 1.978131212723658,
"grad_norm": 1.375,
"learning_rate": 0.0009737440895193317,
"loss": 6.5692,
"step": 995
},
{
"epoch": 1.9880715705765408,
"grad_norm": 1.421875,
"learning_rate": 0.0009732163398839106,
"loss": 6.5427,
"step": 1000
},
{
"epoch": 1.9880715705765408,
"eval_loss": 6.80261754989624,
"eval_runtime": 0.9948,
"eval_samples_per_second": 3483.087,
"eval_steps_per_second": 436.265,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 5030,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1386643507752960.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}