Files
tur_10mb_baseline_seed21/checkpoint-3000/trainer_state.json
ModelHub XC 23c1d4a760 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/tur_10mb_baseline_seed21
Source: Original Platform
2026-05-23 00:49:24 +08:00

4282 lines
96 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.964214711729622,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009940357852882704,
"grad_norm": 7.53125,
"learning_rate": 1e-05,
"loss": 10.9235,
"step": 5
},
{
"epoch": 0.019880715705765408,
"grad_norm": 7.125,
"learning_rate": 2e-05,
"loss": 10.8542,
"step": 10
},
{
"epoch": 0.02982107355864811,
"grad_norm": 5.28125,
"learning_rate": 3e-05,
"loss": 10.6296,
"step": 15
},
{
"epoch": 0.039761431411530816,
"grad_norm": 3.734375,
"learning_rate": 4e-05,
"loss": 10.4421,
"step": 20
},
{
"epoch": 0.04970178926441352,
"grad_norm": 3.109375,
"learning_rate": 5e-05,
"loss": 10.3131,
"step": 25
},
{
"epoch": 0.05964214711729622,
"grad_norm": 2.921875,
"learning_rate": 6e-05,
"loss": 10.2138,
"step": 30
},
{
"epoch": 0.06958250497017893,
"grad_norm": 2.921875,
"learning_rate": 7.000000000000001e-05,
"loss": 10.061,
"step": 35
},
{
"epoch": 0.07952286282306163,
"grad_norm": 2.671875,
"learning_rate": 8e-05,
"loss": 9.9397,
"step": 40
},
{
"epoch": 0.08946322067594434,
"grad_norm": 2.625,
"learning_rate": 8.999999999999999e-05,
"loss": 9.7387,
"step": 45
},
{
"epoch": 0.09940357852882704,
"grad_norm": 2.59375,
"learning_rate": 0.0001,
"loss": 9.5704,
"step": 50
},
{
"epoch": 0.10934393638170974,
"grad_norm": 2.375,
"learning_rate": 0.00011,
"loss": 9.3733,
"step": 55
},
{
"epoch": 0.11928429423459244,
"grad_norm": 2.34375,
"learning_rate": 0.00012,
"loss": 9.1876,
"step": 60
},
{
"epoch": 0.12922465208747516,
"grad_norm": 1.84375,
"learning_rate": 0.00013000000000000002,
"loss": 9.041,
"step": 65
},
{
"epoch": 0.13916500994035785,
"grad_norm": 1.671875,
"learning_rate": 0.00014000000000000001,
"loss": 8.8545,
"step": 70
},
{
"epoch": 0.14910536779324055,
"grad_norm": 1.53125,
"learning_rate": 0.00015,
"loss": 8.6955,
"step": 75
},
{
"epoch": 0.15904572564612326,
"grad_norm": 1.2421875,
"learning_rate": 0.00016,
"loss": 8.5583,
"step": 80
},
{
"epoch": 0.16898608349900596,
"grad_norm": 1.046875,
"learning_rate": 0.00017,
"loss": 8.4812,
"step": 85
},
{
"epoch": 0.17892644135188868,
"grad_norm": 1.28125,
"learning_rate": 0.00017999999999999998,
"loss": 8.4265,
"step": 90
},
{
"epoch": 0.18886679920477137,
"grad_norm": 1.2109375,
"learning_rate": 0.00019,
"loss": 8.4638,
"step": 95
},
{
"epoch": 0.1988071570576541,
"grad_norm": 1.2734375,
"learning_rate": 0.0002,
"loss": 8.3998,
"step": 100
},
{
"epoch": 0.20874751491053678,
"grad_norm": 1.265625,
"learning_rate": 0.00021,
"loss": 8.3605,
"step": 105
},
{
"epoch": 0.21868787276341947,
"grad_norm": 1.3203125,
"learning_rate": 0.00022,
"loss": 8.3752,
"step": 110
},
{
"epoch": 0.2286282306163022,
"grad_norm": 1.5,
"learning_rate": 0.00023,
"loss": 8.3556,
"step": 115
},
{
"epoch": 0.23856858846918488,
"grad_norm": 1.3671875,
"learning_rate": 0.00024,
"loss": 8.3474,
"step": 120
},
{
"epoch": 0.2485089463220676,
"grad_norm": 1.3203125,
"learning_rate": 0.00025,
"loss": 8.2942,
"step": 125
},
{
"epoch": 0.2584493041749503,
"grad_norm": 1.65625,
"learning_rate": 0.00026000000000000003,
"loss": 8.316,
"step": 130
},
{
"epoch": 0.268389662027833,
"grad_norm": 1.5234375,
"learning_rate": 0.00027,
"loss": 8.2375,
"step": 135
},
{
"epoch": 0.2783300198807157,
"grad_norm": 1.734375,
"learning_rate": 0.00028000000000000003,
"loss": 8.2676,
"step": 140
},
{
"epoch": 0.2882703777335984,
"grad_norm": 1.9375,
"learning_rate": 0.00029,
"loss": 8.1987,
"step": 145
},
{
"epoch": 0.2982107355864811,
"grad_norm": 1.96875,
"learning_rate": 0.0003,
"loss": 8.1847,
"step": 150
},
{
"epoch": 0.3081510934393638,
"grad_norm": 1.703125,
"learning_rate": 0.00031,
"loss": 8.2578,
"step": 155
},
{
"epoch": 0.31809145129224653,
"grad_norm": 1.578125,
"learning_rate": 0.00032,
"loss": 8.1946,
"step": 160
},
{
"epoch": 0.32803180914512925,
"grad_norm": 1.5859375,
"learning_rate": 0.00033,
"loss": 8.1504,
"step": 165
},
{
"epoch": 0.3379721669980119,
"grad_norm": 1.640625,
"learning_rate": 0.00034,
"loss": 8.126,
"step": 170
},
{
"epoch": 0.34791252485089463,
"grad_norm": 1.3359375,
"learning_rate": 0.00035,
"loss": 8.0893,
"step": 175
},
{
"epoch": 0.35785288270377735,
"grad_norm": 1.75,
"learning_rate": 0.00035999999999999997,
"loss": 8.0522,
"step": 180
},
{
"epoch": 0.36779324055666,
"grad_norm": 1.640625,
"learning_rate": 0.00037,
"loss": 8.0653,
"step": 185
},
{
"epoch": 0.37773359840954274,
"grad_norm": 1.515625,
"learning_rate": 0.00038,
"loss": 8.0899,
"step": 190
},
{
"epoch": 0.38767395626242546,
"grad_norm": 1.5546875,
"learning_rate": 0.00039000000000000005,
"loss": 8.0308,
"step": 195
},
{
"epoch": 0.3976143141153082,
"grad_norm": 1.703125,
"learning_rate": 0.0004,
"loss": 7.9695,
"step": 200
},
{
"epoch": 0.40755467196819084,
"grad_norm": 1.5546875,
"learning_rate": 0.00041,
"loss": 7.9639,
"step": 205
},
{
"epoch": 0.41749502982107356,
"grad_norm": 2.203125,
"learning_rate": 0.00042,
"loss": 7.9662,
"step": 210
},
{
"epoch": 0.4274353876739563,
"grad_norm": 1.65625,
"learning_rate": 0.00043,
"loss": 7.9049,
"step": 215
},
{
"epoch": 0.43737574552683894,
"grad_norm": 1.515625,
"learning_rate": 0.00044,
"loss": 7.9815,
"step": 220
},
{
"epoch": 0.44731610337972166,
"grad_norm": 1.5,
"learning_rate": 0.00045000000000000004,
"loss": 7.9026,
"step": 225
},
{
"epoch": 0.4572564612326044,
"grad_norm": 1.6484375,
"learning_rate": 0.00046,
"loss": 7.8753,
"step": 230
},
{
"epoch": 0.4671968190854871,
"grad_norm": 1.515625,
"learning_rate": 0.00047,
"loss": 7.8852,
"step": 235
},
{
"epoch": 0.47713717693836977,
"grad_norm": 1.625,
"learning_rate": 0.00048,
"loss": 7.9331,
"step": 240
},
{
"epoch": 0.4870775347912525,
"grad_norm": 1.78125,
"learning_rate": 0.00049,
"loss": 7.7972,
"step": 245
},
{
"epoch": 0.4970178926441352,
"grad_norm": 1.5234375,
"learning_rate": 0.0005,
"loss": 7.8323,
"step": 250
},
{
"epoch": 0.5069582504970179,
"grad_norm": 2.375,
"learning_rate": 0.00051,
"loss": 7.8246,
"step": 255
},
{
"epoch": 0.5168986083499006,
"grad_norm": 1.5546875,
"learning_rate": 0.0005200000000000001,
"loss": 7.864,
"step": 260
},
{
"epoch": 0.5268389662027833,
"grad_norm": 1.46875,
"learning_rate": 0.0005300000000000001,
"loss": 7.9518,
"step": 265
},
{
"epoch": 0.536779324055666,
"grad_norm": 1.703125,
"learning_rate": 0.00054,
"loss": 7.7417,
"step": 270
},
{
"epoch": 0.5467196819085487,
"grad_norm": 1.5078125,
"learning_rate": 0.00055,
"loss": 7.7927,
"step": 275
},
{
"epoch": 0.5566600397614314,
"grad_norm": 1.546875,
"learning_rate": 0.0005600000000000001,
"loss": 7.7389,
"step": 280
},
{
"epoch": 0.5666003976143141,
"grad_norm": 1.5625,
"learning_rate": 0.00057,
"loss": 7.7131,
"step": 285
},
{
"epoch": 0.5765407554671969,
"grad_norm": 1.3125,
"learning_rate": 0.00058,
"loss": 7.6988,
"step": 290
},
{
"epoch": 0.5864811133200796,
"grad_norm": 1.7421875,
"learning_rate": 0.00059,
"loss": 7.7104,
"step": 295
},
{
"epoch": 0.5964214711729622,
"grad_norm": 1.859375,
"learning_rate": 0.0006,
"loss": 7.6605,
"step": 300
},
{
"epoch": 0.6063618290258449,
"grad_norm": 1.59375,
"learning_rate": 0.00061,
"loss": 7.7188,
"step": 305
},
{
"epoch": 0.6163021868787276,
"grad_norm": 1.734375,
"learning_rate": 0.00062,
"loss": 7.6618,
"step": 310
},
{
"epoch": 0.6262425447316103,
"grad_norm": 1.46875,
"learning_rate": 0.00063,
"loss": 7.7635,
"step": 315
},
{
"epoch": 0.6361829025844931,
"grad_norm": 2.265625,
"learning_rate": 0.00064,
"loss": 7.6721,
"step": 320
},
{
"epoch": 0.6461232604373758,
"grad_norm": 2.953125,
"learning_rate": 0.0006500000000000001,
"loss": 7.6399,
"step": 325
},
{
"epoch": 0.6560636182902585,
"grad_norm": 1.578125,
"learning_rate": 0.00066,
"loss": 7.5828,
"step": 330
},
{
"epoch": 0.6660039761431411,
"grad_norm": 1.515625,
"learning_rate": 0.00067,
"loss": 7.6427,
"step": 335
},
{
"epoch": 0.6759443339960238,
"grad_norm": 1.515625,
"learning_rate": 0.00068,
"loss": 7.4545,
"step": 340
},
{
"epoch": 0.6858846918489065,
"grad_norm": 1.59375,
"learning_rate": 0.00069,
"loss": 7.5338,
"step": 345
},
{
"epoch": 0.6958250497017893,
"grad_norm": 1.7265625,
"learning_rate": 0.0007,
"loss": 7.5311,
"step": 350
},
{
"epoch": 0.705765407554672,
"grad_norm": 2.171875,
"learning_rate": 0.00071,
"loss": 7.5899,
"step": 355
},
{
"epoch": 0.7157057654075547,
"grad_norm": 1.4375,
"learning_rate": 0.0007199999999999999,
"loss": 7.5128,
"step": 360
},
{
"epoch": 0.7256461232604374,
"grad_norm": 1.671875,
"learning_rate": 0.00073,
"loss": 7.4893,
"step": 365
},
{
"epoch": 0.73558648111332,
"grad_norm": 1.59375,
"learning_rate": 0.00074,
"loss": 7.5447,
"step": 370
},
{
"epoch": 0.7455268389662028,
"grad_norm": 1.609375,
"learning_rate": 0.00075,
"loss": 7.4271,
"step": 375
},
{
"epoch": 0.7554671968190855,
"grad_norm": 1.46875,
"learning_rate": 0.00076,
"loss": 7.5216,
"step": 380
},
{
"epoch": 0.7654075546719682,
"grad_norm": 1.515625,
"learning_rate": 0.0007700000000000001,
"loss": 7.4923,
"step": 385
},
{
"epoch": 0.7753479125248509,
"grad_norm": 1.4453125,
"learning_rate": 0.0007800000000000001,
"loss": 7.4305,
"step": 390
},
{
"epoch": 0.7852882703777336,
"grad_norm": 1.484375,
"learning_rate": 0.00079,
"loss": 7.4223,
"step": 395
},
{
"epoch": 0.7952286282306164,
"grad_norm": 1.5703125,
"learning_rate": 0.0008,
"loss": 7.4875,
"step": 400
},
{
"epoch": 0.805168986083499,
"grad_norm": 1.4921875,
"learning_rate": 0.0008100000000000001,
"loss": 7.4465,
"step": 405
},
{
"epoch": 0.8151093439363817,
"grad_norm": 1.5625,
"learning_rate": 0.00082,
"loss": 7.4046,
"step": 410
},
{
"epoch": 0.8250497017892644,
"grad_norm": 1.5,
"learning_rate": 0.00083,
"loss": 7.384,
"step": 415
},
{
"epoch": 0.8349900596421471,
"grad_norm": 1.5546875,
"learning_rate": 0.00084,
"loss": 7.3318,
"step": 420
},
{
"epoch": 0.8449304174950298,
"grad_norm": 1.421875,
"learning_rate": 0.00085,
"loss": 7.4465,
"step": 425
},
{
"epoch": 0.8548707753479126,
"grad_norm": 1.4296875,
"learning_rate": 0.00086,
"loss": 7.3554,
"step": 430
},
{
"epoch": 0.8648111332007953,
"grad_norm": 1.5078125,
"learning_rate": 0.00087,
"loss": 7.348,
"step": 435
},
{
"epoch": 0.8747514910536779,
"grad_norm": 1.4921875,
"learning_rate": 0.00088,
"loss": 7.3536,
"step": 440
},
{
"epoch": 0.8846918489065606,
"grad_norm": 1.59375,
"learning_rate": 0.0008900000000000001,
"loss": 7.3074,
"step": 445
},
{
"epoch": 0.8946322067594433,
"grad_norm": 1.546875,
"learning_rate": 0.0009000000000000001,
"loss": 7.4301,
"step": 450
},
{
"epoch": 0.904572564612326,
"grad_norm": 1.5625,
"learning_rate": 0.00091,
"loss": 7.2948,
"step": 455
},
{
"epoch": 0.9145129224652088,
"grad_norm": 1.6953125,
"learning_rate": 0.00092,
"loss": 7.4022,
"step": 460
},
{
"epoch": 0.9244532803180915,
"grad_norm": 1.78125,
"learning_rate": 0.00093,
"loss": 7.3491,
"step": 465
},
{
"epoch": 0.9343936381709742,
"grad_norm": 1.4140625,
"learning_rate": 0.00094,
"loss": 7.3304,
"step": 470
},
{
"epoch": 0.9443339960238568,
"grad_norm": 1.4296875,
"learning_rate": 0.00095,
"loss": 7.3213,
"step": 475
},
{
"epoch": 0.9542743538767395,
"grad_norm": 1.53125,
"learning_rate": 0.00096,
"loss": 7.3184,
"step": 480
},
{
"epoch": 0.9642147117296223,
"grad_norm": 1.4609375,
"learning_rate": 0.0009699999999999999,
"loss": 7.2904,
"step": 485
},
{
"epoch": 0.974155069582505,
"grad_norm": 1.3671875,
"learning_rate": 0.00098,
"loss": 7.2904,
"step": 490
},
{
"epoch": 0.9840954274353877,
"grad_norm": 1.3359375,
"learning_rate": 0.00099,
"loss": 7.2536,
"step": 495
},
{
"epoch": 0.9940357852882704,
"grad_norm": 1.859375,
"learning_rate": 0.001,
"loss": 7.2545,
"step": 500
},
{
"epoch": 0.9940357852882704,
"eval_loss": 7.392611503601074,
"eval_runtime": 0.9938,
"eval_samples_per_second": 3486.498,
"eval_steps_per_second": 436.693,
"step": 500
},
{
"epoch": 1.0039761431411531,
"grad_norm": 1.546875,
"learning_rate": 0.0009999972946377045,
"loss": 7.1713,
"step": 505
},
{
"epoch": 1.0139165009940359,
"grad_norm": 1.5390625,
"learning_rate": 0.0009999891785833469,
"loss": 7.0401,
"step": 510
},
{
"epoch": 1.0238568588469186,
"grad_norm": 1.40625,
"learning_rate": 0.0009999756519345133,
"loss": 7.0191,
"step": 515
},
{
"epoch": 1.0337972166998013,
"grad_norm": 1.453125,
"learning_rate": 0.0009999567148538456,
"loss": 7.0774,
"step": 520
},
{
"epoch": 1.0437375745526838,
"grad_norm": 1.5703125,
"learning_rate": 0.0009999323675690406,
"loss": 7.1122,
"step": 525
},
{
"epoch": 1.0536779324055665,
"grad_norm": 1.65625,
"learning_rate": 0.0009999026103728454,
"loss": 7.0297,
"step": 530
},
{
"epoch": 1.0636182902584492,
"grad_norm": 1.6875,
"learning_rate": 0.0009998674436230558,
"loss": 7.0478,
"step": 535
},
{
"epoch": 1.073558648111332,
"grad_norm": 1.5234375,
"learning_rate": 0.000999826867742511,
"loss": 7.0749,
"step": 540
},
{
"epoch": 1.0834990059642147,
"grad_norm": 1.3984375,
"learning_rate": 0.0009997808832190884,
"loss": 6.9982,
"step": 545
},
{
"epoch": 1.0934393638170974,
"grad_norm": 1.5,
"learning_rate": 0.0009997294906056982,
"loss": 7.0269,
"step": 550
},
{
"epoch": 1.10337972166998,
"grad_norm": 1.328125,
"learning_rate": 0.000999672690520277,
"loss": 7.0031,
"step": 555
},
{
"epoch": 1.1133200795228628,
"grad_norm": 1.28125,
"learning_rate": 0.000999610483645779,
"loss": 6.9229,
"step": 560
},
{
"epoch": 1.1232604373757455,
"grad_norm": 1.421875,
"learning_rate": 0.0009995428707301694,
"loss": 6.989,
"step": 565
},
{
"epoch": 1.1332007952286283,
"grad_norm": 1.421875,
"learning_rate": 0.0009994698525864147,
"loss": 7.0723,
"step": 570
},
{
"epoch": 1.143141153081511,
"grad_norm": 1.453125,
"learning_rate": 0.0009993914300924726,
"loss": 7.0914,
"step": 575
},
{
"epoch": 1.1530815109343937,
"grad_norm": 1.4609375,
"learning_rate": 0.000999307604191282,
"loss": 6.9886,
"step": 580
},
{
"epoch": 1.1630218687872764,
"grad_norm": 1.4609375,
"learning_rate": 0.0009992183758907518,
"loss": 6.993,
"step": 585
},
{
"epoch": 1.1729622266401591,
"grad_norm": 1.4453125,
"learning_rate": 0.0009991237462637478,
"loss": 6.9879,
"step": 590
},
{
"epoch": 1.1829025844930419,
"grad_norm": 1.328125,
"learning_rate": 0.000999023716448081,
"loss": 7.034,
"step": 595
},
{
"epoch": 1.1928429423459244,
"grad_norm": 1.5390625,
"learning_rate": 0.0009989182876464931,
"loss": 6.9752,
"step": 600
},
{
"epoch": 1.202783300198807,
"grad_norm": 1.546875,
"learning_rate": 0.0009988074611266423,
"loss": 6.8754,
"step": 605
},
{
"epoch": 1.2127236580516898,
"grad_norm": 1.4453125,
"learning_rate": 0.000998691238221088,
"loss": 6.9923,
"step": 610
},
{
"epoch": 1.2226640159045725,
"grad_norm": 1.375,
"learning_rate": 0.0009985696203272752,
"loss": 6.885,
"step": 615
},
{
"epoch": 1.2326043737574552,
"grad_norm": 1.53125,
"learning_rate": 0.0009984426089075168,
"loss": 6.9113,
"step": 620
},
{
"epoch": 1.242544731610338,
"grad_norm": 1.5,
"learning_rate": 0.000998310205488977,
"loss": 6.9467,
"step": 625
},
{
"epoch": 1.2524850894632207,
"grad_norm": 1.421875,
"learning_rate": 0.0009981724116636525,
"loss": 6.91,
"step": 630
},
{
"epoch": 1.2624254473161034,
"grad_norm": 1.390625,
"learning_rate": 0.0009980292290883526,
"loss": 6.9814,
"step": 635
},
{
"epoch": 1.2723658051689861,
"grad_norm": 1.3515625,
"learning_rate": 0.000997880659484681,
"loss": 6.9393,
"step": 640
},
{
"epoch": 1.2823061630218688,
"grad_norm": 1.5234375,
"learning_rate": 0.0009977267046390138,
"loss": 6.9344,
"step": 645
},
{
"epoch": 1.2922465208747516,
"grad_norm": 1.484375,
"learning_rate": 0.000997567366402478,
"loss": 6.8575,
"step": 650
},
{
"epoch": 1.302186878727634,
"grad_norm": 1.3125,
"learning_rate": 0.0009974026466909299,
"loss": 6.85,
"step": 655
},
{
"epoch": 1.3121272365805168,
"grad_norm": 1.3984375,
"learning_rate": 0.000997232547484932,
"loss": 6.9196,
"step": 660
},
{
"epoch": 1.3220675944333995,
"grad_norm": 1.6875,
"learning_rate": 0.0009970570708297281,
"loss": 6.8259,
"step": 665
},
{
"epoch": 1.3320079522862822,
"grad_norm": 1.5390625,
"learning_rate": 0.0009968762188352208,
"loss": 6.8472,
"step": 670
},
{
"epoch": 1.341948310139165,
"grad_norm": 1.3828125,
"learning_rate": 0.0009966899936759436,
"loss": 6.8573,
"step": 675
},
{
"epoch": 1.3518886679920477,
"grad_norm": 2.78125,
"learning_rate": 0.0009964983975910369,
"loss": 6.9833,
"step": 680
},
{
"epoch": 1.3618290258449304,
"grad_norm": 1.3125,
"learning_rate": 0.0009963014328842196,
"loss": 6.9976,
"step": 685
},
{
"epoch": 1.371769383697813,
"grad_norm": 1.4296875,
"learning_rate": 0.0009960991019237627,
"loss": 6.8598,
"step": 690
},
{
"epoch": 1.3817097415506958,
"grad_norm": 1.359375,
"learning_rate": 0.0009958914071424596,
"loss": 6.8171,
"step": 695
},
{
"epoch": 1.3916500994035785,
"grad_norm": 1.578125,
"learning_rate": 0.0009956783510375975,
"loss": 6.8734,
"step": 700
},
{
"epoch": 1.4015904572564613,
"grad_norm": 1.328125,
"learning_rate": 0.0009954599361709276,
"loss": 6.8877,
"step": 705
},
{
"epoch": 1.411530815109344,
"grad_norm": 1.34375,
"learning_rate": 0.0009952361651686331,
"loss": 6.7897,
"step": 710
},
{
"epoch": 1.4214711729622267,
"grad_norm": 1.5546875,
"learning_rate": 0.0009950070407212996,
"loss": 6.9605,
"step": 715
},
{
"epoch": 1.4314115308151094,
"grad_norm": 1.4140625,
"learning_rate": 0.0009947725655838806,
"loss": 6.8834,
"step": 720
},
{
"epoch": 1.4413518886679921,
"grad_norm": 1.3984375,
"learning_rate": 0.0009945327425756661,
"loss": 6.8195,
"step": 725
},
{
"epoch": 1.4512922465208749,
"grad_norm": 1.3828125,
"learning_rate": 0.000994287574580248,
"loss": 6.8148,
"step": 730
},
{
"epoch": 1.4612326043737576,
"grad_norm": 1.421875,
"learning_rate": 0.0009940370645454848,
"loss": 6.8626,
"step": 735
},
{
"epoch": 1.4711729622266403,
"grad_norm": 1.453125,
"learning_rate": 0.000993781215483467,
"loss": 6.8765,
"step": 740
},
{
"epoch": 1.4811133200795228,
"grad_norm": 1.46875,
"learning_rate": 0.0009935200304704815,
"loss": 6.7831,
"step": 745
},
{
"epoch": 1.4910536779324055,
"grad_norm": 1.4765625,
"learning_rate": 0.0009932535126469725,
"loss": 6.8274,
"step": 750
},
{
"epoch": 1.5009940357852882,
"grad_norm": 1.515625,
"learning_rate": 0.0009929816652175063,
"loss": 6.8189,
"step": 755
},
{
"epoch": 1.510934393638171,
"grad_norm": 1.328125,
"learning_rate": 0.00099270449145073,
"loss": 6.7934,
"step": 760
},
{
"epoch": 1.5208747514910537,
"grad_norm": 1.4140625,
"learning_rate": 0.0009924219946793353,
"loss": 6.6405,
"step": 765
},
{
"epoch": 1.5308151093439364,
"grad_norm": 1.71875,
"learning_rate": 0.0009921341783000158,
"loss": 6.6862,
"step": 770
},
{
"epoch": 1.540755467196819,
"grad_norm": 1.3671875,
"learning_rate": 0.000991841045773427,
"loss": 6.7518,
"step": 775
},
{
"epoch": 1.5506958250497018,
"grad_norm": 1.4296875,
"learning_rate": 0.000991542600624146,
"loss": 6.7292,
"step": 780
},
{
"epoch": 1.5606361829025845,
"grad_norm": 1.546875,
"learning_rate": 0.0009912388464406265,
"loss": 6.7062,
"step": 785
},
{
"epoch": 1.570576540755467,
"grad_norm": 1.5,
"learning_rate": 0.0009909297868751585,
"loss": 6.6082,
"step": 790
},
{
"epoch": 1.5805168986083498,
"grad_norm": 1.3203125,
"learning_rate": 0.0009906154256438223,
"loss": 6.7426,
"step": 795
},
{
"epoch": 1.5904572564612325,
"grad_norm": 1.484375,
"learning_rate": 0.0009902957665264443,
"loss": 6.8086,
"step": 800
},
{
"epoch": 1.6003976143141152,
"grad_norm": 1.78125,
"learning_rate": 0.0009899708133665529,
"loss": 6.736,
"step": 805
},
{
"epoch": 1.610337972166998,
"grad_norm": 1.3203125,
"learning_rate": 0.0009896405700713295,
"loss": 6.7488,
"step": 810
},
{
"epoch": 1.6202783300198806,
"grad_norm": 1.3203125,
"learning_rate": 0.000989305040611565,
"loss": 6.7246,
"step": 815
},
{
"epoch": 1.6302186878727634,
"grad_norm": 1.3984375,
"learning_rate": 0.0009889642290216085,
"loss": 6.7968,
"step": 820
},
{
"epoch": 1.640159045725646,
"grad_norm": 1.4453125,
"learning_rate": 0.0009886181393993223,
"loss": 6.6922,
"step": 825
},
{
"epoch": 1.6500994035785288,
"grad_norm": 1.4140625,
"learning_rate": 0.0009882667759060298,
"loss": 6.6635,
"step": 830
},
{
"epoch": 1.6600397614314115,
"grad_norm": 1.3515625,
"learning_rate": 0.0009879101427664662,
"loss": 6.6233,
"step": 835
},
{
"epoch": 1.6699801192842942,
"grad_norm": 1.375,
"learning_rate": 0.0009875482442687294,
"loss": 6.7173,
"step": 840
},
{
"epoch": 1.679920477137177,
"grad_norm": 2.03125,
"learning_rate": 0.0009871810847642258,
"loss": 6.7099,
"step": 845
},
{
"epoch": 1.6898608349900597,
"grad_norm": 1.453125,
"learning_rate": 0.00098680866866762,
"loss": 6.6863,
"step": 850
},
{
"epoch": 1.6998011928429424,
"grad_norm": 1.3984375,
"learning_rate": 0.0009864310004567807,
"loss": 6.728,
"step": 855
},
{
"epoch": 1.7097415506958251,
"grad_norm": 1.4296875,
"learning_rate": 0.000986048084672727,
"loss": 6.6503,
"step": 860
},
{
"epoch": 1.7196819085487078,
"grad_norm": 1.265625,
"learning_rate": 0.0009856599259195741,
"loss": 6.6758,
"step": 865
},
{
"epoch": 1.7296222664015906,
"grad_norm": 1.4609375,
"learning_rate": 0.0009852665288644783,
"loss": 6.6894,
"step": 870
},
{
"epoch": 1.7395626242544733,
"grad_norm": 1.5078125,
"learning_rate": 0.000984867898237579,
"loss": 6.6299,
"step": 875
},
{
"epoch": 1.749502982107356,
"grad_norm": 1.3203125,
"learning_rate": 0.000984464038831945,
"loss": 6.6652,
"step": 880
},
{
"epoch": 1.7594433399602387,
"grad_norm": 1.53125,
"learning_rate": 0.0009840549555035136,
"loss": 6.6375,
"step": 885
},
{
"epoch": 1.7693836978131214,
"grad_norm": 1.34375,
"learning_rate": 0.0009836406531710342,
"loss": 6.6245,
"step": 890
},
{
"epoch": 1.779324055666004,
"grad_norm": 1.359375,
"learning_rate": 0.0009832211368160087,
"loss": 6.6434,
"step": 895
},
{
"epoch": 1.7892644135188867,
"grad_norm": 1.4375,
"learning_rate": 0.0009827964114826314,
"loss": 6.5907,
"step": 900
},
{
"epoch": 1.7992047713717694,
"grad_norm": 1.3359375,
"learning_rate": 0.0009823664822777285,
"loss": 6.6743,
"step": 905
},
{
"epoch": 1.809145129224652,
"grad_norm": 1.3203125,
"learning_rate": 0.000981931354370697,
"loss": 6.6238,
"step": 910
},
{
"epoch": 1.8190854870775348,
"grad_norm": 1.3828125,
"learning_rate": 0.0009814910329934414,
"loss": 6.5983,
"step": 915
},
{
"epoch": 1.8290258449304175,
"grad_norm": 1.3984375,
"learning_rate": 0.0009810455234403126,
"loss": 6.6457,
"step": 920
},
{
"epoch": 1.8389662027833003,
"grad_norm": 1.3125,
"learning_rate": 0.000980594831068043,
"loss": 6.4873,
"step": 925
},
{
"epoch": 1.8489065606361827,
"grad_norm": 1.234375,
"learning_rate": 0.0009801389612956815,
"loss": 6.5629,
"step": 930
},
{
"epoch": 1.8588469184890655,
"grad_norm": 1.4921875,
"learning_rate": 0.0009796779196045303,
"loss": 6.6765,
"step": 935
},
{
"epoch": 1.8687872763419482,
"grad_norm": 1.5234375,
"learning_rate": 0.0009792117115380774,
"loss": 6.5999,
"step": 940
},
{
"epoch": 1.878727634194831,
"grad_norm": 1.390625,
"learning_rate": 0.0009787403427019303,
"loss": 6.6639,
"step": 945
},
{
"epoch": 1.8886679920477136,
"grad_norm": 1.3359375,
"learning_rate": 0.000978263818763749,
"loss": 6.6352,
"step": 950
},
{
"epoch": 1.8986083499005963,
"grad_norm": 1.5859375,
"learning_rate": 0.0009777821454531775,
"loss": 6.6011,
"step": 955
},
{
"epoch": 1.908548707753479,
"grad_norm": 1.3984375,
"learning_rate": 0.0009772953285617748,
"loss": 6.5817,
"step": 960
},
{
"epoch": 1.9184890656063618,
"grad_norm": 1.40625,
"learning_rate": 0.0009768033739429459,
"loss": 6.6113,
"step": 965
},
{
"epoch": 1.9284294234592445,
"grad_norm": 1.328125,
"learning_rate": 0.0009763062875118706,
"loss": 6.5931,
"step": 970
},
{
"epoch": 1.9383697813121272,
"grad_norm": 1.46875,
"learning_rate": 0.0009758040752454326,
"loss": 6.6421,
"step": 975
},
{
"epoch": 1.94831013916501,
"grad_norm": 1.234375,
"learning_rate": 0.0009752967431821485,
"loss": 6.6209,
"step": 980
},
{
"epoch": 1.9582504970178927,
"grad_norm": 1.515625,
"learning_rate": 0.0009747842974220936,
"loss": 6.5526,
"step": 985
},
{
"epoch": 1.9681908548707754,
"grad_norm": 1.4609375,
"learning_rate": 0.00097426674412683,
"loss": 6.6085,
"step": 990
},
{
"epoch": 1.978131212723658,
"grad_norm": 1.40625,
"learning_rate": 0.0009737440895193317,
"loss": 6.548,
"step": 995
},
{
"epoch": 1.9880715705765408,
"grad_norm": 1.3515625,
"learning_rate": 0.0009732163398839106,
"loss": 6.5648,
"step": 1000
},
{
"epoch": 1.9880715705765408,
"eval_loss": 6.81672477722168,
"eval_runtime": 0.9933,
"eval_samples_per_second": 3488.505,
"eval_steps_per_second": 436.944,
"step": 1000
},
{
"epoch": 1.9980119284294235,
"grad_norm": 1.7265625,
"learning_rate": 0.0009726835015661391,
"loss": 6.6182,
"step": 1005
},
{
"epoch": 2.0079522862823063,
"grad_norm": 1.3203125,
"learning_rate": 0.0009721455809727765,
"loss": 6.231,
"step": 1010
},
{
"epoch": 2.017892644135189,
"grad_norm": 1.28125,
"learning_rate": 0.0009716025845716894,
"loss": 6.1739,
"step": 1015
},
{
"epoch": 2.0278330019880717,
"grad_norm": 1.3515625,
"learning_rate": 0.0009710545188917757,
"loss": 6.1956,
"step": 1020
},
{
"epoch": 2.0377733598409544,
"grad_norm": 1.359375,
"learning_rate": 0.0009705013905228854,
"loss": 6.1494,
"step": 1025
},
{
"epoch": 2.047713717693837,
"grad_norm": 1.890625,
"learning_rate": 0.0009699432061157414,
"loss": 6.0183,
"step": 1030
},
{
"epoch": 2.05765407554672,
"grad_norm": 1.46875,
"learning_rate": 0.0009693799723818591,
"loss": 6.1341,
"step": 1035
},
{
"epoch": 2.0675944333996026,
"grad_norm": 1.3828125,
"learning_rate": 0.0009688116960934669,
"loss": 6.1713,
"step": 1040
},
{
"epoch": 2.0775347912524853,
"grad_norm": 1.1953125,
"learning_rate": 0.0009682383840834234,
"loss": 6.2098,
"step": 1045
},
{
"epoch": 2.0874751491053676,
"grad_norm": 1.4140625,
"learning_rate": 0.0009676600432451364,
"loss": 6.1793,
"step": 1050
},
{
"epoch": 2.0974155069582503,
"grad_norm": 1.421875,
"learning_rate": 0.0009670766805324789,
"loss": 6.2008,
"step": 1055
},
{
"epoch": 2.107355864811133,
"grad_norm": 1.3671875,
"learning_rate": 0.0009664883029597066,
"loss": 5.9955,
"step": 1060
},
{
"epoch": 2.1172962226640157,
"grad_norm": 1.328125,
"learning_rate": 0.0009658949176013729,
"loss": 6.2318,
"step": 1065
},
{
"epoch": 2.1272365805168985,
"grad_norm": 1.296875,
"learning_rate": 0.0009652965315922438,
"loss": 6.1829,
"step": 1070
},
{
"epoch": 2.137176938369781,
"grad_norm": 2.703125,
"learning_rate": 0.0009646931521272123,
"loss": 6.0501,
"step": 1075
},
{
"epoch": 2.147117296222664,
"grad_norm": 1.46875,
"learning_rate": 0.0009640847864612124,
"loss": 6.1206,
"step": 1080
},
{
"epoch": 2.1570576540755466,
"grad_norm": 1.296875,
"learning_rate": 0.0009634714419091302,
"loss": 6.1392,
"step": 1085
},
{
"epoch": 2.1669980119284293,
"grad_norm": 1.4453125,
"learning_rate": 0.0009628531258457185,
"loss": 6.1101,
"step": 1090
},
{
"epoch": 2.176938369781312,
"grad_norm": 3.75,
"learning_rate": 0.0009622298457055056,
"loss": 6.1933,
"step": 1095
},
{
"epoch": 2.1868787276341948,
"grad_norm": 1.25,
"learning_rate": 0.0009616016089827078,
"loss": 6.1325,
"step": 1100
},
{
"epoch": 2.1968190854870775,
"grad_norm": 1.21875,
"learning_rate": 0.0009609684232311378,
"loss": 6.2703,
"step": 1105
},
{
"epoch": 2.20675944333996,
"grad_norm": 1.3125,
"learning_rate": 0.0009603302960641154,
"loss": 6.1771,
"step": 1110
},
{
"epoch": 2.216699801192843,
"grad_norm": 1.3359375,
"learning_rate": 0.0009596872351543742,
"loss": 6.1754,
"step": 1115
},
{
"epoch": 2.2266401590457257,
"grad_norm": 1.5,
"learning_rate": 0.0009590392482339713,
"loss": 6.2085,
"step": 1120
},
{
"epoch": 2.2365805168986084,
"grad_norm": 1.390625,
"learning_rate": 0.0009583863430941926,
"loss": 6.1295,
"step": 1125
},
{
"epoch": 2.246520874751491,
"grad_norm": 1.4140625,
"learning_rate": 0.0009577285275854602,
"loss": 6.151,
"step": 1130
},
{
"epoch": 2.256461232604374,
"grad_norm": 1.4921875,
"learning_rate": 0.0009570658096172374,
"loss": 6.2007,
"step": 1135
},
{
"epoch": 2.2664015904572565,
"grad_norm": 1.34375,
"learning_rate": 0.0009563981971579342,
"loss": 6.1778,
"step": 1140
},
{
"epoch": 2.2763419483101393,
"grad_norm": 1.375,
"learning_rate": 0.0009557256982348107,
"loss": 6.2636,
"step": 1145
},
{
"epoch": 2.286282306163022,
"grad_norm": 1.5234375,
"learning_rate": 0.0009550483209338814,
"loss": 6.2285,
"step": 1150
},
{
"epoch": 2.2962226640159047,
"grad_norm": 1.359375,
"learning_rate": 0.0009543660733998174,
"loss": 6.1812,
"step": 1155
},
{
"epoch": 2.3061630218687874,
"grad_norm": 1.4921875,
"learning_rate": 0.0009536789638358488,
"loss": 6.0793,
"step": 1160
},
{
"epoch": 2.31610337972167,
"grad_norm": 1.3046875,
"learning_rate": 0.000952987000503666,
"loss": 6.1988,
"step": 1165
},
{
"epoch": 2.326043737574553,
"grad_norm": 1.390625,
"learning_rate": 0.0009522901917233196,
"loss": 6.3047,
"step": 1170
},
{
"epoch": 2.3359840954274356,
"grad_norm": 1.2109375,
"learning_rate": 0.000951588545873122,
"loss": 6.2805,
"step": 1175
},
{
"epoch": 2.3459244532803183,
"grad_norm": 1.3359375,
"learning_rate": 0.0009508820713895454,
"loss": 6.1183,
"step": 1180
},
{
"epoch": 2.355864811133201,
"grad_norm": 1.421875,
"learning_rate": 0.0009501707767671204,
"loss": 6.1507,
"step": 1185
},
{
"epoch": 2.3658051689860837,
"grad_norm": 1.3359375,
"learning_rate": 0.0009494546705583344,
"loss": 6.1041,
"step": 1190
},
{
"epoch": 2.3757455268389664,
"grad_norm": 1.234375,
"learning_rate": 0.0009487337613735288,
"loss": 6.2091,
"step": 1195
},
{
"epoch": 2.3856858846918487,
"grad_norm": 1.296875,
"learning_rate": 0.0009480080578807941,
"loss": 6.2558,
"step": 1200
},
{
"epoch": 2.3956262425447314,
"grad_norm": 1.2890625,
"learning_rate": 0.0009472775688058681,
"loss": 6.2074,
"step": 1205
},
{
"epoch": 2.405566600397614,
"grad_norm": 1.4453125,
"learning_rate": 0.0009465423029320288,
"loss": 6.1652,
"step": 1210
},
{
"epoch": 2.415506958250497,
"grad_norm": 1.421875,
"learning_rate": 0.0009458022690999899,
"loss": 6.208,
"step": 1215
},
{
"epoch": 2.4254473161033796,
"grad_norm": 1.390625,
"learning_rate": 0.000945057476207794,
"loss": 6.1602,
"step": 1220
},
{
"epoch": 2.4353876739562623,
"grad_norm": 1.375,
"learning_rate": 0.0009443079332107064,
"loss": 6.1323,
"step": 1225
},
{
"epoch": 2.445328031809145,
"grad_norm": 1.3984375,
"learning_rate": 0.0009435536491211062,
"loss": 6.1912,
"step": 1230
},
{
"epoch": 2.4552683896620278,
"grad_norm": 1.4921875,
"learning_rate": 0.0009427946330083791,
"loss": 6.2095,
"step": 1235
},
{
"epoch": 2.4652087475149105,
"grad_norm": 1.3671875,
"learning_rate": 0.0009420308939988073,
"loss": 6.2642,
"step": 1240
},
{
"epoch": 2.475149105367793,
"grad_norm": 1.421875,
"learning_rate": 0.000941262441275461,
"loss": 6.183,
"step": 1245
},
{
"epoch": 2.485089463220676,
"grad_norm": 1.4453125,
"learning_rate": 0.0009404892840780868,
"loss": 6.2336,
"step": 1250
},
{
"epoch": 2.4950298210735586,
"grad_norm": 1.34375,
"learning_rate": 0.0009397114317029974,
"loss": 6.0793,
"step": 1255
},
{
"epoch": 2.5049701789264414,
"grad_norm": 1.5078125,
"learning_rate": 0.0009389288935029595,
"loss": 6.1941,
"step": 1260
},
{
"epoch": 2.514910536779324,
"grad_norm": 1.4296875,
"learning_rate": 0.0009381416788870807,
"loss": 6.1979,
"step": 1265
},
{
"epoch": 2.524850894632207,
"grad_norm": 1.453125,
"learning_rate": 0.0009373497973206984,
"loss": 6.0777,
"step": 1270
},
{
"epoch": 2.5347912524850895,
"grad_norm": 1.375,
"learning_rate": 0.0009365532583252634,
"loss": 6.245,
"step": 1275
},
{
"epoch": 2.5447316103379722,
"grad_norm": 1.3671875,
"learning_rate": 0.0009357520714782273,
"loss": 6.1484,
"step": 1280
},
{
"epoch": 2.554671968190855,
"grad_norm": 1.34375,
"learning_rate": 0.0009349462464129264,
"loss": 6.1535,
"step": 1285
},
{
"epoch": 2.5646123260437377,
"grad_norm": 1.3046875,
"learning_rate": 0.000934135792818466,
"loss": 6.1407,
"step": 1290
},
{
"epoch": 2.5745526838966204,
"grad_norm": 1.359375,
"learning_rate": 0.0009333207204396049,
"loss": 6.2198,
"step": 1295
},
{
"epoch": 2.584493041749503,
"grad_norm": 1.3125,
"learning_rate": 0.0009325010390766362,
"loss": 6.2007,
"step": 1300
},
{
"epoch": 2.594433399602386,
"grad_norm": 1.8125,
"learning_rate": 0.0009316767585852716,
"loss": 6.1193,
"step": 1305
},
{
"epoch": 2.604373757455268,
"grad_norm": 1.28125,
"learning_rate": 0.0009308478888765214,
"loss": 6.2826,
"step": 1310
},
{
"epoch": 2.614314115308151,
"grad_norm": 1.375,
"learning_rate": 0.0009300144399165763,
"loss": 6.1324,
"step": 1315
},
{
"epoch": 2.6242544731610336,
"grad_norm": 1.625,
"learning_rate": 0.0009291764217266869,
"loss": 6.0737,
"step": 1320
},
{
"epoch": 2.6341948310139163,
"grad_norm": 1.3984375,
"learning_rate": 0.0009283338443830432,
"loss": 6.1777,
"step": 1325
},
{
"epoch": 2.644135188866799,
"grad_norm": 1.484375,
"learning_rate": 0.0009274867180166542,
"loss": 6.1201,
"step": 1330
},
{
"epoch": 2.6540755467196817,
"grad_norm": 1.34375,
"learning_rate": 0.0009266350528132253,
"loss": 6.1517,
"step": 1335
},
{
"epoch": 2.6640159045725644,
"grad_norm": 1.34375,
"learning_rate": 0.0009257788590130365,
"loss": 6.1018,
"step": 1340
},
{
"epoch": 2.673956262425447,
"grad_norm": 1.3984375,
"learning_rate": 0.0009249181469108181,
"loss": 6.2041,
"step": 1345
},
{
"epoch": 2.68389662027833,
"grad_norm": 1.59375,
"learning_rate": 0.0009240529268556283,
"loss": 6.0877,
"step": 1350
},
{
"epoch": 2.6938369781312126,
"grad_norm": 1.546875,
"learning_rate": 0.0009231832092507283,
"loss": 6.2511,
"step": 1355
},
{
"epoch": 2.7037773359840953,
"grad_norm": 1.4609375,
"learning_rate": 0.0009223090045534567,
"loss": 6.1304,
"step": 1360
},
{
"epoch": 2.713717693836978,
"grad_norm": 1.3203125,
"learning_rate": 0.0009214303232751044,
"loss": 6.0645,
"step": 1365
},
{
"epoch": 2.7236580516898607,
"grad_norm": 1.4140625,
"learning_rate": 0.0009205471759807874,
"loss": 6.2178,
"step": 1370
},
{
"epoch": 2.7335984095427435,
"grad_norm": 1.3828125,
"learning_rate": 0.0009196595732893213,
"loss": 6.2112,
"step": 1375
},
{
"epoch": 2.743538767395626,
"grad_norm": 1.3515625,
"learning_rate": 0.0009187675258730918,
"loss": 6.032,
"step": 1380
},
{
"epoch": 2.753479125248509,
"grad_norm": 1.34375,
"learning_rate": 0.0009178710444579277,
"loss": 6.1014,
"step": 1385
},
{
"epoch": 2.7634194831013916,
"grad_norm": 1.3828125,
"learning_rate": 0.0009169701398229713,
"loss": 6.0251,
"step": 1390
},
{
"epoch": 2.7733598409542743,
"grad_norm": 1.3359375,
"learning_rate": 0.000916064822800549,
"loss": 6.0464,
"step": 1395
},
{
"epoch": 2.783300198807157,
"grad_norm": 1.3828125,
"learning_rate": 0.0009151551042760408,
"loss": 6.0727,
"step": 1400
},
{
"epoch": 2.79324055666004,
"grad_norm": 1.3203125,
"learning_rate": 0.0009142409951877497,
"loss": 6.0804,
"step": 1405
},
{
"epoch": 2.8031809145129225,
"grad_norm": 1.3828125,
"learning_rate": 0.0009133225065267707,
"loss": 6.1965,
"step": 1410
},
{
"epoch": 2.8131212723658052,
"grad_norm": 1.46875,
"learning_rate": 0.000912399649336857,
"loss": 6.1233,
"step": 1415
},
{
"epoch": 2.823061630218688,
"grad_norm": 1.2421875,
"learning_rate": 0.0009114724347142892,
"loss": 6.1842,
"step": 1420
},
{
"epoch": 2.8330019880715707,
"grad_norm": 1.3046875,
"learning_rate": 0.0009105408738077402,
"loss": 6.1432,
"step": 1425
},
{
"epoch": 2.8429423459244534,
"grad_norm": 1.3515625,
"learning_rate": 0.0009096049778181426,
"loss": 6.222,
"step": 1430
},
{
"epoch": 2.852882703777336,
"grad_norm": 1.359375,
"learning_rate": 0.0009086647579985526,
"loss": 6.1828,
"step": 1435
},
{
"epoch": 2.862823061630219,
"grad_norm": 1.28125,
"learning_rate": 0.0009077202256540159,
"loss": 6.1993,
"step": 1440
},
{
"epoch": 2.8727634194831015,
"grad_norm": 1.390625,
"learning_rate": 0.0009067713921414313,
"loss": 6.1615,
"step": 1445
},
{
"epoch": 2.8827037773359843,
"grad_norm": 1.3359375,
"learning_rate": 0.0009058182688694137,
"loss": 6.0034,
"step": 1450
},
{
"epoch": 2.892644135188867,
"grad_norm": 1.2109375,
"learning_rate": 0.0009048608672981576,
"loss": 6.1363,
"step": 1455
},
{
"epoch": 2.9025844930417497,
"grad_norm": 1.625,
"learning_rate": 0.0009038991989392992,
"loss": 6.0537,
"step": 1460
},
{
"epoch": 2.9125248508946324,
"grad_norm": 1.3203125,
"learning_rate": 0.0009029332753557776,
"loss": 6.0732,
"step": 1465
},
{
"epoch": 2.922465208747515,
"grad_norm": 1.4140625,
"learning_rate": 0.0009019631081616963,
"loss": 6.1659,
"step": 1470
},
{
"epoch": 2.932405566600398,
"grad_norm": 1.375,
"learning_rate": 0.0009009887090221828,
"loss": 5.9484,
"step": 1475
},
{
"epoch": 2.9423459244532806,
"grad_norm": 1.359375,
"learning_rate": 0.0009000100896532492,
"loss": 6.0528,
"step": 1480
},
{
"epoch": 2.952286282306163,
"grad_norm": 1.921875,
"learning_rate": 0.0008990272618216508,
"loss": 6.0431,
"step": 1485
},
{
"epoch": 2.9622266401590456,
"grad_norm": 2.328125,
"learning_rate": 0.0008980402373447446,
"loss": 6.0559,
"step": 1490
},
{
"epoch": 2.9721669980119283,
"grad_norm": 1.4765625,
"learning_rate": 0.0008970490280903477,
"loss": 6.0582,
"step": 1495
},
{
"epoch": 2.982107355864811,
"grad_norm": 1.4765625,
"learning_rate": 0.000896053645976594,
"loss": 6.0267,
"step": 1500
},
{
"epoch": 2.982107355864811,
"eval_loss": 6.628965854644775,
"eval_runtime": 1.0022,
"eval_samples_per_second": 3457.265,
"eval_steps_per_second": 433.031,
"step": 1500
},
{
"epoch": 2.9920477137176937,
"grad_norm": 1.3515625,
"learning_rate": 0.0008950541029717912,
"loss": 6.1065,
"step": 1505
},
{
"epoch": 3.0019880715705765,
"grad_norm": 1.3046875,
"learning_rate": 0.0008940504110942771,
"loss": 6.0632,
"step": 1510
},
{
"epoch": 3.011928429423459,
"grad_norm": 1.1640625,
"learning_rate": 0.0008930425824122744,
"loss": 5.6407,
"step": 1515
},
{
"epoch": 3.021868787276342,
"grad_norm": 1.3046875,
"learning_rate": 0.0008920306290437462,
"loss": 5.5366,
"step": 1520
},
{
"epoch": 3.0318091451292246,
"grad_norm": 1.34375,
"learning_rate": 0.0008910145631562507,
"loss": 5.6106,
"step": 1525
},
{
"epoch": 3.0417495029821073,
"grad_norm": 1.359375,
"learning_rate": 0.0008899943969667932,
"loss": 5.7496,
"step": 1530
},
{
"epoch": 3.05168986083499,
"grad_norm": 1.34375,
"learning_rate": 0.0008889701427416815,
"loss": 5.7083,
"step": 1535
},
{
"epoch": 3.0616302186878728,
"grad_norm": 1.4765625,
"learning_rate": 0.0008879418127963767,
"loss": 5.7372,
"step": 1540
},
{
"epoch": 3.0715705765407555,
"grad_norm": 1.5,
"learning_rate": 0.0008869094194953455,
"loss": 5.7232,
"step": 1545
},
{
"epoch": 3.081510934393638,
"grad_norm": 1.40625,
"learning_rate": 0.0008858729752519121,
"loss": 5.6709,
"step": 1550
},
{
"epoch": 3.091451292246521,
"grad_norm": 1.34375,
"learning_rate": 0.0008848324925281085,
"loss": 5.6879,
"step": 1555
},
{
"epoch": 3.1013916500994037,
"grad_norm": 1.4296875,
"learning_rate": 0.0008837879838345245,
"loss": 5.5735,
"step": 1560
},
{
"epoch": 3.1113320079522864,
"grad_norm": 1.3359375,
"learning_rate": 0.0008827394617301576,
"loss": 5.6023,
"step": 1565
},
{
"epoch": 3.121272365805169,
"grad_norm": 1.4296875,
"learning_rate": 0.0008816869388222618,
"loss": 5.6529,
"step": 1570
},
{
"epoch": 3.131212723658052,
"grad_norm": 1.390625,
"learning_rate": 0.0008806304277661964,
"loss": 5.7645,
"step": 1575
},
{
"epoch": 3.1411530815109345,
"grad_norm": 1.359375,
"learning_rate": 0.0008795699412652732,
"loss": 5.6259,
"step": 1580
},
{
"epoch": 3.1510934393638173,
"grad_norm": 1.390625,
"learning_rate": 0.0008785054920706039,
"loss": 5.7228,
"step": 1585
},
{
"epoch": 3.1610337972167,
"grad_norm": 1.375,
"learning_rate": 0.0008774370929809475,
"loss": 5.6976,
"step": 1590
},
{
"epoch": 3.1709741550695827,
"grad_norm": 1.5078125,
"learning_rate": 0.0008763647568425557,
"loss": 5.6784,
"step": 1595
},
{
"epoch": 3.1809145129224654,
"grad_norm": 1.421875,
"learning_rate": 0.0008752884965490185,
"loss": 5.607,
"step": 1600
},
{
"epoch": 3.1908548707753477,
"grad_norm": 1.2890625,
"learning_rate": 0.0008742083250411091,
"loss": 5.825,
"step": 1605
},
{
"epoch": 3.2007952286282304,
"grad_norm": 1.3359375,
"learning_rate": 0.0008731242553066287,
"loss": 5.6423,
"step": 1610
},
{
"epoch": 3.210735586481113,
"grad_norm": 1.453125,
"learning_rate": 0.0008720363003802503,
"loss": 5.7355,
"step": 1615
},
{
"epoch": 3.220675944333996,
"grad_norm": 1.484375,
"learning_rate": 0.0008709444733433617,
"loss": 5.7234,
"step": 1620
},
{
"epoch": 3.2306163021868786,
"grad_norm": 1.4765625,
"learning_rate": 0.0008698487873239079,
"loss": 5.6615,
"step": 1625
},
{
"epoch": 3.2405566600397613,
"grad_norm": 1.375,
"learning_rate": 0.0008687492554962345,
"loss": 5.6099,
"step": 1630
},
{
"epoch": 3.250497017892644,
"grad_norm": 1.390625,
"learning_rate": 0.0008676458910809273,
"loss": 5.5962,
"step": 1635
},
{
"epoch": 3.2604373757455267,
"grad_norm": 1.34375,
"learning_rate": 0.0008665387073446556,
"loss": 5.675,
"step": 1640
},
{
"epoch": 3.2703777335984094,
"grad_norm": 1.3671875,
"learning_rate": 0.000865427717600011,
"loss": 5.8307,
"step": 1645
},
{
"epoch": 3.280318091451292,
"grad_norm": 1.3046875,
"learning_rate": 0.0008643129352053478,
"loss": 5.7201,
"step": 1650
},
{
"epoch": 3.290258449304175,
"grad_norm": 1.3984375,
"learning_rate": 0.0008631943735646231,
"loss": 5.6514,
"step": 1655
},
{
"epoch": 3.3001988071570576,
"grad_norm": 1.421875,
"learning_rate": 0.0008620720461272344,
"loss": 5.6428,
"step": 1660
},
{
"epoch": 3.3101391650099403,
"grad_norm": 1.5078125,
"learning_rate": 0.0008609459663878586,
"loss": 5.7221,
"step": 1665
},
{
"epoch": 3.320079522862823,
"grad_norm": 1.4296875,
"learning_rate": 0.00085981614788629,
"loss": 5.6877,
"step": 1670
},
{
"epoch": 3.3300198807157058,
"grad_norm": 1.3125,
"learning_rate": 0.0008586826042072768,
"loss": 5.764,
"step": 1675
},
{
"epoch": 3.3399602385685885,
"grad_norm": 1.3203125,
"learning_rate": 0.0008575453489803583,
"loss": 5.8476,
"step": 1680
},
{
"epoch": 3.349900596421471,
"grad_norm": 1.3984375,
"learning_rate": 0.0008564043958797008,
"loss": 5.6845,
"step": 1685
},
{
"epoch": 3.359840954274354,
"grad_norm": 1.359375,
"learning_rate": 0.0008552597586239333,
"loss": 5.6836,
"step": 1690
},
{
"epoch": 3.3697813121272366,
"grad_norm": 1.28125,
"learning_rate": 0.0008541114509759821,
"loss": 5.7884,
"step": 1695
},
{
"epoch": 3.3797216699801194,
"grad_norm": 1.2734375,
"learning_rate": 0.0008529594867429059,
"loss": 5.6276,
"step": 1700
},
{
"epoch": 3.389662027833002,
"grad_norm": 1.4375,
"learning_rate": 0.0008518038797757299,
"loss": 5.6027,
"step": 1705
},
{
"epoch": 3.399602385685885,
"grad_norm": 1.4609375,
"learning_rate": 0.0008506446439692784,
"loss": 5.7908,
"step": 1710
},
{
"epoch": 3.4095427435387675,
"grad_norm": 1.4453125,
"learning_rate": 0.0008494817932620086,
"loss": 5.7614,
"step": 1715
},
{
"epoch": 3.4194831013916502,
"grad_norm": 1.390625,
"learning_rate": 0.0008483153416358423,
"loss": 5.7827,
"step": 1720
},
{
"epoch": 3.429423459244533,
"grad_norm": 1.4375,
"learning_rate": 0.0008471453031159987,
"loss": 5.6984,
"step": 1725
},
{
"epoch": 3.4393638170974157,
"grad_norm": 1.5,
"learning_rate": 0.0008459716917708248,
"loss": 5.6607,
"step": 1730
},
{
"epoch": 3.4493041749502984,
"grad_norm": 1.40625,
"learning_rate": 0.0008447945217116265,
"loss": 5.7333,
"step": 1735
},
{
"epoch": 3.459244532803181,
"grad_norm": 1.2890625,
"learning_rate": 0.0008436138070924997,
"loss": 5.8342,
"step": 1740
},
{
"epoch": 3.469184890656064,
"grad_norm": 1.484375,
"learning_rate": 0.000842429562110159,
"loss": 5.7099,
"step": 1745
},
{
"epoch": 3.4791252485089466,
"grad_norm": 1.3828125,
"learning_rate": 0.0008412418010037673,
"loss": 5.7363,
"step": 1750
},
{
"epoch": 3.4890656063618293,
"grad_norm": 1.4609375,
"learning_rate": 0.0008400505380547655,
"loss": 5.7489,
"step": 1755
},
{
"epoch": 3.4990059642147116,
"grad_norm": 1.4921875,
"learning_rate": 0.0008388557875866995,
"loss": 5.7809,
"step": 1760
},
{
"epoch": 3.5089463220675943,
"grad_norm": 1.4609375,
"learning_rate": 0.0008376575639650489,
"loss": 5.6746,
"step": 1765
},
{
"epoch": 3.518886679920477,
"grad_norm": 1.3671875,
"learning_rate": 0.0008364558815970536,
"loss": 5.6408,
"step": 1770
},
{
"epoch": 3.5288270377733597,
"grad_norm": 1.3671875,
"learning_rate": 0.0008352507549315407,
"loss": 5.5869,
"step": 1775
},
{
"epoch": 3.5387673956262424,
"grad_norm": 1.4453125,
"learning_rate": 0.0008340421984587517,
"loss": 5.5626,
"step": 1780
},
{
"epoch": 3.548707753479125,
"grad_norm": 1.4296875,
"learning_rate": 0.000832830226710167,
"loss": 5.6849,
"step": 1785
},
{
"epoch": 3.558648111332008,
"grad_norm": 1.53125,
"learning_rate": 0.0008316148542583319,
"loss": 5.6313,
"step": 1790
},
{
"epoch": 3.5685884691848906,
"grad_norm": 1.328125,
"learning_rate": 0.000830396095716681,
"loss": 5.7723,
"step": 1795
},
{
"epoch": 3.5785288270377733,
"grad_norm": 1.421875,
"learning_rate": 0.0008291739657393626,
"loss": 5.7748,
"step": 1800
},
{
"epoch": 3.588469184890656,
"grad_norm": 1.4453125,
"learning_rate": 0.0008279484790210632,
"loss": 5.7255,
"step": 1805
},
{
"epoch": 3.5984095427435387,
"grad_norm": 1.5234375,
"learning_rate": 0.000826719650296829,
"loss": 5.7057,
"step": 1810
},
{
"epoch": 3.6083499005964215,
"grad_norm": 1.3515625,
"learning_rate": 0.0008254874943418914,
"loss": 5.7359,
"step": 1815
},
{
"epoch": 3.618290258449304,
"grad_norm": 1.3984375,
"learning_rate": 0.0008242520259714868,
"loss": 5.6212,
"step": 1820
},
{
"epoch": 3.628230616302187,
"grad_norm": 1.375,
"learning_rate": 0.00082301326004068,
"loss": 5.8307,
"step": 1825
},
{
"epoch": 3.6381709741550696,
"grad_norm": 1.4921875,
"learning_rate": 0.0008217712114441846,
"loss": 5.7231,
"step": 1830
},
{
"epoch": 3.6481113320079523,
"grad_norm": 1.40625,
"learning_rate": 0.0008205258951161852,
"loss": 5.7487,
"step": 1835
},
{
"epoch": 3.658051689860835,
"grad_norm": 1.4375,
"learning_rate": 0.0008192773260301564,
"loss": 5.8017,
"step": 1840
},
{
"epoch": 3.667992047713718,
"grad_norm": 1.8515625,
"learning_rate": 0.0008180255191986837,
"loss": 5.7182,
"step": 1845
},
{
"epoch": 3.6779324055666005,
"grad_norm": 1.328125,
"learning_rate": 0.0008167704896732828,
"loss": 5.7632,
"step": 1850
},
{
"epoch": 3.6878727634194832,
"grad_norm": 1.234375,
"learning_rate": 0.0008155122525442182,
"loss": 5.741,
"step": 1855
},
{
"epoch": 3.697813121272366,
"grad_norm": 1.4609375,
"learning_rate": 0.0008142508229403225,
"loss": 5.8122,
"step": 1860
},
{
"epoch": 3.7077534791252487,
"grad_norm": 1.3359375,
"learning_rate": 0.0008129862160288137,
"loss": 5.6688,
"step": 1865
},
{
"epoch": 3.717693836978131,
"grad_norm": 1.3671875,
"learning_rate": 0.0008117184470151134,
"loss": 5.7599,
"step": 1870
},
{
"epoch": 3.7276341948310137,
"grad_norm": 1.4921875,
"learning_rate": 0.000810447531142664,
"loss": 5.7124,
"step": 1875
},
{
"epoch": 3.7375745526838964,
"grad_norm": 1.3359375,
"learning_rate": 0.0008091734836927447,
"loss": 5.6663,
"step": 1880
},
{
"epoch": 3.747514910536779,
"grad_norm": 1.34375,
"learning_rate": 0.0008078963199842886,
"loss": 5.7889,
"step": 1885
},
{
"epoch": 3.757455268389662,
"grad_norm": 1.40625,
"learning_rate": 0.000806616055373698,
"loss": 5.7625,
"step": 1890
},
{
"epoch": 3.7673956262425445,
"grad_norm": 1.46875,
"learning_rate": 0.0008053327052546605,
"loss": 5.8341,
"step": 1895
},
{
"epoch": 3.7773359840954273,
"grad_norm": 1.4921875,
"learning_rate": 0.0008040462850579625,
"loss": 5.7046,
"step": 1900
},
{
"epoch": 3.78727634194831,
"grad_norm": 1.375,
"learning_rate": 0.000802756810251305,
"loss": 5.7442,
"step": 1905
},
{
"epoch": 3.7972166998011927,
"grad_norm": 1.5,
"learning_rate": 0.0008014642963391168,
"loss": 5.768,
"step": 1910
},
{
"epoch": 3.8071570576540754,
"grad_norm": 1.5,
"learning_rate": 0.0008001687588623686,
"loss": 5.7189,
"step": 1915
},
{
"epoch": 3.817097415506958,
"grad_norm": 1.421875,
"learning_rate": 0.0007988702133983861,
"loss": 5.7845,
"step": 1920
},
{
"epoch": 3.827037773359841,
"grad_norm": 1.328125,
"learning_rate": 0.0007975686755606623,
"loss": 5.7177,
"step": 1925
},
{
"epoch": 3.8369781312127236,
"grad_norm": 1.421875,
"learning_rate": 0.0007962641609986703,
"loss": 5.6932,
"step": 1930
},
{
"epoch": 3.8469184890656063,
"grad_norm": 1.390625,
"learning_rate": 0.0007949566853976738,
"loss": 5.4869,
"step": 1935
},
{
"epoch": 3.856858846918489,
"grad_norm": 1.3984375,
"learning_rate": 0.0007936462644785413,
"loss": 5.8108,
"step": 1940
},
{
"epoch": 3.8667992047713717,
"grad_norm": 1.4140625,
"learning_rate": 0.0007923329139975537,
"loss": 5.74,
"step": 1945
},
{
"epoch": 3.8767395626242545,
"grad_norm": 1.390625,
"learning_rate": 0.0007910166497462173,
"loss": 5.7332,
"step": 1950
},
{
"epoch": 3.886679920477137,
"grad_norm": 1.34375,
"learning_rate": 0.0007896974875510731,
"loss": 5.6456,
"step": 1955
},
{
"epoch": 3.89662027833002,
"grad_norm": 1.4296875,
"learning_rate": 0.0007883754432735058,
"loss": 5.7866,
"step": 1960
},
{
"epoch": 3.9065606361829026,
"grad_norm": 1.3046875,
"learning_rate": 0.0007870505328095545,
"loss": 5.781,
"step": 1965
},
{
"epoch": 3.9165009940357853,
"grad_norm": 1.4609375,
"learning_rate": 0.0007857227720897207,
"loss": 5.7998,
"step": 1970
},
{
"epoch": 3.926441351888668,
"grad_norm": 1.515625,
"learning_rate": 0.0007843921770787765,
"loss": 5.7848,
"step": 1975
},
{
"epoch": 3.9363817097415508,
"grad_norm": 1.515625,
"learning_rate": 0.0007830587637755736,
"loss": 5.7115,
"step": 1980
},
{
"epoch": 3.9463220675944335,
"grad_norm": 1.421875,
"learning_rate": 0.00078172254821285,
"loss": 5.6413,
"step": 1985
},
{
"epoch": 3.956262425447316,
"grad_norm": 1.390625,
"learning_rate": 0.0007803835464570379,
"loss": 5.701,
"step": 1990
},
{
"epoch": 3.966202783300199,
"grad_norm": 1.5625,
"learning_rate": 0.0007790417746080698,
"loss": 5.6178,
"step": 1995
},
{
"epoch": 3.9761431411530817,
"grad_norm": 1.375,
"learning_rate": 0.0007776972487991857,
"loss": 5.8228,
"step": 2000
},
{
"epoch": 3.9761431411530817,
"eval_loss": 6.56332540512085,
"eval_runtime": 1.0035,
"eval_samples_per_second": 3453.084,
"eval_steps_per_second": 432.507,
"step": 2000
},
{
"epoch": 3.9860834990059644,
"grad_norm": 1.3828125,
"learning_rate": 0.0007763499851967385,
"loss": 5.7431,
"step": 2005
},
{
"epoch": 3.996023856858847,
"grad_norm": 1.6171875,
"learning_rate": 0.0007750000000000001,
"loss": 5.6064,
"step": 2010
},
{
"epoch": 4.00596421471173,
"grad_norm": 1.4765625,
"learning_rate": 0.000773647309440966,
"loss": 5.3943,
"step": 2015
},
{
"epoch": 4.0159045725646125,
"grad_norm": 1.375,
"learning_rate": 0.0007722919297841613,
"loss": 5.287,
"step": 2020
},
{
"epoch": 4.025844930417495,
"grad_norm": 1.3046875,
"learning_rate": 0.0007709338773264435,
"loss": 5.3951,
"step": 2025
},
{
"epoch": 4.035785288270378,
"grad_norm": 1.421875,
"learning_rate": 0.0007695731683968077,
"loss": 5.3579,
"step": 2030
},
{
"epoch": 4.045725646123261,
"grad_norm": 1.421875,
"learning_rate": 0.0007682098193561904,
"loss": 5.3274,
"step": 2035
},
{
"epoch": 4.055666003976143,
"grad_norm": 1.4375,
"learning_rate": 0.0007668438465972717,
"loss": 5.2868,
"step": 2040
},
{
"epoch": 4.065606361829026,
"grad_norm": 1.484375,
"learning_rate": 0.0007654752665442794,
"loss": 5.3415,
"step": 2045
},
{
"epoch": 4.075546719681909,
"grad_norm": 1.3515625,
"learning_rate": 0.0007641040956527904,
"loss": 5.3547,
"step": 2050
},
{
"epoch": 4.085487077534792,
"grad_norm": 1.359375,
"learning_rate": 0.0007627303504095341,
"loss": 5.3144,
"step": 2055
},
{
"epoch": 4.095427435387674,
"grad_norm": 1.390625,
"learning_rate": 0.0007613540473321927,
"loss": 5.3039,
"step": 2060
},
{
"epoch": 4.105367793240557,
"grad_norm": 1.5078125,
"learning_rate": 0.0007599752029692041,
"loss": 5.2586,
"step": 2065
},
{
"epoch": 4.11530815109344,
"grad_norm": 1.4140625,
"learning_rate": 0.0007585938338995616,
"loss": 5.4059,
"step": 2070
},
{
"epoch": 4.1252485089463224,
"grad_norm": 1.4296875,
"learning_rate": 0.0007572099567326158,
"loss": 5.1738,
"step": 2075
},
{
"epoch": 4.135188866799205,
"grad_norm": 1.4296875,
"learning_rate": 0.0007558235881078734,
"loss": 5.3144,
"step": 2080
},
{
"epoch": 4.145129224652088,
"grad_norm": 1.3828125,
"learning_rate": 0.0007544347446947986,
"loss": 5.3125,
"step": 2085
},
{
"epoch": 4.155069582504971,
"grad_norm": 1.6015625,
"learning_rate": 0.0007530434431926118,
"loss": 5.2936,
"step": 2090
},
{
"epoch": 4.165009940357853,
"grad_norm": 1.4296875,
"learning_rate": 0.0007516497003300892,
"loss": 5.2708,
"step": 2095
},
{
"epoch": 4.174950298210735,
"grad_norm": 1.4296875,
"learning_rate": 0.0007502535328653615,
"loss": 5.3064,
"step": 2100
},
{
"epoch": 4.184890656063618,
"grad_norm": 1.390625,
"learning_rate": 0.0007488549575857124,
"loss": 5.2761,
"step": 2105
},
{
"epoch": 4.194831013916501,
"grad_norm": 1.421875,
"learning_rate": 0.0007474539913073764,
"loss": 5.3755,
"step": 2110
},
{
"epoch": 4.204771371769383,
"grad_norm": 1.4453125,
"learning_rate": 0.0007460506508753373,
"loss": 5.3384,
"step": 2115
},
{
"epoch": 4.214711729622266,
"grad_norm": 1.546875,
"learning_rate": 0.0007446449531631255,
"loss": 5.3616,
"step": 2120
},
{
"epoch": 4.224652087475149,
"grad_norm": 1.421875,
"learning_rate": 0.0007432369150726146,
"loss": 5.3979,
"step": 2125
},
{
"epoch": 4.2345924453280315,
"grad_norm": 1.40625,
"learning_rate": 0.0007418265535338187,
"loss": 5.3498,
"step": 2130
},
{
"epoch": 4.244532803180914,
"grad_norm": 1.359375,
"learning_rate": 0.0007404138855046884,
"loss": 5.3137,
"step": 2135
},
{
"epoch": 4.254473161033797,
"grad_norm": 1.515625,
"learning_rate": 0.0007389989279709077,
"loss": 5.3943,
"step": 2140
},
{
"epoch": 4.26441351888668,
"grad_norm": 1.40625,
"learning_rate": 0.0007375816979456887,
"loss": 5.23,
"step": 2145
},
{
"epoch": 4.274353876739562,
"grad_norm": 1.4765625,
"learning_rate": 0.0007361622124695677,
"loss": 5.3232,
"step": 2150
},
{
"epoch": 4.284294234592445,
"grad_norm": 1.421875,
"learning_rate": 0.0007347404886102002,
"loss": 5.4007,
"step": 2155
},
{
"epoch": 4.294234592445328,
"grad_norm": 1.328125,
"learning_rate": 0.0007333165434621556,
"loss": 5.2138,
"step": 2160
},
{
"epoch": 4.3041749502982105,
"grad_norm": 1.4921875,
"learning_rate": 0.0007318903941467119,
"loss": 5.3677,
"step": 2165
},
{
"epoch": 4.314115308151093,
"grad_norm": 1.546875,
"learning_rate": 0.0007304620578116493,
"loss": 5.4318,
"step": 2170
},
{
"epoch": 4.324055666003976,
"grad_norm": 1.6484375,
"learning_rate": 0.0007290315516310445,
"loss": 5.4126,
"step": 2175
},
{
"epoch": 4.333996023856859,
"grad_norm": 1.3828125,
"learning_rate": 0.0007275988928050645,
"loss": 5.4441,
"step": 2180
},
{
"epoch": 4.343936381709741,
"grad_norm": 1.46875,
"learning_rate": 0.0007261640985597584,
"loss": 5.4172,
"step": 2185
},
{
"epoch": 4.353876739562624,
"grad_norm": 1.6015625,
"learning_rate": 0.0007247271861468522,
"loss": 5.4395,
"step": 2190
},
{
"epoch": 4.363817097415507,
"grad_norm": 1.5078125,
"learning_rate": 0.0007232881728435397,
"loss": 5.1637,
"step": 2195
},
{
"epoch": 4.3737574552683895,
"grad_norm": 1.4609375,
"learning_rate": 0.0007218470759522759,
"loss": 5.2879,
"step": 2200
},
{
"epoch": 4.383697813121272,
"grad_norm": 1.375,
"learning_rate": 0.0007204039128005682,
"loss": 5.3645,
"step": 2205
},
{
"epoch": 4.393638170974155,
"grad_norm": 1.4140625,
"learning_rate": 0.0007189587007407686,
"loss": 5.3378,
"step": 2210
},
{
"epoch": 4.403578528827038,
"grad_norm": 1.25,
"learning_rate": 0.0007175114571498644,
"loss": 5.2921,
"step": 2215
},
{
"epoch": 4.41351888667992,
"grad_norm": 1.53125,
"learning_rate": 0.0007160621994292706,
"loss": 5.3694,
"step": 2220
},
{
"epoch": 4.423459244532803,
"grad_norm": 1.3515625,
"learning_rate": 0.0007146109450046187,
"loss": 5.2234,
"step": 2225
},
{
"epoch": 4.433399602385686,
"grad_norm": 1.3203125,
"learning_rate": 0.0007131577113255489,
"loss": 5.3948,
"step": 2230
},
{
"epoch": 4.443339960238569,
"grad_norm": 1.5546875,
"learning_rate": 0.0007117025158654991,
"loss": 5.399,
"step": 2235
},
{
"epoch": 4.453280318091451,
"grad_norm": 1.6875,
"learning_rate": 0.0007102453761214961,
"loss": 5.336,
"step": 2240
},
{
"epoch": 4.463220675944334,
"grad_norm": 1.4375,
"learning_rate": 0.0007087863096139438,
"loss": 5.302,
"step": 2245
},
{
"epoch": 4.473161033797217,
"grad_norm": 1.375,
"learning_rate": 0.0007073253338864137,
"loss": 5.4155,
"step": 2250
},
{
"epoch": 4.4831013916500995,
"grad_norm": 1.5390625,
"learning_rate": 0.0007058624665054326,
"loss": 5.4536,
"step": 2255
},
{
"epoch": 4.493041749502982,
"grad_norm": 1.4453125,
"learning_rate": 0.0007043977250602732,
"loss": 5.2725,
"step": 2260
},
{
"epoch": 4.502982107355865,
"grad_norm": 1.484375,
"learning_rate": 0.0007029311271627408,
"loss": 5.4126,
"step": 2265
},
{
"epoch": 4.512922465208748,
"grad_norm": 1.4375,
"learning_rate": 0.0007014626904469629,
"loss": 5.39,
"step": 2270
},
{
"epoch": 4.52286282306163,
"grad_norm": 1.4140625,
"learning_rate": 0.0006999924325691765,
"loss": 5.4129,
"step": 2275
},
{
"epoch": 4.532803180914513,
"grad_norm": 1.4296875,
"learning_rate": 0.0006985203712075161,
"loss": 5.364,
"step": 2280
},
{
"epoch": 4.542743538767396,
"grad_norm": 1.328125,
"learning_rate": 0.0006970465240618006,
"loss": 5.2892,
"step": 2285
},
{
"epoch": 4.5526838966202785,
"grad_norm": 1.46875,
"learning_rate": 0.0006955709088533212,
"loss": 5.4663,
"step": 2290
},
{
"epoch": 4.562624254473161,
"grad_norm": 1.421875,
"learning_rate": 0.0006940935433246279,
"loss": 5.516,
"step": 2295
},
{
"epoch": 4.572564612326044,
"grad_norm": 1.4453125,
"learning_rate": 0.0006926144452393163,
"loss": 5.4443,
"step": 2300
},
{
"epoch": 4.582504970178927,
"grad_norm": 1.5078125,
"learning_rate": 0.0006911336323818137,
"loss": 5.3674,
"step": 2305
},
{
"epoch": 4.592445328031809,
"grad_norm": 1.453125,
"learning_rate": 0.000689651122557166,
"loss": 5.412,
"step": 2310
},
{
"epoch": 4.602385685884692,
"grad_norm": 1.65625,
"learning_rate": 0.0006881669335908229,
"loss": 5.4932,
"step": 2315
},
{
"epoch": 4.612326043737575,
"grad_norm": 1.4140625,
"learning_rate": 0.0006866810833284234,
"loss": 5.5086,
"step": 2320
},
{
"epoch": 4.6222664015904575,
"grad_norm": 1.53125,
"learning_rate": 0.0006851935896355827,
"loss": 5.3945,
"step": 2325
},
{
"epoch": 4.63220675944334,
"grad_norm": 1.1953125,
"learning_rate": 0.0006837044703976754,
"loss": 5.2706,
"step": 2330
},
{
"epoch": 4.642147117296223,
"grad_norm": 1.484375,
"learning_rate": 0.0006822137435196214,
"loss": 5.3778,
"step": 2335
},
{
"epoch": 4.652087475149106,
"grad_norm": 1.40625,
"learning_rate": 0.0006807214269256713,
"loss": 5.2928,
"step": 2340
},
{
"epoch": 4.662027833001988,
"grad_norm": 1.484375,
"learning_rate": 0.0006792275385591895,
"loss": 5.3992,
"step": 2345
},
{
"epoch": 4.671968190854871,
"grad_norm": 1.5,
"learning_rate": 0.0006777320963824396,
"loss": 5.4247,
"step": 2350
},
{
"epoch": 4.681908548707754,
"grad_norm": 1.390625,
"learning_rate": 0.0006762351183763674,
"loss": 5.3852,
"step": 2355
},
{
"epoch": 4.691848906560637,
"grad_norm": 1.6015625,
"learning_rate": 0.0006747366225403858,
"loss": 5.4185,
"step": 2360
},
{
"epoch": 4.701789264413518,
"grad_norm": 1.3203125,
"learning_rate": 0.0006732366268921576,
"loss": 5.3019,
"step": 2365
},
{
"epoch": 4.711729622266402,
"grad_norm": 1.5078125,
"learning_rate": 0.0006717351494673791,
"loss": 5.4968,
"step": 2370
},
{
"epoch": 4.721669980119284,
"grad_norm": 1.4921875,
"learning_rate": 0.0006702322083195633,
"loss": 5.3232,
"step": 2375
},
{
"epoch": 4.7316103379721675,
"grad_norm": 1.4609375,
"learning_rate": 0.0006687278215198226,
"loss": 5.355,
"step": 2380
},
{
"epoch": 4.741550695825049,
"grad_norm": 1.453125,
"learning_rate": 0.000667222007156652,
"loss": 5.4293,
"step": 2385
},
{
"epoch": 4.751491053677933,
"grad_norm": 1.4375,
"learning_rate": 0.0006657147833357107,
"loss": 5.4875,
"step": 2390
},
{
"epoch": 4.761431411530815,
"grad_norm": 1.4609375,
"learning_rate": 0.0006642061681796056,
"loss": 5.408,
"step": 2395
},
{
"epoch": 4.7713717693836974,
"grad_norm": 1.484375,
"learning_rate": 0.0006626961798276726,
"loss": 5.3994,
"step": 2400
},
{
"epoch": 4.78131212723658,
"grad_norm": 1.5234375,
"learning_rate": 0.0006611848364357584,
"loss": 5.3414,
"step": 2405
},
{
"epoch": 4.791252485089463,
"grad_norm": 1.5390625,
"learning_rate": 0.0006596721561760028,
"loss": 5.4256,
"step": 2410
},
{
"epoch": 4.801192842942346,
"grad_norm": 1.296875,
"learning_rate": 0.0006581581572366196,
"loss": 5.2564,
"step": 2415
},
{
"epoch": 4.811133200795228,
"grad_norm": 1.4296875,
"learning_rate": 0.0006566428578216785,
"loss": 5.3714,
"step": 2420
},
{
"epoch": 4.821073558648111,
"grad_norm": 1.4296875,
"learning_rate": 0.0006551262761508857,
"loss": 5.3606,
"step": 2425
},
{
"epoch": 4.831013916500994,
"grad_norm": 1.6015625,
"learning_rate": 0.0006536084304593652,
"loss": 5.4128,
"step": 2430
},
{
"epoch": 4.8409542743538765,
"grad_norm": 1.3984375,
"learning_rate": 0.000652089338997439,
"loss": 5.4684,
"step": 2435
},
{
"epoch": 4.850894632206759,
"grad_norm": 1.515625,
"learning_rate": 0.0006505690200304083,
"loss": 5.5118,
"step": 2440
},
{
"epoch": 4.860834990059642,
"grad_norm": 1.53125,
"learning_rate": 0.0006490474918383339,
"loss": 5.3193,
"step": 2445
},
{
"epoch": 4.870775347912525,
"grad_norm": 1.4140625,
"learning_rate": 0.0006475247727158154,
"loss": 5.4077,
"step": 2450
},
{
"epoch": 4.880715705765407,
"grad_norm": 1.796875,
"learning_rate": 0.0006460008809717727,
"loss": 5.432,
"step": 2455
},
{
"epoch": 4.89065606361829,
"grad_norm": 1.5,
"learning_rate": 0.0006444758349292244,
"loss": 5.2856,
"step": 2460
},
{
"epoch": 4.900596421471173,
"grad_norm": 1.3828125,
"learning_rate": 0.0006429496529250689,
"loss": 5.3235,
"step": 2465
},
{
"epoch": 4.9105367793240555,
"grad_norm": 1.46875,
"learning_rate": 0.0006414223533098627,
"loss": 5.3809,
"step": 2470
},
{
"epoch": 4.920477137176938,
"grad_norm": 1.5078125,
"learning_rate": 0.0006398939544476005,
"loss": 5.511,
"step": 2475
},
{
"epoch": 4.930417495029821,
"grad_norm": 1.484375,
"learning_rate": 0.000638364474715494,
"loss": 5.5192,
"step": 2480
},
{
"epoch": 4.940357852882704,
"grad_norm": 1.6953125,
"learning_rate": 0.0006368339325037513,
"loss": 5.3635,
"step": 2485
},
{
"epoch": 4.950298210735586,
"grad_norm": 1.4296875,
"learning_rate": 0.0006353023462153552,
"loss": 5.4371,
"step": 2490
},
{
"epoch": 4.960238568588469,
"grad_norm": 1.46875,
"learning_rate": 0.0006337697342658431,
"loss": 5.4353,
"step": 2495
},
{
"epoch": 4.970178926441352,
"grad_norm": 1.453125,
"learning_rate": 0.0006322361150830839,
"loss": 5.4083,
"step": 2500
},
{
"epoch": 4.970178926441352,
"eval_loss": 6.548668384552002,
"eval_runtime": 1.0034,
"eval_samples_per_second": 3453.088,
"eval_steps_per_second": 432.508,
"step": 2500
},
{
"epoch": 4.980119284294235,
"grad_norm": 1.4765625,
"learning_rate": 0.0006307015071070575,
"loss": 5.3485,
"step": 2505
},
{
"epoch": 4.990059642147117,
"grad_norm": 1.3125,
"learning_rate": 0.0006291659287896334,
"loss": 5.4423,
"step": 2510
},
{
"epoch": 5.0,
"grad_norm": 1.875,
"learning_rate": 0.0006276293985943478,
"loss": 5.3597,
"step": 2515
},
{
"epoch": 5.009940357852883,
"grad_norm": 1.453125,
"learning_rate": 0.0006260919349961824,
"loss": 4.9602,
"step": 2520
},
{
"epoch": 5.019880715705765,
"grad_norm": 1.4375,
"learning_rate": 0.0006245535564813417,
"loss": 5.0917,
"step": 2525
},
{
"epoch": 5.029821073558648,
"grad_norm": 1.4765625,
"learning_rate": 0.0006230142815470312,
"loss": 5.0706,
"step": 2530
},
{
"epoch": 5.039761431411531,
"grad_norm": 1.515625,
"learning_rate": 0.0006214741287012348,
"loss": 5.0238,
"step": 2535
},
{
"epoch": 5.049701789264414,
"grad_norm": 1.3203125,
"learning_rate": 0.0006199331164624922,
"loss": 5.0024,
"step": 2540
},
{
"epoch": 5.059642147117296,
"grad_norm": 1.5546875,
"learning_rate": 0.0006183912633596763,
"loss": 4.9051,
"step": 2545
},
{
"epoch": 5.069582504970179,
"grad_norm": 1.4765625,
"learning_rate": 0.0006168485879317707,
"loss": 5.0626,
"step": 2550
},
{
"epoch": 5.079522862823062,
"grad_norm": 1.484375,
"learning_rate": 0.0006153051087276458,
"loss": 5.0613,
"step": 2555
},
{
"epoch": 5.0894632206759445,
"grad_norm": 1.3984375,
"learning_rate": 0.0006137608443058371,
"loss": 5.0961,
"step": 2560
},
{
"epoch": 5.099403578528827,
"grad_norm": 1.4375,
"learning_rate": 0.0006122158132343213,
"loss": 4.9515,
"step": 2565
},
{
"epoch": 5.10934393638171,
"grad_norm": 1.46875,
"learning_rate": 0.000610670034090293,
"loss": 5.0844,
"step": 2570
},
{
"epoch": 5.119284294234593,
"grad_norm": 1.484375,
"learning_rate": 0.0006091235254599417,
"loss": 5.1001,
"step": 2575
},
{
"epoch": 5.129224652087475,
"grad_norm": 1.59375,
"learning_rate": 0.0006075763059382278,
"loss": 5.0332,
"step": 2580
},
{
"epoch": 5.139165009940358,
"grad_norm": 1.59375,
"learning_rate": 0.0006060283941286597,
"loss": 5.088,
"step": 2585
},
{
"epoch": 5.149105367793241,
"grad_norm": 1.3828125,
"learning_rate": 0.0006044798086430697,
"loss": 5.1015,
"step": 2590
},
{
"epoch": 5.1590457256461235,
"grad_norm": 1.5078125,
"learning_rate": 0.00060293056810139,
"loss": 5.1099,
"step": 2595
},
{
"epoch": 5.168986083499006,
"grad_norm": 1.4296875,
"learning_rate": 0.0006013806911314293,
"loss": 5.0713,
"step": 2600
},
{
"epoch": 5.178926441351889,
"grad_norm": 1.5859375,
"learning_rate": 0.0005998301963686485,
"loss": 5.0783,
"step": 2605
},
{
"epoch": 5.188866799204772,
"grad_norm": 1.3984375,
"learning_rate": 0.0005982791024559371,
"loss": 5.1075,
"step": 2610
},
{
"epoch": 5.198807157057654,
"grad_norm": 1.421875,
"learning_rate": 0.0005967274280433881,
"loss": 5.1167,
"step": 2615
},
{
"epoch": 5.208747514910537,
"grad_norm": 1.484375,
"learning_rate": 0.0005951751917880747,
"loss": 5.0489,
"step": 2620
},
{
"epoch": 5.21868787276342,
"grad_norm": 1.40625,
"learning_rate": 0.0005936224123538254,
"loss": 5.0001,
"step": 2625
},
{
"epoch": 5.2286282306163026,
"grad_norm": 1.453125,
"learning_rate": 0.000592069108411,
"loss": 5.1188,
"step": 2630
},
{
"epoch": 5.238568588469185,
"grad_norm": 1.4921875,
"learning_rate": 0.0005905152986362649,
"loss": 5.0649,
"step": 2635
},
{
"epoch": 5.248508946322068,
"grad_norm": 1.6015625,
"learning_rate": 0.0005889610017123685,
"loss": 5.0793,
"step": 2640
},
{
"epoch": 5.258449304174951,
"grad_norm": 1.2890625,
"learning_rate": 0.0005874062363279164,
"loss": 5.0609,
"step": 2645
},
{
"epoch": 5.2683896620278325,
"grad_norm": 1.5546875,
"learning_rate": 0.0005858510211771469,
"loss": 5.0377,
"step": 2650
},
{
"epoch": 5.278330019880716,
"grad_norm": 1.5078125,
"learning_rate": 0.0005842953749597065,
"loss": 4.9826,
"step": 2655
},
{
"epoch": 5.288270377733598,
"grad_norm": 1.5703125,
"learning_rate": 0.0005827393163804249,
"loss": 4.9537,
"step": 2660
},
{
"epoch": 5.298210735586481,
"grad_norm": 1.3828125,
"learning_rate": 0.0005811828641490892,
"loss": 5.0896,
"step": 2665
},
{
"epoch": 5.308151093439363,
"grad_norm": 1.375,
"learning_rate": 0.0005796260369802205,
"loss": 5.0893,
"step": 2670
},
{
"epoch": 5.318091451292246,
"grad_norm": 1.5234375,
"learning_rate": 0.0005780688535928478,
"loss": 5.1596,
"step": 2675
},
{
"epoch": 5.328031809145129,
"grad_norm": 1.5625,
"learning_rate": 0.0005765113327102831,
"loss": 5.1561,
"step": 2680
},
{
"epoch": 5.337972166998012,
"grad_norm": 1.578125,
"learning_rate": 0.0005749534930598966,
"loss": 5.0675,
"step": 2685
},
{
"epoch": 5.347912524850894,
"grad_norm": 1.4453125,
"learning_rate": 0.0005733953533728912,
"loss": 4.9823,
"step": 2690
},
{
"epoch": 5.357852882703777,
"grad_norm": 1.46875,
"learning_rate": 0.0005718369323840773,
"loss": 4.975,
"step": 2695
},
{
"epoch": 5.36779324055666,
"grad_norm": 1.4765625,
"learning_rate": 0.0005702782488316478,
"loss": 5.1163,
"step": 2700
},
{
"epoch": 5.3777335984095425,
"grad_norm": 1.59375,
"learning_rate": 0.0005687193214569524,
"loss": 5.117,
"step": 2705
},
{
"epoch": 5.387673956262425,
"grad_norm": 1.4453125,
"learning_rate": 0.0005671601690042727,
"loss": 5.0839,
"step": 2710
},
{
"epoch": 5.397614314115308,
"grad_norm": 1.5078125,
"learning_rate": 0.0005656008102205966,
"loss": 5.146,
"step": 2715
},
{
"epoch": 5.407554671968191,
"grad_norm": 1.46875,
"learning_rate": 0.0005640412638553927,
"loss": 4.9767,
"step": 2720
},
{
"epoch": 5.417495029821073,
"grad_norm": 1.546875,
"learning_rate": 0.000562481548660385,
"loss": 5.1535,
"step": 2725
},
{
"epoch": 5.427435387673956,
"grad_norm": 1.4609375,
"learning_rate": 0.000560921683389328,
"loss": 5.1582,
"step": 2730
},
{
"epoch": 5.437375745526839,
"grad_norm": 1.5,
"learning_rate": 0.0005593616867977801,
"loss": 5.0419,
"step": 2735
},
{
"epoch": 5.4473161033797215,
"grad_norm": 1.5703125,
"learning_rate": 0.000557801577642879,
"loss": 5.0471,
"step": 2740
},
{
"epoch": 5.457256461232604,
"grad_norm": 1.5546875,
"learning_rate": 0.0005562413746831156,
"loss": 4.9536,
"step": 2745
},
{
"epoch": 5.467196819085487,
"grad_norm": 1.6328125,
"learning_rate": 0.000554681096678109,
"loss": 5.1315,
"step": 2750
},
{
"epoch": 5.47713717693837,
"grad_norm": 1.4609375,
"learning_rate": 0.0005531207623883801,
"loss": 5.1325,
"step": 2755
},
{
"epoch": 5.487077534791252,
"grad_norm": 1.3984375,
"learning_rate": 0.0005515603905751276,
"loss": 5.101,
"step": 2760
},
{
"epoch": 5.497017892644135,
"grad_norm": 1.5234375,
"learning_rate": 0.00055,
"loss": 5.125,
"step": 2765
},
{
"epoch": 5.506958250497018,
"grad_norm": 1.703125,
"learning_rate": 0.0005484396094248726,
"loss": 5.0818,
"step": 2770
},
{
"epoch": 5.5168986083499005,
"grad_norm": 1.5625,
"learning_rate": 0.0005468792376116198,
"loss": 5.099,
"step": 2775
},
{
"epoch": 5.526838966202783,
"grad_norm": 1.515625,
"learning_rate": 0.0005453189033218912,
"loss": 5.0717,
"step": 2780
},
{
"epoch": 5.536779324055666,
"grad_norm": 1.515625,
"learning_rate": 0.0005437586253168845,
"loss": 4.909,
"step": 2785
},
{
"epoch": 5.546719681908549,
"grad_norm": 1.5078125,
"learning_rate": 0.0005421984223571211,
"loss": 5.0351,
"step": 2790
},
{
"epoch": 5.556660039761431,
"grad_norm": 1.4609375,
"learning_rate": 0.0005406383132022199,
"loss": 4.9336,
"step": 2795
},
{
"epoch": 5.566600397614314,
"grad_norm": 1.375,
"learning_rate": 0.000539078316610672,
"loss": 5.1319,
"step": 2800
},
{
"epoch": 5.576540755467197,
"grad_norm": 1.546875,
"learning_rate": 0.000537518451339615,
"loss": 5.1102,
"step": 2805
},
{
"epoch": 5.58648111332008,
"grad_norm": 1.5625,
"learning_rate": 0.0005359587361446073,
"loss": 5.1353,
"step": 2810
},
{
"epoch": 5.596421471172962,
"grad_norm": 1.6015625,
"learning_rate": 0.0005343991897794036,
"loss": 5.1371,
"step": 2815
},
{
"epoch": 5.606361829025845,
"grad_norm": 1.4453125,
"learning_rate": 0.0005328398309957274,
"loss": 5.0075,
"step": 2820
},
{
"epoch": 5.616302186878728,
"grad_norm": 1.4296875,
"learning_rate": 0.0005312806785430478,
"loss": 5.099,
"step": 2825
},
{
"epoch": 5.6262425447316105,
"grad_norm": 1.2890625,
"learning_rate": 0.0005297217511683524,
"loss": 4.9603,
"step": 2830
},
{
"epoch": 5.636182902584493,
"grad_norm": 1.484375,
"learning_rate": 0.0005281630676159228,
"loss": 4.9791,
"step": 2835
},
{
"epoch": 5.646123260437376,
"grad_norm": 1.4765625,
"learning_rate": 0.0005266046466271089,
"loss": 5.1358,
"step": 2840
},
{
"epoch": 5.656063618290259,
"grad_norm": 1.6328125,
"learning_rate": 0.0005250465069401034,
"loss": 4.919,
"step": 2845
},
{
"epoch": 5.666003976143141,
"grad_norm": 1.5859375,
"learning_rate": 0.000523488667289717,
"loss": 5.0773,
"step": 2850
},
{
"epoch": 5.675944333996024,
"grad_norm": 1.703125,
"learning_rate": 0.0005219311464071524,
"loss": 5.1945,
"step": 2855
},
{
"epoch": 5.685884691848907,
"grad_norm": 1.6640625,
"learning_rate": 0.0005203739630197796,
"loss": 5.1397,
"step": 2860
},
{
"epoch": 5.6958250497017895,
"grad_norm": 1.4296875,
"learning_rate": 0.0005188171358509109,
"loss": 5.126,
"step": 2865
},
{
"epoch": 5.705765407554672,
"grad_norm": 1.546875,
"learning_rate": 0.0005172606836195753,
"loss": 5.1093,
"step": 2870
},
{
"epoch": 5.715705765407555,
"grad_norm": 1.515625,
"learning_rate": 0.0005157046250402936,
"loss": 4.9777,
"step": 2875
},
{
"epoch": 5.725646123260438,
"grad_norm": 1.546875,
"learning_rate": 0.0005141489788228533,
"loss": 5.107,
"step": 2880
},
{
"epoch": 5.73558648111332,
"grad_norm": 1.421875,
"learning_rate": 0.0005125937636720838,
"loss": 5.1157,
"step": 2885
},
{
"epoch": 5.745526838966203,
"grad_norm": 1.5859375,
"learning_rate": 0.0005110389982876316,
"loss": 5.2398,
"step": 2890
},
{
"epoch": 5.755467196819086,
"grad_norm": 1.4375,
"learning_rate": 0.000509484701363735,
"loss": 5.122,
"step": 2895
},
{
"epoch": 5.7654075546719685,
"grad_norm": 1.5625,
"learning_rate": 0.000507930891589,
"loss": 5.1468,
"step": 2900
},
{
"epoch": 5.775347912524851,
"grad_norm": 1.5859375,
"learning_rate": 0.0005063775876461746,
"loss": 5.0708,
"step": 2905
},
{
"epoch": 5.785288270377734,
"grad_norm": 1.40625,
"learning_rate": 0.0005048248082119253,
"loss": 5.1268,
"step": 2910
},
{
"epoch": 5.795228628230617,
"grad_norm": 1.671875,
"learning_rate": 0.000503272571956612,
"loss": 5.1971,
"step": 2915
},
{
"epoch": 5.805168986083499,
"grad_norm": 1.4375,
"learning_rate": 0.000501720897544063,
"loss": 4.9704,
"step": 2920
},
{
"epoch": 5.815109343936381,
"grad_norm": 1.4765625,
"learning_rate": 0.0005001698036313514,
"loss": 5.0857,
"step": 2925
},
{
"epoch": 5.825049701789265,
"grad_norm": 1.671875,
"learning_rate": 0.0004986193088685708,
"loss": 5.1096,
"step": 2930
},
{
"epoch": 5.834990059642147,
"grad_norm": 1.4765625,
"learning_rate": 0.0004970694318986101,
"loss": 5.2106,
"step": 2935
},
{
"epoch": 5.84493041749503,
"grad_norm": 1.484375,
"learning_rate": 0.0004955201913569304,
"loss": 5.2243,
"step": 2940
},
{
"epoch": 5.854870775347912,
"grad_norm": 1.4609375,
"learning_rate": 0.0004939716058713404,
"loss": 5.105,
"step": 2945
},
{
"epoch": 5.864811133200796,
"grad_norm": 1.5859375,
"learning_rate": 0.0004924236940617722,
"loss": 5.1933,
"step": 2950
},
{
"epoch": 5.8747514910536776,
"grad_norm": 1.578125,
"learning_rate": 0.0004908764745400584,
"loss": 5.1201,
"step": 2955
},
{
"epoch": 5.88469184890656,
"grad_norm": 1.46875,
"learning_rate": 0.000489329965909707,
"loss": 5.1032,
"step": 2960
},
{
"epoch": 5.894632206759443,
"grad_norm": 1.4375,
"learning_rate": 0.0004877841867656788,
"loss": 5.0158,
"step": 2965
},
{
"epoch": 5.904572564612326,
"grad_norm": 1.640625,
"learning_rate": 0.000486239155694163,
"loss": 5.12,
"step": 2970
},
{
"epoch": 5.914512922465208,
"grad_norm": 1.5703125,
"learning_rate": 0.00048469489127235424,
"loss": 5.1518,
"step": 2975
},
{
"epoch": 5.924453280318091,
"grad_norm": 1.625,
"learning_rate": 0.00048315141206822944,
"loss": 5.0896,
"step": 2980
},
{
"epoch": 5.934393638170974,
"grad_norm": 1.5234375,
"learning_rate": 0.0004816087366403237,
"loss": 5.1697,
"step": 2985
},
{
"epoch": 5.944333996023857,
"grad_norm": 1.546875,
"learning_rate": 0.0004800668835375078,
"loss": 5.0951,
"step": 2990
},
{
"epoch": 5.954274353876739,
"grad_norm": 1.4609375,
"learning_rate": 0.0004785258712987651,
"loss": 5.1784,
"step": 2995
},
{
"epoch": 5.964214711729622,
"grad_norm": 1.546875,
"learning_rate": 0.0004769857184529688,
"loss": 5.1312,
"step": 3000
},
{
"epoch": 5.964214711729622,
"eval_loss": 6.557336330413818,
"eval_runtime": 1.0006,
"eval_samples_per_second": 3462.823,
"eval_steps_per_second": 433.727,
"step": 3000
}
],
"logging_steps": 5,
"max_steps": 5030,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4137965147197440.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}