Files
Qwen3-4B-Base-ftjob-0511c5e…/checkpoint-288/trainer_state.json
ModelHub XC a6d8788b09 初始化项目,由ModelHub XC社区提供模型
Model: longtermrisk/Qwen3-4B-Base-ftjob-0511c5edc14e
Source: Original Platform
2026-05-05 10:06:50 +08:00

2069 lines
47 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 288,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010471204188481676,
"grad_norm": 0.8203125,
"learning_rate": 0.0,
"loss": 2.2601,
"step": 1
},
{
"epoch": 0.020942408376963352,
"grad_norm": 0.69921875,
"learning_rate": 4.000000000000001e-06,
"loss": 2.2972,
"step": 2
},
{
"epoch": 0.031413612565445025,
"grad_norm": 0.7578125,
"learning_rate": 8.000000000000001e-06,
"loss": 2.3696,
"step": 3
},
{
"epoch": 0.041884816753926704,
"grad_norm": 0.75390625,
"learning_rate": 1.2e-05,
"loss": 2.3109,
"step": 4
},
{
"epoch": 0.05235602094240838,
"grad_norm": 0.73828125,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.3398,
"step": 5
},
{
"epoch": 0.06282722513089005,
"grad_norm": 0.67578125,
"learning_rate": 2e-05,
"loss": 2.2808,
"step": 6
},
{
"epoch": 0.07329842931937172,
"grad_norm": 0.64453125,
"learning_rate": 1.992932862190813e-05,
"loss": 2.3087,
"step": 7
},
{
"epoch": 0.08376963350785341,
"grad_norm": 0.66796875,
"learning_rate": 1.9858657243816254e-05,
"loss": 2.3109,
"step": 8
},
{
"epoch": 0.09424083769633508,
"grad_norm": 0.828125,
"learning_rate": 1.9787985865724383e-05,
"loss": 2.273,
"step": 9
},
{
"epoch": 0.10471204188481675,
"grad_norm": 0.59765625,
"learning_rate": 1.971731448763251e-05,
"loss": 2.2373,
"step": 10
},
{
"epoch": 0.11518324607329843,
"grad_norm": 0.65234375,
"learning_rate": 1.964664310954064e-05,
"loss": 2.2352,
"step": 11
},
{
"epoch": 0.1256544502617801,
"grad_norm": 0.52734375,
"learning_rate": 1.9575971731448763e-05,
"loss": 2.1514,
"step": 12
},
{
"epoch": 0.13612565445026178,
"grad_norm": 0.58984375,
"learning_rate": 1.950530035335689e-05,
"loss": 2.2368,
"step": 13
},
{
"epoch": 0.14659685863874344,
"grad_norm": 0.58984375,
"learning_rate": 1.943462897526502e-05,
"loss": 2.3236,
"step": 14
},
{
"epoch": 0.15706806282722513,
"grad_norm": 0.52734375,
"learning_rate": 1.9363957597173148e-05,
"loss": 2.1718,
"step": 15
},
{
"epoch": 0.16753926701570682,
"grad_norm": 0.53125,
"learning_rate": 1.9293286219081272e-05,
"loss": 2.1896,
"step": 16
},
{
"epoch": 0.17801047120418848,
"grad_norm": 0.50390625,
"learning_rate": 1.92226148409894e-05,
"loss": 2.1297,
"step": 17
},
{
"epoch": 0.18848167539267016,
"grad_norm": 0.51953125,
"learning_rate": 1.915194346289753e-05,
"loss": 2.083,
"step": 18
},
{
"epoch": 0.19895287958115182,
"grad_norm": 0.5078125,
"learning_rate": 1.9081272084805657e-05,
"loss": 2.107,
"step": 19
},
{
"epoch": 0.2094240837696335,
"grad_norm": 0.494140625,
"learning_rate": 1.901060070671378e-05,
"loss": 2.2009,
"step": 20
},
{
"epoch": 0.2198952879581152,
"grad_norm": 0.484375,
"learning_rate": 1.893992932862191e-05,
"loss": 2.1653,
"step": 21
},
{
"epoch": 0.23036649214659685,
"grad_norm": 0.486328125,
"learning_rate": 1.8869257950530038e-05,
"loss": 2.1026,
"step": 22
},
{
"epoch": 0.24083769633507854,
"grad_norm": 0.498046875,
"learning_rate": 1.8798586572438166e-05,
"loss": 2.068,
"step": 23
},
{
"epoch": 0.2513089005235602,
"grad_norm": 0.51953125,
"learning_rate": 1.872791519434629e-05,
"loss": 2.1145,
"step": 24
},
{
"epoch": 0.2617801047120419,
"grad_norm": 0.47265625,
"learning_rate": 1.865724381625442e-05,
"loss": 2.1436,
"step": 25
},
{
"epoch": 0.27225130890052357,
"grad_norm": 0.48828125,
"learning_rate": 1.8586572438162547e-05,
"loss": 2.1229,
"step": 26
},
{
"epoch": 0.28272251308900526,
"grad_norm": 0.498046875,
"learning_rate": 1.8515901060070675e-05,
"loss": 2.1724,
"step": 27
},
{
"epoch": 0.2931937172774869,
"grad_norm": 0.490234375,
"learning_rate": 1.84452296819788e-05,
"loss": 2.0963,
"step": 28
},
{
"epoch": 0.3036649214659686,
"grad_norm": 0.5234375,
"learning_rate": 1.8374558303886928e-05,
"loss": 2.1396,
"step": 29
},
{
"epoch": 0.31413612565445026,
"grad_norm": 0.48046875,
"learning_rate": 1.8303886925795052e-05,
"loss": 2.0926,
"step": 30
},
{
"epoch": 0.32460732984293195,
"grad_norm": 0.48046875,
"learning_rate": 1.8233215547703184e-05,
"loss": 2.0929,
"step": 31
},
{
"epoch": 0.33507853403141363,
"grad_norm": 0.443359375,
"learning_rate": 1.816254416961131e-05,
"loss": 1.9604,
"step": 32
},
{
"epoch": 0.34554973821989526,
"grad_norm": 0.494140625,
"learning_rate": 1.8091872791519437e-05,
"loss": 2.1027,
"step": 33
},
{
"epoch": 0.35602094240837695,
"grad_norm": 0.494140625,
"learning_rate": 1.802120141342756e-05,
"loss": 2.136,
"step": 34
},
{
"epoch": 0.36649214659685864,
"grad_norm": 0.482421875,
"learning_rate": 1.7950530035335693e-05,
"loss": 2.0755,
"step": 35
},
{
"epoch": 0.3769633507853403,
"grad_norm": 0.498046875,
"learning_rate": 1.7879858657243818e-05,
"loss": 2.1039,
"step": 36
},
{
"epoch": 0.387434554973822,
"grad_norm": 0.458984375,
"learning_rate": 1.7809187279151946e-05,
"loss": 2.0986,
"step": 37
},
{
"epoch": 0.39790575916230364,
"grad_norm": 0.48046875,
"learning_rate": 1.773851590106007e-05,
"loss": 2.0914,
"step": 38
},
{
"epoch": 0.4083769633507853,
"grad_norm": 0.486328125,
"learning_rate": 1.76678445229682e-05,
"loss": 2.0567,
"step": 39
},
{
"epoch": 0.418848167539267,
"grad_norm": 0.494140625,
"learning_rate": 1.7597173144876327e-05,
"loss": 2.0717,
"step": 40
},
{
"epoch": 0.4293193717277487,
"grad_norm": 0.470703125,
"learning_rate": 1.7526501766784455e-05,
"loss": 2.0779,
"step": 41
},
{
"epoch": 0.4397905759162304,
"grad_norm": 0.46484375,
"learning_rate": 1.745583038869258e-05,
"loss": 2.0537,
"step": 42
},
{
"epoch": 0.450261780104712,
"grad_norm": 0.47265625,
"learning_rate": 1.7385159010600707e-05,
"loss": 2.0633,
"step": 43
},
{
"epoch": 0.4607329842931937,
"grad_norm": 0.466796875,
"learning_rate": 1.7314487632508836e-05,
"loss": 2.0341,
"step": 44
},
{
"epoch": 0.4712041884816754,
"grad_norm": 0.4921875,
"learning_rate": 1.7243816254416964e-05,
"loss": 2.0344,
"step": 45
},
{
"epoch": 0.4816753926701571,
"grad_norm": 0.5703125,
"learning_rate": 1.717314487632509e-05,
"loss": 2.0667,
"step": 46
},
{
"epoch": 0.49214659685863876,
"grad_norm": 0.4609375,
"learning_rate": 1.7102473498233216e-05,
"loss": 2.0143,
"step": 47
},
{
"epoch": 0.5026178010471204,
"grad_norm": 0.51171875,
"learning_rate": 1.7031802120141345e-05,
"loss": 2.0997,
"step": 48
},
{
"epoch": 0.5130890052356021,
"grad_norm": 0.466796875,
"learning_rate": 1.6961130742049473e-05,
"loss": 2.0175,
"step": 49
},
{
"epoch": 0.5235602094240838,
"grad_norm": 0.47265625,
"learning_rate": 1.6890459363957597e-05,
"loss": 2.0413,
"step": 50
},
{
"epoch": 0.5340314136125655,
"grad_norm": 0.466796875,
"learning_rate": 1.6819787985865726e-05,
"loss": 1.9973,
"step": 51
},
{
"epoch": 0.5445026178010471,
"grad_norm": 0.4765625,
"learning_rate": 1.6749116607773854e-05,
"loss": 2.034,
"step": 52
},
{
"epoch": 0.5549738219895288,
"grad_norm": 0.482421875,
"learning_rate": 1.6678445229681982e-05,
"loss": 1.9726,
"step": 53
},
{
"epoch": 0.5654450261780105,
"grad_norm": 0.47265625,
"learning_rate": 1.6607773851590106e-05,
"loss": 2.078,
"step": 54
},
{
"epoch": 0.5759162303664922,
"grad_norm": 0.490234375,
"learning_rate": 1.6537102473498235e-05,
"loss": 2.0228,
"step": 55
},
{
"epoch": 0.5863874345549738,
"grad_norm": 0.478515625,
"learning_rate": 1.6466431095406363e-05,
"loss": 2.0161,
"step": 56
},
{
"epoch": 0.5968586387434555,
"grad_norm": 0.46875,
"learning_rate": 1.639575971731449e-05,
"loss": 1.9855,
"step": 57
},
{
"epoch": 0.6073298429319371,
"grad_norm": 0.51171875,
"learning_rate": 1.6325088339222615e-05,
"loss": 2.0696,
"step": 58
},
{
"epoch": 0.6178010471204188,
"grad_norm": 0.46875,
"learning_rate": 1.6254416961130744e-05,
"loss": 1.9597,
"step": 59
},
{
"epoch": 0.6282722513089005,
"grad_norm": 0.49609375,
"learning_rate": 1.618374558303887e-05,
"loss": 2.0177,
"step": 60
},
{
"epoch": 0.6387434554973822,
"grad_norm": 0.498046875,
"learning_rate": 1.6113074204946996e-05,
"loss": 1.9962,
"step": 61
},
{
"epoch": 0.6492146596858639,
"grad_norm": 0.50390625,
"learning_rate": 1.6042402826855124e-05,
"loss": 2.0041,
"step": 62
},
{
"epoch": 0.6596858638743456,
"grad_norm": 0.50390625,
"learning_rate": 1.5971731448763253e-05,
"loss": 2.0206,
"step": 63
},
{
"epoch": 0.6701570680628273,
"grad_norm": 0.5078125,
"learning_rate": 1.590106007067138e-05,
"loss": 2.0517,
"step": 64
},
{
"epoch": 0.680628272251309,
"grad_norm": 0.494140625,
"learning_rate": 1.5830388692579505e-05,
"loss": 2.0569,
"step": 65
},
{
"epoch": 0.6910994764397905,
"grad_norm": 0.478515625,
"learning_rate": 1.5759717314487633e-05,
"loss": 2.0778,
"step": 66
},
{
"epoch": 0.7015706806282722,
"grad_norm": 0.478515625,
"learning_rate": 1.568904593639576e-05,
"loss": 2.0151,
"step": 67
},
{
"epoch": 0.7120418848167539,
"grad_norm": 0.48828125,
"learning_rate": 1.561837455830389e-05,
"loss": 2.0526,
"step": 68
},
{
"epoch": 0.7225130890052356,
"grad_norm": 0.490234375,
"learning_rate": 1.5547703180212014e-05,
"loss": 1.9519,
"step": 69
},
{
"epoch": 0.7329842931937173,
"grad_norm": 0.48828125,
"learning_rate": 1.5477031802120142e-05,
"loss": 1.9656,
"step": 70
},
{
"epoch": 0.743455497382199,
"grad_norm": 0.498046875,
"learning_rate": 1.540636042402827e-05,
"loss": 1.9285,
"step": 71
},
{
"epoch": 0.7539267015706806,
"grad_norm": 0.48046875,
"learning_rate": 1.53356890459364e-05,
"loss": 2.0631,
"step": 72
},
{
"epoch": 0.7643979057591623,
"grad_norm": 0.474609375,
"learning_rate": 1.5265017667844523e-05,
"loss": 1.971,
"step": 73
},
{
"epoch": 0.774869109947644,
"grad_norm": 0.5390625,
"learning_rate": 1.519434628975265e-05,
"loss": 2.0301,
"step": 74
},
{
"epoch": 0.7853403141361257,
"grad_norm": 0.45703125,
"learning_rate": 1.512367491166078e-05,
"loss": 1.9461,
"step": 75
},
{
"epoch": 0.7958115183246073,
"grad_norm": 0.4765625,
"learning_rate": 1.5053003533568906e-05,
"loss": 1.9787,
"step": 76
},
{
"epoch": 0.806282722513089,
"grad_norm": 0.486328125,
"learning_rate": 1.4982332155477032e-05,
"loss": 1.9635,
"step": 77
},
{
"epoch": 0.8167539267015707,
"grad_norm": 0.5390625,
"learning_rate": 1.4911660777385159e-05,
"loss": 2.0554,
"step": 78
},
{
"epoch": 0.8272251308900523,
"grad_norm": 0.5,
"learning_rate": 1.4840989399293289e-05,
"loss": 2.0113,
"step": 79
},
{
"epoch": 0.837696335078534,
"grad_norm": 0.478515625,
"learning_rate": 1.4770318021201415e-05,
"loss": 1.9093,
"step": 80
},
{
"epoch": 0.8481675392670157,
"grad_norm": 0.546875,
"learning_rate": 1.4699646643109541e-05,
"loss": 2.0891,
"step": 81
},
{
"epoch": 0.8586387434554974,
"grad_norm": 0.5078125,
"learning_rate": 1.4628975265017668e-05,
"loss": 2.0403,
"step": 82
},
{
"epoch": 0.8691099476439791,
"grad_norm": 0.50390625,
"learning_rate": 1.4558303886925796e-05,
"loss": 1.8833,
"step": 83
},
{
"epoch": 0.8795811518324608,
"grad_norm": 0.51953125,
"learning_rate": 1.4487632508833924e-05,
"loss": 2.005,
"step": 84
},
{
"epoch": 0.8900523560209425,
"grad_norm": 0.53125,
"learning_rate": 1.441696113074205e-05,
"loss": 2.0341,
"step": 85
},
{
"epoch": 0.900523560209424,
"grad_norm": 0.5,
"learning_rate": 1.4346289752650177e-05,
"loss": 1.9743,
"step": 86
},
{
"epoch": 0.9109947643979057,
"grad_norm": 0.515625,
"learning_rate": 1.4275618374558305e-05,
"loss": 1.994,
"step": 87
},
{
"epoch": 0.9214659685863874,
"grad_norm": 0.48046875,
"learning_rate": 1.4204946996466433e-05,
"loss": 1.9509,
"step": 88
},
{
"epoch": 0.9319371727748691,
"grad_norm": 0.53125,
"learning_rate": 1.413427561837456e-05,
"loss": 1.9457,
"step": 89
},
{
"epoch": 0.9424083769633508,
"grad_norm": 0.50390625,
"learning_rate": 1.4063604240282686e-05,
"loss": 2.0125,
"step": 90
},
{
"epoch": 0.9528795811518325,
"grad_norm": 0.55078125,
"learning_rate": 1.3992932862190814e-05,
"loss": 1.999,
"step": 91
},
{
"epoch": 0.9633507853403142,
"grad_norm": 0.498046875,
"learning_rate": 1.392226148409894e-05,
"loss": 2.0245,
"step": 92
},
{
"epoch": 0.9738219895287958,
"grad_norm": 0.59765625,
"learning_rate": 1.3851590106007068e-05,
"loss": 2.0204,
"step": 93
},
{
"epoch": 0.9842931937172775,
"grad_norm": 0.53515625,
"learning_rate": 1.3780918727915195e-05,
"loss": 2.0003,
"step": 94
},
{
"epoch": 0.9947643979057592,
"grad_norm": 0.51171875,
"learning_rate": 1.3710247349823323e-05,
"loss": 1.9902,
"step": 95
},
{
"epoch": 1.0,
"grad_norm": 0.8828125,
"learning_rate": 1.363957597173145e-05,
"loss": 1.8842,
"step": 96
},
{
"epoch": 1.0,
"eval_loss": 1.9897371530532837,
"eval_model_preparation_time": 0.0174,
"eval_runtime": 6.3645,
"eval_samples_per_second": 26.711,
"eval_steps_per_second": 13.355,
"step": 96
},
{
"epoch": 1.0104712041884816,
"grad_norm": 0.5,
"learning_rate": 1.3568904593639577e-05,
"loss": 1.9183,
"step": 97
},
{
"epoch": 1.0209424083769634,
"grad_norm": 0.490234375,
"learning_rate": 1.3498233215547704e-05,
"loss": 1.9927,
"step": 98
},
{
"epoch": 1.031413612565445,
"grad_norm": 0.5390625,
"learning_rate": 1.3427561837455832e-05,
"loss": 1.9935,
"step": 99
},
{
"epoch": 1.0418848167539267,
"grad_norm": 0.57421875,
"learning_rate": 1.3356890459363958e-05,
"loss": 2.0505,
"step": 100
},
{
"epoch": 1.0523560209424083,
"grad_norm": 0.55078125,
"learning_rate": 1.3286219081272085e-05,
"loss": 1.9709,
"step": 101
},
{
"epoch": 1.0628272251308901,
"grad_norm": 0.4921875,
"learning_rate": 1.3215547703180213e-05,
"loss": 2.0051,
"step": 102
},
{
"epoch": 1.0732984293193717,
"grad_norm": 0.5234375,
"learning_rate": 1.3144876325088341e-05,
"loss": 1.9533,
"step": 103
},
{
"epoch": 1.0837696335078535,
"grad_norm": 0.5,
"learning_rate": 1.3074204946996467e-05,
"loss": 1.9422,
"step": 104
},
{
"epoch": 1.094240837696335,
"grad_norm": 0.62109375,
"learning_rate": 1.3003533568904594e-05,
"loss": 1.9434,
"step": 105
},
{
"epoch": 1.1047120418848166,
"grad_norm": 0.515625,
"learning_rate": 1.2932862190812724e-05,
"loss": 1.9051,
"step": 106
},
{
"epoch": 1.1151832460732984,
"grad_norm": 0.4765625,
"learning_rate": 1.286219081272085e-05,
"loss": 1.9071,
"step": 107
},
{
"epoch": 1.12565445026178,
"grad_norm": 0.51171875,
"learning_rate": 1.2791519434628976e-05,
"loss": 1.9677,
"step": 108
},
{
"epoch": 1.1361256544502618,
"grad_norm": 0.49609375,
"learning_rate": 1.2720848056537103e-05,
"loss": 2.0063,
"step": 109
},
{
"epoch": 1.1465968586387434,
"grad_norm": 0.5234375,
"learning_rate": 1.2650176678445233e-05,
"loss": 1.9198,
"step": 110
},
{
"epoch": 1.1570680628272252,
"grad_norm": 0.51953125,
"learning_rate": 1.2579505300353359e-05,
"loss": 1.9661,
"step": 111
},
{
"epoch": 1.1675392670157068,
"grad_norm": 0.515625,
"learning_rate": 1.2508833922261485e-05,
"loss": 1.9719,
"step": 112
},
{
"epoch": 1.1780104712041886,
"grad_norm": 0.51171875,
"learning_rate": 1.2438162544169612e-05,
"loss": 1.9422,
"step": 113
},
{
"epoch": 1.1884816753926701,
"grad_norm": 0.5390625,
"learning_rate": 1.2367491166077738e-05,
"loss": 1.9787,
"step": 114
},
{
"epoch": 1.1989528795811517,
"grad_norm": 0.515625,
"learning_rate": 1.2296819787985868e-05,
"loss": 2.0106,
"step": 115
},
{
"epoch": 1.2094240837696335,
"grad_norm": 0.5546875,
"learning_rate": 1.2226148409893994e-05,
"loss": 1.9833,
"step": 116
},
{
"epoch": 1.2198952879581153,
"grad_norm": 0.55078125,
"learning_rate": 1.2155477031802121e-05,
"loss": 1.9402,
"step": 117
},
{
"epoch": 1.2303664921465969,
"grad_norm": 0.51171875,
"learning_rate": 1.2084805653710247e-05,
"loss": 1.8728,
"step": 118
},
{
"epoch": 1.2408376963350785,
"grad_norm": 0.51171875,
"learning_rate": 1.2014134275618377e-05,
"loss": 1.8937,
"step": 119
},
{
"epoch": 1.2513089005235603,
"grad_norm": 0.5078125,
"learning_rate": 1.1943462897526503e-05,
"loss": 1.9485,
"step": 120
},
{
"epoch": 1.2617801047120418,
"grad_norm": 0.5625,
"learning_rate": 1.187279151943463e-05,
"loss": 1.9519,
"step": 121
},
{
"epoch": 1.2722513089005236,
"grad_norm": 0.56640625,
"learning_rate": 1.1802120141342756e-05,
"loss": 2.0045,
"step": 122
},
{
"epoch": 1.2827225130890052,
"grad_norm": 0.5078125,
"learning_rate": 1.1731448763250883e-05,
"loss": 1.9656,
"step": 123
},
{
"epoch": 1.2931937172774868,
"grad_norm": 0.51171875,
"learning_rate": 1.1660777385159012e-05,
"loss": 1.9464,
"step": 124
},
{
"epoch": 1.3036649214659686,
"grad_norm": 0.5390625,
"learning_rate": 1.1590106007067139e-05,
"loss": 1.9476,
"step": 125
},
{
"epoch": 1.3141361256544504,
"grad_norm": 0.50390625,
"learning_rate": 1.1519434628975265e-05,
"loss": 1.9434,
"step": 126
},
{
"epoch": 1.324607329842932,
"grad_norm": 0.5546875,
"learning_rate": 1.1448763250883392e-05,
"loss": 2.0212,
"step": 127
},
{
"epoch": 1.3350785340314135,
"grad_norm": 0.53125,
"learning_rate": 1.1378091872791521e-05,
"loss": 1.9427,
"step": 128
},
{
"epoch": 1.3455497382198953,
"grad_norm": 0.546875,
"learning_rate": 1.1307420494699648e-05,
"loss": 1.9057,
"step": 129
},
{
"epoch": 1.356020942408377,
"grad_norm": 0.5078125,
"learning_rate": 1.1236749116607774e-05,
"loss": 1.8317,
"step": 130
},
{
"epoch": 1.3664921465968587,
"grad_norm": 0.51171875,
"learning_rate": 1.11660777385159e-05,
"loss": 1.9583,
"step": 131
},
{
"epoch": 1.3769633507853403,
"grad_norm": 0.53515625,
"learning_rate": 1.1095406360424029e-05,
"loss": 1.9704,
"step": 132
},
{
"epoch": 1.387434554973822,
"grad_norm": 0.53125,
"learning_rate": 1.1024734982332157e-05,
"loss": 1.9644,
"step": 133
},
{
"epoch": 1.3979057591623036,
"grad_norm": 0.53515625,
"learning_rate": 1.0954063604240283e-05,
"loss": 1.8924,
"step": 134
},
{
"epoch": 1.4083769633507854,
"grad_norm": 0.53515625,
"learning_rate": 1.088339222614841e-05,
"loss": 1.9863,
"step": 135
},
{
"epoch": 1.418848167539267,
"grad_norm": 0.53515625,
"learning_rate": 1.0812720848056538e-05,
"loss": 1.9072,
"step": 136
},
{
"epoch": 1.4293193717277486,
"grad_norm": 0.53125,
"learning_rate": 1.0742049469964666e-05,
"loss": 1.9805,
"step": 137
},
{
"epoch": 1.4397905759162304,
"grad_norm": 0.52734375,
"learning_rate": 1.0671378091872792e-05,
"loss": 1.9028,
"step": 138
},
{
"epoch": 1.450261780104712,
"grad_norm": 0.52734375,
"learning_rate": 1.0600706713780919e-05,
"loss": 1.8999,
"step": 139
},
{
"epoch": 1.4607329842931938,
"grad_norm": 0.55859375,
"learning_rate": 1.0530035335689047e-05,
"loss": 1.9354,
"step": 140
},
{
"epoch": 1.4712041884816753,
"grad_norm": 0.5234375,
"learning_rate": 1.0459363957597175e-05,
"loss": 1.9385,
"step": 141
},
{
"epoch": 1.4816753926701571,
"grad_norm": 0.5234375,
"learning_rate": 1.0388692579505301e-05,
"loss": 1.8297,
"step": 142
},
{
"epoch": 1.4921465968586387,
"grad_norm": 0.54296875,
"learning_rate": 1.0318021201413428e-05,
"loss": 1.9386,
"step": 143
},
{
"epoch": 1.5026178010471205,
"grad_norm": 0.5390625,
"learning_rate": 1.0247349823321556e-05,
"loss": 1.9242,
"step": 144
},
{
"epoch": 1.513089005235602,
"grad_norm": 0.5,
"learning_rate": 1.0176678445229682e-05,
"loss": 1.894,
"step": 145
},
{
"epoch": 1.5235602094240837,
"grad_norm": 0.53125,
"learning_rate": 1.010600706713781e-05,
"loss": 1.9134,
"step": 146
},
{
"epoch": 1.5340314136125655,
"grad_norm": 0.53515625,
"learning_rate": 1.0035335689045937e-05,
"loss": 1.9363,
"step": 147
},
{
"epoch": 1.5445026178010473,
"grad_norm": 0.640625,
"learning_rate": 9.964664310954065e-06,
"loss": 1.8895,
"step": 148
},
{
"epoch": 1.5549738219895288,
"grad_norm": 0.5625,
"learning_rate": 9.893992932862191e-06,
"loss": 2.008,
"step": 149
},
{
"epoch": 1.5654450261780104,
"grad_norm": 0.53515625,
"learning_rate": 9.82332155477032e-06,
"loss": 1.9005,
"step": 150
},
{
"epoch": 1.5759162303664922,
"grad_norm": 0.5703125,
"learning_rate": 9.752650176678446e-06,
"loss": 1.9343,
"step": 151
},
{
"epoch": 1.5863874345549738,
"grad_norm": 0.52734375,
"learning_rate": 9.681978798586574e-06,
"loss": 1.9238,
"step": 152
},
{
"epoch": 1.5968586387434556,
"grad_norm": 0.5234375,
"learning_rate": 9.6113074204947e-06,
"loss": 1.9308,
"step": 153
},
{
"epoch": 1.6073298429319371,
"grad_norm": 0.5078125,
"learning_rate": 9.540636042402828e-06,
"loss": 1.9385,
"step": 154
},
{
"epoch": 1.6178010471204187,
"grad_norm": 0.52734375,
"learning_rate": 9.469964664310955e-06,
"loss": 1.9442,
"step": 155
},
{
"epoch": 1.6282722513089005,
"grad_norm": 0.5546875,
"learning_rate": 9.399293286219083e-06,
"loss": 1.9648,
"step": 156
},
{
"epoch": 1.6387434554973823,
"grad_norm": 0.53125,
"learning_rate": 9.32862190812721e-06,
"loss": 1.9179,
"step": 157
},
{
"epoch": 1.649214659685864,
"grad_norm": 0.51953125,
"learning_rate": 9.257950530035337e-06,
"loss": 1.9523,
"step": 158
},
{
"epoch": 1.6596858638743455,
"grad_norm": 0.53515625,
"learning_rate": 9.187279151943464e-06,
"loss": 1.8984,
"step": 159
},
{
"epoch": 1.6701570680628273,
"grad_norm": 0.57421875,
"learning_rate": 9.116607773851592e-06,
"loss": 1.9394,
"step": 160
},
{
"epoch": 1.680628272251309,
"grad_norm": 0.53125,
"learning_rate": 9.045936395759718e-06,
"loss": 1.9487,
"step": 161
},
{
"epoch": 1.6910994764397906,
"grad_norm": 0.5625,
"learning_rate": 8.975265017667846e-06,
"loss": 1.9139,
"step": 162
},
{
"epoch": 1.7015706806282722,
"grad_norm": 0.54296875,
"learning_rate": 8.904593639575973e-06,
"loss": 1.9948,
"step": 163
},
{
"epoch": 1.7120418848167538,
"grad_norm": 0.5390625,
"learning_rate": 8.8339222614841e-06,
"loss": 1.8555,
"step": 164
},
{
"epoch": 1.7225130890052356,
"grad_norm": 0.56640625,
"learning_rate": 8.763250883392227e-06,
"loss": 1.9637,
"step": 165
},
{
"epoch": 1.7329842931937174,
"grad_norm": 0.54296875,
"learning_rate": 8.692579505300354e-06,
"loss": 1.9455,
"step": 166
},
{
"epoch": 1.743455497382199,
"grad_norm": 0.52734375,
"learning_rate": 8.621908127208482e-06,
"loss": 1.9076,
"step": 167
},
{
"epoch": 1.7539267015706805,
"grad_norm": 0.60546875,
"learning_rate": 8.551236749116608e-06,
"loss": 1.9631,
"step": 168
},
{
"epoch": 1.7643979057591623,
"grad_norm": 0.60546875,
"learning_rate": 8.480565371024736e-06,
"loss": 1.9204,
"step": 169
},
{
"epoch": 1.7748691099476441,
"grad_norm": 0.53515625,
"learning_rate": 8.409893992932863e-06,
"loss": 1.9112,
"step": 170
},
{
"epoch": 1.7853403141361257,
"grad_norm": 0.5546875,
"learning_rate": 8.339222614840991e-06,
"loss": 1.9782,
"step": 171
},
{
"epoch": 1.7958115183246073,
"grad_norm": 0.53125,
"learning_rate": 8.268551236749117e-06,
"loss": 1.9293,
"step": 172
},
{
"epoch": 1.8062827225130889,
"grad_norm": 0.53515625,
"learning_rate": 8.197879858657245e-06,
"loss": 1.9181,
"step": 173
},
{
"epoch": 1.8167539267015707,
"grad_norm": 0.55078125,
"learning_rate": 8.127208480565372e-06,
"loss": 1.942,
"step": 174
},
{
"epoch": 1.8272251308900525,
"grad_norm": 0.51953125,
"learning_rate": 8.056537102473498e-06,
"loss": 1.8667,
"step": 175
},
{
"epoch": 1.837696335078534,
"grad_norm": 0.5625,
"learning_rate": 7.985865724381626e-06,
"loss": 1.9354,
"step": 176
},
{
"epoch": 1.8481675392670156,
"grad_norm": 0.53125,
"learning_rate": 7.915194346289753e-06,
"loss": 1.8853,
"step": 177
},
{
"epoch": 1.8586387434554974,
"grad_norm": 0.57421875,
"learning_rate": 7.84452296819788e-06,
"loss": 1.986,
"step": 178
},
{
"epoch": 1.8691099476439792,
"grad_norm": 0.53515625,
"learning_rate": 7.773851590106007e-06,
"loss": 1.8868,
"step": 179
},
{
"epoch": 1.8795811518324608,
"grad_norm": 0.50390625,
"learning_rate": 7.703180212014135e-06,
"loss": 1.9034,
"step": 180
},
{
"epoch": 1.8900523560209423,
"grad_norm": 0.54296875,
"learning_rate": 7.632508833922262e-06,
"loss": 1.8874,
"step": 181
},
{
"epoch": 1.900523560209424,
"grad_norm": 0.578125,
"learning_rate": 7.56183745583039e-06,
"loss": 1.9344,
"step": 182
},
{
"epoch": 1.9109947643979057,
"grad_norm": 0.53515625,
"learning_rate": 7.491166077738516e-06,
"loss": 1.9538,
"step": 183
},
{
"epoch": 1.9214659685863875,
"grad_norm": 0.59375,
"learning_rate": 7.420494699646644e-06,
"loss": 1.9534,
"step": 184
},
{
"epoch": 1.931937172774869,
"grad_norm": 0.53515625,
"learning_rate": 7.349823321554771e-06,
"loss": 1.9045,
"step": 185
},
{
"epoch": 1.9424083769633507,
"grad_norm": 0.546875,
"learning_rate": 7.279151943462898e-06,
"loss": 1.8959,
"step": 186
},
{
"epoch": 1.9528795811518325,
"grad_norm": 0.54296875,
"learning_rate": 7.208480565371025e-06,
"loss": 1.9525,
"step": 187
},
{
"epoch": 1.9633507853403143,
"grad_norm": 0.55859375,
"learning_rate": 7.1378091872791525e-06,
"loss": 1.9923,
"step": 188
},
{
"epoch": 1.9738219895287958,
"grad_norm": 0.55859375,
"learning_rate": 7.06713780918728e-06,
"loss": 2.0032,
"step": 189
},
{
"epoch": 1.9842931937172774,
"grad_norm": 0.5546875,
"learning_rate": 6.996466431095407e-06,
"loss": 1.9329,
"step": 190
},
{
"epoch": 1.9947643979057592,
"grad_norm": 0.546875,
"learning_rate": 6.925795053003534e-06,
"loss": 1.9355,
"step": 191
},
{
"epoch": 2.0,
"grad_norm": 0.828125,
"learning_rate": 6.8551236749116615e-06,
"loss": 1.984,
"step": 192
},
{
"epoch": 2.0,
"eval_loss": 1.9503586292266846,
"eval_model_preparation_time": 0.0174,
"eval_runtime": 6.3629,
"eval_samples_per_second": 26.717,
"eval_steps_per_second": 13.359,
"step": 192
},
{
"epoch": 2.0104712041884816,
"grad_norm": 0.52734375,
"learning_rate": 6.784452296819789e-06,
"loss": 1.8839,
"step": 193
},
{
"epoch": 2.020942408376963,
"grad_norm": 0.52734375,
"learning_rate": 6.713780918727916e-06,
"loss": 1.9087,
"step": 194
},
{
"epoch": 2.031413612565445,
"grad_norm": 0.5625,
"learning_rate": 6.643109540636042e-06,
"loss": 1.971,
"step": 195
},
{
"epoch": 2.0418848167539267,
"grad_norm": 0.53515625,
"learning_rate": 6.5724381625441705e-06,
"loss": 1.9044,
"step": 196
},
{
"epoch": 2.0523560209424083,
"grad_norm": 0.56640625,
"learning_rate": 6.501766784452297e-06,
"loss": 1.9485,
"step": 197
},
{
"epoch": 2.06282722513089,
"grad_norm": 0.5078125,
"learning_rate": 6.431095406360425e-06,
"loss": 1.8813,
"step": 198
},
{
"epoch": 2.073298429319372,
"grad_norm": 0.55859375,
"learning_rate": 6.360424028268551e-06,
"loss": 1.9121,
"step": 199
},
{
"epoch": 2.0837696335078535,
"grad_norm": 0.52734375,
"learning_rate": 6.2897526501766795e-06,
"loss": 1.8859,
"step": 200
},
{
"epoch": 2.094240837696335,
"grad_norm": 0.60546875,
"learning_rate": 6.219081272084806e-06,
"loss": 1.8837,
"step": 201
},
{
"epoch": 2.1047120418848166,
"grad_norm": 0.55078125,
"learning_rate": 6.148409893992934e-06,
"loss": 1.9064,
"step": 202
},
{
"epoch": 2.115183246073298,
"grad_norm": 0.54296875,
"learning_rate": 6.0777385159010604e-06,
"loss": 1.8737,
"step": 203
},
{
"epoch": 2.1256544502617802,
"grad_norm": 0.5546875,
"learning_rate": 6.0070671378091885e-06,
"loss": 2.0019,
"step": 204
},
{
"epoch": 2.136125654450262,
"grad_norm": 0.5390625,
"learning_rate": 5.936395759717315e-06,
"loss": 1.8755,
"step": 205
},
{
"epoch": 2.1465968586387434,
"grad_norm": 0.578125,
"learning_rate": 5.865724381625441e-06,
"loss": 1.8861,
"step": 206
},
{
"epoch": 2.157068062827225,
"grad_norm": 0.55078125,
"learning_rate": 5.7950530035335694e-06,
"loss": 1.9246,
"step": 207
},
{
"epoch": 2.167539267015707,
"grad_norm": 0.5390625,
"learning_rate": 5.724381625441696e-06,
"loss": 1.888,
"step": 208
},
{
"epoch": 2.1780104712041886,
"grad_norm": 0.578125,
"learning_rate": 5.653710247349824e-06,
"loss": 1.9335,
"step": 209
},
{
"epoch": 2.18848167539267,
"grad_norm": 0.59375,
"learning_rate": 5.58303886925795e-06,
"loss": 1.9693,
"step": 210
},
{
"epoch": 2.1989528795811517,
"grad_norm": 0.57421875,
"learning_rate": 5.5123674911660785e-06,
"loss": 1.8692,
"step": 211
},
{
"epoch": 2.2094240837696333,
"grad_norm": 0.5234375,
"learning_rate": 5.441696113074205e-06,
"loss": 1.8835,
"step": 212
},
{
"epoch": 2.2198952879581153,
"grad_norm": 0.55078125,
"learning_rate": 5.371024734982333e-06,
"loss": 1.9988,
"step": 213
},
{
"epoch": 2.230366492146597,
"grad_norm": 0.546875,
"learning_rate": 5.300353356890459e-06,
"loss": 1.8328,
"step": 214
},
{
"epoch": 2.2408376963350785,
"grad_norm": 0.55859375,
"learning_rate": 5.2296819787985875e-06,
"loss": 1.8653,
"step": 215
},
{
"epoch": 2.25130890052356,
"grad_norm": 0.59375,
"learning_rate": 5.159010600706714e-06,
"loss": 1.9541,
"step": 216
},
{
"epoch": 2.261780104712042,
"grad_norm": 0.5625,
"learning_rate": 5.088339222614841e-06,
"loss": 1.896,
"step": 217
},
{
"epoch": 2.2722513089005236,
"grad_norm": 0.51953125,
"learning_rate": 5.017667844522968e-06,
"loss": 1.8845,
"step": 218
},
{
"epoch": 2.282722513089005,
"grad_norm": 0.56640625,
"learning_rate": 4.946996466431096e-06,
"loss": 1.8785,
"step": 219
},
{
"epoch": 2.2931937172774868,
"grad_norm": 0.53515625,
"learning_rate": 4.876325088339223e-06,
"loss": 1.8757,
"step": 220
},
{
"epoch": 2.303664921465969,
"grad_norm": 0.5546875,
"learning_rate": 4.80565371024735e-06,
"loss": 1.901,
"step": 221
},
{
"epoch": 2.3141361256544504,
"grad_norm": 0.56640625,
"learning_rate": 4.734982332155477e-06,
"loss": 1.9568,
"step": 222
},
{
"epoch": 2.324607329842932,
"grad_norm": 0.5390625,
"learning_rate": 4.664310954063605e-06,
"loss": 1.8776,
"step": 223
},
{
"epoch": 2.3350785340314135,
"grad_norm": 0.54296875,
"learning_rate": 4.593639575971732e-06,
"loss": 1.8304,
"step": 224
},
{
"epoch": 2.345549738219895,
"grad_norm": 0.5390625,
"learning_rate": 4.522968197879859e-06,
"loss": 1.9038,
"step": 225
},
{
"epoch": 2.356020942408377,
"grad_norm": 0.546875,
"learning_rate": 4.452296819787986e-06,
"loss": 1.9319,
"step": 226
},
{
"epoch": 2.3664921465968587,
"grad_norm": 0.55859375,
"learning_rate": 4.381625441696114e-06,
"loss": 1.9357,
"step": 227
},
{
"epoch": 2.3769633507853403,
"grad_norm": 0.53125,
"learning_rate": 4.310954063604241e-06,
"loss": 1.8749,
"step": 228
},
{
"epoch": 2.387434554973822,
"grad_norm": 0.546875,
"learning_rate": 4.240282685512368e-06,
"loss": 1.8875,
"step": 229
},
{
"epoch": 2.3979057591623034,
"grad_norm": 0.578125,
"learning_rate": 4.1696113074204954e-06,
"loss": 1.8741,
"step": 230
},
{
"epoch": 2.4083769633507854,
"grad_norm": 0.57421875,
"learning_rate": 4.098939929328623e-06,
"loss": 1.9785,
"step": 231
},
{
"epoch": 2.418848167539267,
"grad_norm": 0.59765625,
"learning_rate": 4.028268551236749e-06,
"loss": 1.936,
"step": 232
},
{
"epoch": 2.4293193717277486,
"grad_norm": 0.55078125,
"learning_rate": 3.957597173144876e-06,
"loss": 1.8668,
"step": 233
},
{
"epoch": 2.4397905759162306,
"grad_norm": 0.5703125,
"learning_rate": 3.886925795053004e-06,
"loss": 1.941,
"step": 234
},
{
"epoch": 2.450261780104712,
"grad_norm": 0.55078125,
"learning_rate": 3.816254416961131e-06,
"loss": 1.9373,
"step": 235
},
{
"epoch": 2.4607329842931938,
"grad_norm": 0.53515625,
"learning_rate": 3.745583038869258e-06,
"loss": 1.8342,
"step": 236
},
{
"epoch": 2.4712041884816753,
"grad_norm": 0.5390625,
"learning_rate": 3.6749116607773854e-06,
"loss": 1.9079,
"step": 237
},
{
"epoch": 2.481675392670157,
"grad_norm": 0.57421875,
"learning_rate": 3.6042402826855126e-06,
"loss": 1.873,
"step": 238
},
{
"epoch": 2.492146596858639,
"grad_norm": 0.55078125,
"learning_rate": 3.53356890459364e-06,
"loss": 1.88,
"step": 239
},
{
"epoch": 2.5026178010471205,
"grad_norm": 0.5625,
"learning_rate": 3.462897526501767e-06,
"loss": 1.9091,
"step": 240
},
{
"epoch": 2.513089005235602,
"grad_norm": 0.546875,
"learning_rate": 3.3922261484098944e-06,
"loss": 1.9115,
"step": 241
},
{
"epoch": 2.5235602094240837,
"grad_norm": 0.53125,
"learning_rate": 3.321554770318021e-06,
"loss": 1.847,
"step": 242
},
{
"epoch": 2.5340314136125652,
"grad_norm": 0.55078125,
"learning_rate": 3.2508833922261485e-06,
"loss": 1.8699,
"step": 243
},
{
"epoch": 2.5445026178010473,
"grad_norm": 0.55859375,
"learning_rate": 3.1802120141342757e-06,
"loss": 1.9372,
"step": 244
},
{
"epoch": 2.554973821989529,
"grad_norm": 0.51171875,
"learning_rate": 3.109540636042403e-06,
"loss": 1.8895,
"step": 245
},
{
"epoch": 2.5654450261780104,
"grad_norm": 0.59375,
"learning_rate": 3.0388692579505302e-06,
"loss": 1.9855,
"step": 246
},
{
"epoch": 2.5759162303664924,
"grad_norm": 0.55859375,
"learning_rate": 2.9681978798586575e-06,
"loss": 1.9301,
"step": 247
},
{
"epoch": 2.5863874345549736,
"grad_norm": 0.56640625,
"learning_rate": 2.8975265017667847e-06,
"loss": 1.9153,
"step": 248
},
{
"epoch": 2.5968586387434556,
"grad_norm": 0.58984375,
"learning_rate": 2.826855123674912e-06,
"loss": 1.8976,
"step": 249
},
{
"epoch": 2.607329842931937,
"grad_norm": 0.55078125,
"learning_rate": 2.7561837455830392e-06,
"loss": 1.9193,
"step": 250
},
{
"epoch": 2.6178010471204187,
"grad_norm": 0.5390625,
"learning_rate": 2.6855123674911665e-06,
"loss": 1.8798,
"step": 251
},
{
"epoch": 2.6282722513089007,
"grad_norm": 0.55078125,
"learning_rate": 2.6148409893992937e-06,
"loss": 1.8157,
"step": 252
},
{
"epoch": 2.6387434554973823,
"grad_norm": 0.59375,
"learning_rate": 2.5441696113074206e-06,
"loss": 1.9138,
"step": 253
},
{
"epoch": 2.649214659685864,
"grad_norm": 0.5859375,
"learning_rate": 2.473498233215548e-06,
"loss": 1.9798,
"step": 254
},
{
"epoch": 2.6596858638743455,
"grad_norm": 0.58984375,
"learning_rate": 2.402826855123675e-06,
"loss": 1.924,
"step": 255
},
{
"epoch": 2.670157068062827,
"grad_norm": 0.5625,
"learning_rate": 2.3321554770318023e-06,
"loss": 1.9763,
"step": 256
},
{
"epoch": 2.680628272251309,
"grad_norm": 0.51953125,
"learning_rate": 2.2614840989399296e-06,
"loss": 1.8371,
"step": 257
},
{
"epoch": 2.6910994764397906,
"grad_norm": 0.58203125,
"learning_rate": 2.190812720848057e-06,
"loss": 1.9625,
"step": 258
},
{
"epoch": 2.701570680628272,
"grad_norm": 0.55078125,
"learning_rate": 2.120141342756184e-06,
"loss": 1.8508,
"step": 259
},
{
"epoch": 2.712041884816754,
"grad_norm": 0.6015625,
"learning_rate": 2.0494699646643113e-06,
"loss": 1.9217,
"step": 260
},
{
"epoch": 2.7225130890052354,
"grad_norm": 0.53515625,
"learning_rate": 1.978798586572438e-06,
"loss": 1.8905,
"step": 261
},
{
"epoch": 2.7329842931937174,
"grad_norm": 0.55859375,
"learning_rate": 1.9081272084805654e-06,
"loss": 1.9174,
"step": 262
},
{
"epoch": 2.743455497382199,
"grad_norm": 0.5625,
"learning_rate": 1.8374558303886927e-06,
"loss": 1.9075,
"step": 263
},
{
"epoch": 2.7539267015706805,
"grad_norm": 0.5390625,
"learning_rate": 1.76678445229682e-06,
"loss": 1.9001,
"step": 264
},
{
"epoch": 2.7643979057591626,
"grad_norm": 0.53515625,
"learning_rate": 1.6961130742049472e-06,
"loss": 1.8356,
"step": 265
},
{
"epoch": 2.774869109947644,
"grad_norm": 0.55859375,
"learning_rate": 1.6254416961130742e-06,
"loss": 1.9398,
"step": 266
},
{
"epoch": 2.7853403141361257,
"grad_norm": 0.54296875,
"learning_rate": 1.5547703180212015e-06,
"loss": 1.8619,
"step": 267
},
{
"epoch": 2.7958115183246073,
"grad_norm": 0.55078125,
"learning_rate": 1.4840989399293287e-06,
"loss": 1.9078,
"step": 268
},
{
"epoch": 2.806282722513089,
"grad_norm": 0.58203125,
"learning_rate": 1.413427561837456e-06,
"loss": 1.9356,
"step": 269
},
{
"epoch": 2.816753926701571,
"grad_norm": 0.5546875,
"learning_rate": 1.3427561837455832e-06,
"loss": 1.8864,
"step": 270
},
{
"epoch": 2.8272251308900525,
"grad_norm": 0.60546875,
"learning_rate": 1.2720848056537103e-06,
"loss": 2.0435,
"step": 271
},
{
"epoch": 2.837696335078534,
"grad_norm": 0.54296875,
"learning_rate": 1.2014134275618375e-06,
"loss": 1.8981,
"step": 272
},
{
"epoch": 2.8481675392670156,
"grad_norm": 0.59375,
"learning_rate": 1.1307420494699648e-06,
"loss": 1.9489,
"step": 273
},
{
"epoch": 2.858638743455497,
"grad_norm": 0.54296875,
"learning_rate": 1.060070671378092e-06,
"loss": 1.8501,
"step": 274
},
{
"epoch": 2.869109947643979,
"grad_norm": 0.5390625,
"learning_rate": 9.89399293286219e-07,
"loss": 1.9134,
"step": 275
},
{
"epoch": 2.8795811518324608,
"grad_norm": 0.5625,
"learning_rate": 9.187279151943463e-07,
"loss": 1.9059,
"step": 276
},
{
"epoch": 2.8900523560209423,
"grad_norm": 0.5234375,
"learning_rate": 8.480565371024736e-07,
"loss": 1.8215,
"step": 277
},
{
"epoch": 2.900523560209424,
"grad_norm": 0.55859375,
"learning_rate": 7.773851590106007e-07,
"loss": 1.9826,
"step": 278
},
{
"epoch": 2.9109947643979055,
"grad_norm": 0.5390625,
"learning_rate": 7.06713780918728e-07,
"loss": 1.8633,
"step": 279
},
{
"epoch": 2.9214659685863875,
"grad_norm": 0.55078125,
"learning_rate": 6.360424028268551e-07,
"loss": 1.9494,
"step": 280
},
{
"epoch": 2.931937172774869,
"grad_norm": 0.55078125,
"learning_rate": 5.653710247349824e-07,
"loss": 1.8929,
"step": 281
},
{
"epoch": 2.9424083769633507,
"grad_norm": 0.52734375,
"learning_rate": 4.946996466431095e-07,
"loss": 1.8427,
"step": 282
},
{
"epoch": 2.9528795811518327,
"grad_norm": 0.52734375,
"learning_rate": 4.240282685512368e-07,
"loss": 1.9098,
"step": 283
},
{
"epoch": 2.9633507853403143,
"grad_norm": 0.54296875,
"learning_rate": 3.53356890459364e-07,
"loss": 1.8163,
"step": 284
},
{
"epoch": 2.973821989528796,
"grad_norm": 0.59375,
"learning_rate": 2.826855123674912e-07,
"loss": 1.9398,
"step": 285
},
{
"epoch": 2.9842931937172774,
"grad_norm": 0.53515625,
"learning_rate": 2.120141342756184e-07,
"loss": 1.8855,
"step": 286
},
{
"epoch": 2.994764397905759,
"grad_norm": 0.578125,
"learning_rate": 1.413427561837456e-07,
"loss": 1.9103,
"step": 287
},
{
"epoch": 3.0,
"grad_norm": 0.80078125,
"learning_rate": 7.06713780918728e-08,
"loss": 1.7957,
"step": 288
}
],
"logging_steps": 1,
"max_steps": 288,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.31738818434734e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}