Model: longtermrisk/Qwen3-4B-Base-ftjob-0511c5edc14e-ftjob-c816ae862a4e Source: Original Platform
2069 lines
47 KiB
JSON
2069 lines
47 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 288,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.010471204188481676,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.8282,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.020942408376963352,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 1.8966,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.031413612565445025,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 1.9458,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.041884816753926704,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 1.8837,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.05235602094240838,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.6000000000000003e-05,
|
|
"loss": 1.9112,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.06282722513089005,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 2e-05,
|
|
"loss": 1.8852,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.07329842931937172,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.992932862190813e-05,
|
|
"loss": 1.9405,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.08376963350785341,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 1.9858657243816254e-05,
|
|
"loss": 1.9264,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.09424083769633508,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.9787985865724383e-05,
|
|
"loss": 1.9105,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.10471204188481675,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.971731448763251e-05,
|
|
"loss": 1.9066,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.11518324607329843,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 1.964664310954064e-05,
|
|
"loss": 1.8852,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.1256544502617801,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.9575971731448763e-05,
|
|
"loss": 1.8392,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.13612565445026178,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.950530035335689e-05,
|
|
"loss": 1.9161,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.14659685863874344,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 1.943462897526502e-05,
|
|
"loss": 1.9959,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.15706806282722513,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.9363957597173148e-05,
|
|
"loss": 1.8876,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.16753926701570682,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.9293286219081272e-05,
|
|
"loss": 1.9128,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.17801047120418848,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.92226148409894e-05,
|
|
"loss": 1.88,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.18848167539267016,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.915194346289753e-05,
|
|
"loss": 1.8278,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.19895287958115182,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.9081272084805657e-05,
|
|
"loss": 1.857,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.2094240837696335,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.901060070671378e-05,
|
|
"loss": 1.9399,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.2198952879581152,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.893992932862191e-05,
|
|
"loss": 1.9213,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.23036649214659685,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 1.8869257950530038e-05,
|
|
"loss": 1.8672,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.24083769633507854,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.8798586572438166e-05,
|
|
"loss": 1.8416,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.2513089005235602,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.872791519434629e-05,
|
|
"loss": 1.8805,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.2617801047120419,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.865724381625442e-05,
|
|
"loss": 1.9378,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.27225130890052357,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.8586572438162547e-05,
|
|
"loss": 1.9069,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.28272251308900526,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 1.8515901060070675e-05,
|
|
"loss": 1.9602,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.2931937172774869,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.84452296819788e-05,
|
|
"loss": 1.8986,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.3036649214659686,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 1.8374558303886928e-05,
|
|
"loss": 1.936,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.31413612565445026,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.8303886925795052e-05,
|
|
"loss": 1.9116,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.32460732984293195,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.8233215547703184e-05,
|
|
"loss": 1.8984,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.33507853403141363,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.816254416961131e-05,
|
|
"loss": 1.7889,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.34554973821989526,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.8091872791519437e-05,
|
|
"loss": 1.9241,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.35602094240837695,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.802120141342756e-05,
|
|
"loss": 1.9547,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.36649214659685864,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.7950530035335693e-05,
|
|
"loss": 1.9106,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.3769633507853403,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 1.7879858657243818e-05,
|
|
"loss": 1.9307,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.387434554973822,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.7809187279151946e-05,
|
|
"loss": 1.9254,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.39790575916230364,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.773851590106007e-05,
|
|
"loss": 1.9153,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.4083769633507853,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.76678445229682e-05,
|
|
"loss": 1.896,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.418848167539267,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.7597173144876327e-05,
|
|
"loss": 1.9148,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.4293193717277487,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.7526501766784455e-05,
|
|
"loss": 1.9218,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.4397905759162304,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.745583038869258e-05,
|
|
"loss": 1.902,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.450261780104712,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.7385159010600707e-05,
|
|
"loss": 1.9088,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.4607329842931937,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 1.7314487632508836e-05,
|
|
"loss": 1.8801,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.4712041884816754,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 1.7243816254416964e-05,
|
|
"loss": 1.8912,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.4816753926701571,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 1.717314487632509e-05,
|
|
"loss": 1.9125,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.49214659685863876,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.7102473498233216e-05,
|
|
"loss": 1.8652,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.5026178010471204,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.7031802120141345e-05,
|
|
"loss": 1.9374,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.5130890052356021,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 1.6961130742049473e-05,
|
|
"loss": 1.8739,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.5235602094240838,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.6890459363957597e-05,
|
|
"loss": 1.8936,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.5340314136125655,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.6819787985865726e-05,
|
|
"loss": 1.8582,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.5445026178010471,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.6749116607773854e-05,
|
|
"loss": 1.8879,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.5549738219895288,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.6678445229681982e-05,
|
|
"loss": 1.8416,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.5654450261780105,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.6607773851590106e-05,
|
|
"loss": 1.942,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.5759162303664922,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.6537102473498235e-05,
|
|
"loss": 1.8874,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.5863874345549738,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.6466431095406363e-05,
|
|
"loss": 1.8773,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.5968586387434555,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.639575971731449e-05,
|
|
"loss": 1.8509,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.6073298429319371,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 1.6325088339222615e-05,
|
|
"loss": 1.941,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.6178010471204188,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.6254416961130744e-05,
|
|
"loss": 1.8318,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.6282722513089005,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 1.618374558303887e-05,
|
|
"loss": 1.888,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.6387434554973822,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 1.6113074204946996e-05,
|
|
"loss": 1.8677,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.6492146596858639,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.6042402826855124e-05,
|
|
"loss": 1.8781,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.6596858638743456,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.5971731448763253e-05,
|
|
"loss": 1.8952,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.6701570680628273,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.590106007067138e-05,
|
|
"loss": 1.9261,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.680628272251309,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.5830388692579505e-05,
|
|
"loss": 1.9298,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.6910994764397905,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.5759717314487633e-05,
|
|
"loss": 1.9557,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.7015706806282722,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.568904593639576e-05,
|
|
"loss": 1.8957,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.7120418848167539,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.561837455830389e-05,
|
|
"loss": 1.9286,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.7225130890052356,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.5547703180212014e-05,
|
|
"loss": 1.8264,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.7329842931937173,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.5477031802120142e-05,
|
|
"loss": 1.8479,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.743455497382199,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.540636042402827e-05,
|
|
"loss": 1.8048,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.7539267015706806,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.53356890459364e-05,
|
|
"loss": 1.9449,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.7643979057591623,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.5265017667844523e-05,
|
|
"loss": 1.8642,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.774869109947644,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 1.519434628975265e-05,
|
|
"loss": 1.9162,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.7853403141361257,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.512367491166078e-05,
|
|
"loss": 1.8409,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.7958115183246073,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.5053003533568906e-05,
|
|
"loss": 1.8648,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.806282722513089,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.4982332155477032e-05,
|
|
"loss": 1.8513,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.8167539267015707,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 1.4911660777385159e-05,
|
|
"loss": 1.9476,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.8272251308900523,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 1.4840989399293289e-05,
|
|
"loss": 1.8946,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.837696335078534,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 1.4770318021201415e-05,
|
|
"loss": 1.8031,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.8481675392670157,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 1.4699646643109541e-05,
|
|
"loss": 1.9714,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.8586387434554974,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.4628975265017668e-05,
|
|
"loss": 1.9363,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.8691099476439791,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.4558303886925796e-05,
|
|
"loss": 1.7736,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.8795811518324608,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 1.4487632508833924e-05,
|
|
"loss": 1.8937,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.8900523560209425,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 1.441696113074205e-05,
|
|
"loss": 1.9246,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.900523560209424,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.4346289752650177e-05,
|
|
"loss": 1.8658,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.9109947643979057,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.4275618374558305e-05,
|
|
"loss": 1.8971,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.9214659685863874,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.4204946996466433e-05,
|
|
"loss": 1.8512,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.9319371727748691,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 1.413427561837456e-05,
|
|
"loss": 1.8469,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.9424083769633508,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.4063604240282686e-05,
|
|
"loss": 1.9065,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.9528795811518325,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.3992932862190814e-05,
|
|
"loss": 1.8924,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.9633507853403142,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.392226148409894e-05,
|
|
"loss": 1.9272,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.9738219895287958,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 1.3851590106007068e-05,
|
|
"loss": 1.9156,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.9842931937172775,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.3780918727915195e-05,
|
|
"loss": 1.9006,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.9947643979057592,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.3710247349823323e-05,
|
|
"loss": 1.8934,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.6328125,
|
|
"learning_rate": 1.363957597173145e-05,
|
|
"loss": 1.7664,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 1.9242948293685913,
|
|
"eval_model_preparation_time": 0.0172,
|
|
"eval_runtime": 17.0575,
|
|
"eval_samples_per_second": 9.966,
|
|
"eval_steps_per_second": 4.983,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 1.0104712041884816,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 1.3568904593639577e-05,
|
|
"loss": 1.8107,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 1.0209424083769634,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.3498233215547704e-05,
|
|
"loss": 1.8922,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 1.031413612565445,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 1.3427561837455832e-05,
|
|
"loss": 1.8867,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 1.0418848167539267,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.3356890459363958e-05,
|
|
"loss": 1.9454,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.0523560209424083,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.3286219081272085e-05,
|
|
"loss": 1.8757,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 1.0628272251308901,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.3215547703180213e-05,
|
|
"loss": 1.9068,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 1.0732984293193717,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 1.3144876325088341e-05,
|
|
"loss": 1.8562,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 1.0837696335078535,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.3074204946996467e-05,
|
|
"loss": 1.8455,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 1.094240837696335,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 1.3003533568904594e-05,
|
|
"loss": 1.8332,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.1047120418848166,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 1.2932862190812724e-05,
|
|
"loss": 1.8061,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 1.1151832460732984,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 1.286219081272085e-05,
|
|
"loss": 1.8123,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 1.12565445026178,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 1.2791519434628976e-05,
|
|
"loss": 1.8718,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 1.1361256544502618,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 1.2720848056537103e-05,
|
|
"loss": 1.9013,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 1.1465968586387434,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.2650176678445233e-05,
|
|
"loss": 1.8208,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.1570680628272252,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.2579505300353359e-05,
|
|
"loss": 1.8674,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 1.1675392670157068,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.2508833922261485e-05,
|
|
"loss": 1.8759,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 1.1780104712041886,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.2438162544169612e-05,
|
|
"loss": 1.8442,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 1.1884816753926701,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 1.2367491166077738e-05,
|
|
"loss": 1.8808,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 1.1989528795811517,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 1.2296819787985868e-05,
|
|
"loss": 1.9081,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.2094240837696335,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 1.2226148409893994e-05,
|
|
"loss": 1.8863,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 1.2198952879581153,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 1.2155477031802121e-05,
|
|
"loss": 1.8366,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 1.2303664921465969,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 1.2084805653710247e-05,
|
|
"loss": 1.7759,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 1.2408376963350785,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 1.2014134275618377e-05,
|
|
"loss": 1.7929,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 1.2513089005235603,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 1.1943462897526503e-05,
|
|
"loss": 1.8546,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.2617801047120418,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 1.187279151943463e-05,
|
|
"loss": 1.855,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 1.2722513089005236,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 1.1802120141342756e-05,
|
|
"loss": 1.904,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 1.2827225130890052,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 1.1731448763250883e-05,
|
|
"loss": 1.8709,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 1.2931937172774868,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 1.1660777385159012e-05,
|
|
"loss": 1.8564,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 1.3036649214659686,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 1.1590106007067139e-05,
|
|
"loss": 1.8526,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.3141361256544504,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.1519434628975265e-05,
|
|
"loss": 1.851,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 1.324607329842932,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 1.1448763250883392e-05,
|
|
"loss": 1.9299,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 1.3350785340314135,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 1.1378091872791521e-05,
|
|
"loss": 1.851,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 1.3455497382198953,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 1.1307420494699648e-05,
|
|
"loss": 1.8138,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 1.356020942408377,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 1.1236749116607774e-05,
|
|
"loss": 1.7448,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 1.3664921465968587,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 1.11660777385159e-05,
|
|
"loss": 1.8666,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 1.3769633507853403,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 1.1095406360424029e-05,
|
|
"loss": 1.8786,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 1.387434554973822,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 1.1024734982332157e-05,
|
|
"loss": 1.8729,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 1.3979057591623036,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 1.0954063604240283e-05,
|
|
"loss": 1.8079,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 1.4083769633507854,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 1.088339222614841e-05,
|
|
"loss": 1.8937,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 1.418848167539267,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 1.0812720848056538e-05,
|
|
"loss": 1.8155,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 1.4293193717277486,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 1.0742049469964666e-05,
|
|
"loss": 1.8918,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 1.4397905759162304,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 1.0671378091872792e-05,
|
|
"loss": 1.8187,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 1.450261780104712,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 1.0600706713780919e-05,
|
|
"loss": 1.8176,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 1.4607329842931938,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 1.0530035335689047e-05,
|
|
"loss": 1.8434,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.4712041884816753,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 1.0459363957597175e-05,
|
|
"loss": 1.8584,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 1.4816753926701571,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 1.0388692579505301e-05,
|
|
"loss": 1.7339,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 1.4921465968586387,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 1.0318021201413428e-05,
|
|
"loss": 1.8484,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 1.5026178010471205,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 1.0247349823321556e-05,
|
|
"loss": 1.8373,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 1.513089005235602,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 1.0176678445229682e-05,
|
|
"loss": 1.8109,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 1.5235602094240837,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 1.010600706713781e-05,
|
|
"loss": 1.8238,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.5340314136125655,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 1.0035335689045937e-05,
|
|
"loss": 1.853,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.5445026178010473,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 9.964664310954065e-06,
|
|
"loss": 1.8,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.5549738219895288,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 9.893992932862191e-06,
|
|
"loss": 1.9207,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.5654450261780104,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 9.82332155477032e-06,
|
|
"loss": 1.8191,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.5759162303664922,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 9.752650176678446e-06,
|
|
"loss": 1.8445,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.5863874345549738,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 9.681978798586574e-06,
|
|
"loss": 1.8418,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.5968586387434556,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 9.6113074204947e-06,
|
|
"loss": 1.8399,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.6073298429319371,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 9.540636042402828e-06,
|
|
"loss": 1.8557,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.6178010471204187,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 9.469964664310955e-06,
|
|
"loss": 1.8562,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.6282722513089005,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 9.399293286219083e-06,
|
|
"loss": 1.8814,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.6387434554973823,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 9.32862190812721e-06,
|
|
"loss": 1.8363,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 1.649214659685864,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 9.257950530035337e-06,
|
|
"loss": 1.8691,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 1.6596858638743455,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 9.187279151943464e-06,
|
|
"loss": 1.8112,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 1.6701570680628273,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 9.116607773851592e-06,
|
|
"loss": 1.8547,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.680628272251309,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 9.045936395759718e-06,
|
|
"loss": 1.8688,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 1.6910994764397906,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 8.975265017667846e-06,
|
|
"loss": 1.8313,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 1.7015706806282722,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 8.904593639575973e-06,
|
|
"loss": 1.912,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 1.7120418848167538,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 8.8339222614841e-06,
|
|
"loss": 1.7744,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 1.7225130890052356,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 8.763250883392227e-06,
|
|
"loss": 1.8784,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 1.7329842931937174,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 8.692579505300354e-06,
|
|
"loss": 1.8648,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 1.743455497382199,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 8.621908127208482e-06,
|
|
"loss": 1.83,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 1.7539267015706805,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 8.551236749116608e-06,
|
|
"loss": 1.8801,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 1.7643979057591623,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 8.480565371024736e-06,
|
|
"loss": 1.8367,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 1.7748691099476441,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 8.409893992932863e-06,
|
|
"loss": 1.8364,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 1.7853403141361257,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 8.339222614840991e-06,
|
|
"loss": 1.8937,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 1.7958115183246073,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 8.268551236749117e-06,
|
|
"loss": 1.8488,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 1.8062827225130889,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 8.197879858657245e-06,
|
|
"loss": 1.8361,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 1.8167539267015707,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 8.127208480565372e-06,
|
|
"loss": 1.8572,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 1.8272251308900525,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 8.056537102473498e-06,
|
|
"loss": 1.7914,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 1.837696335078534,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 7.985865724381626e-06,
|
|
"loss": 1.8523,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 1.8481675392670156,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 7.915194346289753e-06,
|
|
"loss": 1.8058,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 1.8586387434554974,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 7.84452296819788e-06,
|
|
"loss": 1.8991,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 1.8691099476439792,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 7.773851590106007e-06,
|
|
"loss": 1.8085,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 1.8795811518324608,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 7.703180212014135e-06,
|
|
"loss": 1.8277,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.8900523560209423,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 7.632508833922262e-06,
|
|
"loss": 1.8078,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 1.900523560209424,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 7.56183745583039e-06,
|
|
"loss": 1.8549,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 1.9109947643979057,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 7.491166077738516e-06,
|
|
"loss": 1.8756,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 1.9214659685863875,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 7.420494699646644e-06,
|
|
"loss": 1.8747,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 1.931937172774869,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 7.349823321554771e-06,
|
|
"loss": 1.828,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 1.9424083769633507,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 7.279151943462898e-06,
|
|
"loss": 1.8193,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 1.9528795811518325,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 7.208480565371025e-06,
|
|
"loss": 1.8753,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 1.9633507853403143,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 7.1378091872791525e-06,
|
|
"loss": 1.9137,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 1.9738219895287958,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 7.06713780918728e-06,
|
|
"loss": 1.9182,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 1.9842931937172774,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 6.996466431095407e-06,
|
|
"loss": 1.8535,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 1.9947643979057592,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 6.925795053003534e-06,
|
|
"loss": 1.8572,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 6.8551236749116615e-06,
|
|
"loss": 1.8961,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 1.905681848526001,
|
|
"eval_model_preparation_time": 0.0172,
|
|
"eval_runtime": 17.103,
|
|
"eval_samples_per_second": 9.94,
|
|
"eval_steps_per_second": 4.97,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 2.0104712041884816,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 6.784452296819789e-06,
|
|
"loss": 1.8013,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 2.020942408376963,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 6.713780918727916e-06,
|
|
"loss": 1.8229,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 2.031413612565445,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 6.643109540636042e-06,
|
|
"loss": 1.8868,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 2.0418848167539267,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 6.5724381625441705e-06,
|
|
"loss": 1.8231,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 2.0523560209424083,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 6.501766784452297e-06,
|
|
"loss": 1.8609,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 2.06282722513089,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 6.431095406360425e-06,
|
|
"loss": 1.7994,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 2.073298429319372,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 6.360424028268551e-06,
|
|
"loss": 1.8234,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 2.0837696335078535,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 6.2897526501766795e-06,
|
|
"loss": 1.802,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 2.094240837696335,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 6.219081272084806e-06,
|
|
"loss": 1.7988,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 2.1047120418848166,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 6.148409893992934e-06,
|
|
"loss": 1.8223,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 2.115183246073298,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 6.0777385159010604e-06,
|
|
"loss": 1.7838,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 2.1256544502617802,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 6.0070671378091885e-06,
|
|
"loss": 1.9178,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 2.136125654450262,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 5.936395759717315e-06,
|
|
"loss": 1.7947,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 2.1465968586387434,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 5.865724381625441e-06,
|
|
"loss": 1.802,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 2.157068062827225,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 5.7950530035335694e-06,
|
|
"loss": 1.8402,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 2.167539267015707,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 5.724381625441696e-06,
|
|
"loss": 1.7997,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 2.1780104712041886,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 5.653710247349824e-06,
|
|
"loss": 1.8513,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 2.18848167539267,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 5.58303886925795e-06,
|
|
"loss": 1.8858,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 2.1989528795811517,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 5.5123674911660785e-06,
|
|
"loss": 1.7843,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 2.2094240837696333,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 5.441696113074205e-06,
|
|
"loss": 1.8,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 2.2198952879581153,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 5.371024734982333e-06,
|
|
"loss": 1.9113,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 2.230366492146597,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 5.300353356890459e-06,
|
|
"loss": 1.7547,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 2.2408376963350785,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 5.2296819787985875e-06,
|
|
"loss": 1.7787,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 2.25130890052356,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 5.159010600706714e-06,
|
|
"loss": 1.8752,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 2.261780104712042,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 5.088339222614841e-06,
|
|
"loss": 1.8085,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 2.2722513089005236,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 5.017667844522968e-06,
|
|
"loss": 1.8047,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 2.282722513089005,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 4.946996466431096e-06,
|
|
"loss": 1.797,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 2.2931937172774868,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 4.876325088339223e-06,
|
|
"loss": 1.7975,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 2.303664921465969,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 4.80565371024735e-06,
|
|
"loss": 1.8178,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 2.3141361256544504,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 4.734982332155477e-06,
|
|
"loss": 1.8738,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 2.324607329842932,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 4.664310954063605e-06,
|
|
"loss": 1.7964,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 2.3350785340314135,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 4.593639575971732e-06,
|
|
"loss": 1.7519,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 2.345549738219895,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 4.522968197879859e-06,
|
|
"loss": 1.8255,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 2.356020942408377,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 4.452296819787986e-06,
|
|
"loss": 1.8518,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 2.3664921465968587,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 4.381625441696114e-06,
|
|
"loss": 1.8612,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 2.3769633507853403,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 4.310954063604241e-06,
|
|
"loss": 1.8026,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 2.387434554973822,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 4.240282685512368e-06,
|
|
"loss": 1.8072,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 2.3979057591623034,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 4.1696113074204954e-06,
|
|
"loss": 1.7921,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 2.4083769633507854,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 4.098939929328623e-06,
|
|
"loss": 1.8991,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 2.418848167539267,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 4.028268551236749e-06,
|
|
"loss": 1.855,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 2.4293193717277486,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 3.957597173144876e-06,
|
|
"loss": 1.7872,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 2.4397905759162306,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 3.886925795053004e-06,
|
|
"loss": 1.8558,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 2.450261780104712,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 3.816254416961131e-06,
|
|
"loss": 1.8594,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 2.4607329842931938,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 3.745583038869258e-06,
|
|
"loss": 1.7573,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 2.4712041884816753,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 3.6749116607773854e-06,
|
|
"loss": 1.8325,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 2.481675392670157,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 3.6042402826855126e-06,
|
|
"loss": 1.7963,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 2.492146596858639,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 3.53356890459364e-06,
|
|
"loss": 1.7997,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 2.5026178010471205,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 3.462897526501767e-06,
|
|
"loss": 1.8311,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 2.513089005235602,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 3.3922261484098944e-06,
|
|
"loss": 1.8321,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 2.5235602094240837,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 3.321554770318021e-06,
|
|
"loss": 1.7678,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 2.5340314136125652,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 3.2508833922261485e-06,
|
|
"loss": 1.7942,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 2.5445026178010473,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 3.1802120141342757e-06,
|
|
"loss": 1.854,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 2.554973821989529,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 3.109540636042403e-06,
|
|
"loss": 1.8134,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 2.5654450261780104,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 3.0388692579505302e-06,
|
|
"loss": 1.9039,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 2.5759162303664924,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 2.9681978798586575e-06,
|
|
"loss": 1.8558,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 2.5863874345549736,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 2.8975265017667847e-06,
|
|
"loss": 1.8358,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 2.5968586387434556,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 2.826855123674912e-06,
|
|
"loss": 1.8268,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 2.607329842931937,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 2.7561837455830392e-06,
|
|
"loss": 1.8387,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 2.6178010471204187,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 2.6855123674911665e-06,
|
|
"loss": 1.8029,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 2.6282722513089007,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 2.6148409893992937e-06,
|
|
"loss": 1.7407,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 2.6387434554973823,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 2.5441696113074206e-06,
|
|
"loss": 1.8378,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 2.649214659685864,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 2.473498233215548e-06,
|
|
"loss": 1.9012,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 2.6596858638743455,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 2.402826855123675e-06,
|
|
"loss": 1.844,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 2.670157068062827,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 2.3321554770318023e-06,
|
|
"loss": 1.8986,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 2.680628272251309,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 2.2614840989399296e-06,
|
|
"loss": 1.7605,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 2.6910994764397906,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 2.190812720848057e-06,
|
|
"loss": 1.8786,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 2.701570680628272,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 2.120141342756184e-06,
|
|
"loss": 1.7735,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 2.712041884816754,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 2.0494699646643113e-06,
|
|
"loss": 1.8454,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 2.7225130890052354,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 1.978798586572438e-06,
|
|
"loss": 1.8158,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 2.7329842931937174,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 1.9081272084805654e-06,
|
|
"loss": 1.8388,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 2.743455497382199,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 1.8374558303886927e-06,
|
|
"loss": 1.8308,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 2.7539267015706805,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 1.76678445229682e-06,
|
|
"loss": 1.8306,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 2.7643979057591626,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 1.6961130742049472e-06,
|
|
"loss": 1.764,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 2.774869109947644,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 1.6254416961130742e-06,
|
|
"loss": 1.8624,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 2.7853403141361257,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 1.5547703180212015e-06,
|
|
"loss": 1.7886,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 2.7958115183246073,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 1.4840989399293287e-06,
|
|
"loss": 1.8308,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 2.806282722513089,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 1.413427561837456e-06,
|
|
"loss": 1.861,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 2.816753926701571,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 1.3427561837455832e-06,
|
|
"loss": 1.812,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 2.8272251308900525,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 1.2720848056537103e-06,
|
|
"loss": 1.9705,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 2.837696335078534,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 1.2014134275618375e-06,
|
|
"loss": 1.823,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 2.8481675392670156,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 1.1307420494699648e-06,
|
|
"loss": 1.8767,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 2.858638743455497,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 1.060070671378092e-06,
|
|
"loss": 1.7743,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 2.869109947643979,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 9.89399293286219e-07,
|
|
"loss": 1.8407,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 2.8795811518324608,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 9.187279151943463e-07,
|
|
"loss": 1.8357,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 2.8900523560209423,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 8.480565371024736e-07,
|
|
"loss": 1.7537,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 2.900523560209424,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 7.773851590106007e-07,
|
|
"loss": 1.9056,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 2.9109947643979055,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 7.06713780918728e-07,
|
|
"loss": 1.7869,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 2.9214659685863875,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 6.360424028268551e-07,
|
|
"loss": 1.8753,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 2.931937172774869,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 5.653710247349824e-07,
|
|
"loss": 1.8214,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 2.9424083769633507,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 4.946996466431095e-07,
|
|
"loss": 1.7654,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 2.9528795811518327,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 4.240282685512368e-07,
|
|
"loss": 1.8402,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 2.9633507853403143,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 3.53356890459364e-07,
|
|
"loss": 1.7427,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 2.973821989528796,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 2.826855123674912e-07,
|
|
"loss": 1.862,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 2.9842931937172774,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 2.120141342756184e-07,
|
|
"loss": 1.8144,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 2.994764397905759,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 1.413427561837456e-07,
|
|
"loss": 1.8367,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.70703125,
|
|
"learning_rate": 7.06713780918728e-08,
|
|
"loss": 1.7167,
|
|
"step": 288
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 288,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 5000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.31738818434734e+16,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|