2435 lines
59 KiB
JSON
2435 lines
59 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.23603461841070023,
|
|
"eval_steps": 500,
|
|
"global_step": 300,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0007867820613690008,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.7636,
|
|
"num_tokens": 381797.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.0015735641227380016,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 6.666666666666667e-07,
|
|
"loss": 0.7623,
|
|
"num_tokens": 837007.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.0023603461841070024,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 1.3333333333333334e-06,
|
|
"loss": 0.7581,
|
|
"num_tokens": 1282591.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.003147128245476003,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.7678,
|
|
"num_tokens": 1736983.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.003933910306845004,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 2.666666666666667e-06,
|
|
"loss": 0.748,
|
|
"num_tokens": 2211811.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.004720692368214005,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 0.7548,
|
|
"num_tokens": 2608382.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.0055074744295830055,
|
|
"grad_norm": 4.5625,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.7358,
|
|
"num_tokens": 3064838.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.006294256490952006,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 4.666666666666667e-06,
|
|
"loss": 0.704,
|
|
"num_tokens": 3465837.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.007081038552321007,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 5.333333333333334e-06,
|
|
"loss": 0.6919,
|
|
"num_tokens": 3900673.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.007867820613690008,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.6964,
|
|
"num_tokens": 4322216.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.00865460267505901,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 0.6865,
|
|
"num_tokens": 4763831.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.00944138473642801,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 7.333333333333333e-06,
|
|
"loss": 0.6674,
|
|
"num_tokens": 5292618.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.010228166797797011,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.6771,
|
|
"num_tokens": 5687036.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.011014948859166011,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 8.666666666666668e-06,
|
|
"loss": 0.6447,
|
|
"num_tokens": 6116911.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.011801730920535013,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 9.333333333333334e-06,
|
|
"loss": 0.6377,
|
|
"num_tokens": 6516802.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.012588512981904013,
|
|
"grad_norm": 0.7890625,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.635,
|
|
"num_tokens": 6954839.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.013375295043273014,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 9.999726606524545e-06,
|
|
"loss": 0.614,
|
|
"num_tokens": 7431706.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.014162077104642014,
|
|
"grad_norm": 0.72265625,
|
|
"learning_rate": 9.998906459317727e-06,
|
|
"loss": 0.638,
|
|
"num_tokens": 7873139.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.014948859166011016,
|
|
"grad_norm": 0.76171875,
|
|
"learning_rate": 9.997539658034168e-06,
|
|
"loss": 0.6381,
|
|
"num_tokens": 8336156.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.015735641227380016,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 9.995626368751447e-06,
|
|
"loss": 0.6165,
|
|
"num_tokens": 8809699.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.016522423288749016,
|
|
"grad_norm": 0.64453125,
|
|
"learning_rate": 9.993166823949924e-06,
|
|
"loss": 0.5833,
|
|
"num_tokens": 9276482.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.01730920535011802,
|
|
"grad_norm": 0.76171875,
|
|
"learning_rate": 9.990161322484486e-06,
|
|
"loss": 0.6285,
|
|
"num_tokens": 9761551.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.01809598741148702,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 9.986610229548242e-06,
|
|
"loss": 0.607,
|
|
"num_tokens": 10189001.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.01888276947285602,
|
|
"grad_norm": 0.546875,
|
|
"learning_rate": 9.982513976628143e-06,
|
|
"loss": 0.6078,
|
|
"num_tokens": 10643150.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.01966955153422502,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 9.977873061452551e-06,
|
|
"loss": 0.6149,
|
|
"num_tokens": 11078815.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.020456333595594022,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 9.972688047930773e-06,
|
|
"loss": 0.6132,
|
|
"num_tokens": 11527719.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.021243115656963022,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 9.966959566084523e-06,
|
|
"loss": 0.5965,
|
|
"num_tokens": 12019704.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.022029897718332022,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 9.960688311971389e-06,
|
|
"loss": 0.5971,
|
|
"num_tokens": 12463410.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.022816679779701022,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.953875047600236e-06,
|
|
"loss": 0.5854,
|
|
"num_tokens": 12905537.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.023603461841070025,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.946520600838634e-06,
|
|
"loss": 0.5803,
|
|
"num_tokens": 13354614.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.024390243902439025,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.938625865312252e-06,
|
|
"loss": 0.5919,
|
|
"num_tokens": 13770618.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.025177025963808025,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 9.930191800296282e-06,
|
|
"loss": 0.5905,
|
|
"num_tokens": 14215259.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.025963808025177025,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 9.921219430598881e-06,
|
|
"loss": 0.5701,
|
|
"num_tokens": 14649720.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.02675059008654603,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.911709846436643e-06,
|
|
"loss": 0.5838,
|
|
"num_tokens": 15030456.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.02753737214791503,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 9.901664203302126e-06,
|
|
"loss": 0.593,
|
|
"num_tokens": 15502604.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.02832415420928403,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 9.89108372182346e-06,
|
|
"loss": 0.581,
|
|
"num_tokens": 15967084.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.029110936270653028,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 9.879969687616026e-06,
|
|
"loss": 0.599,
|
|
"num_tokens": 16427896.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.02989771833202203,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 9.86832345112624e-06,
|
|
"loss": 0.5883,
|
|
"num_tokens": 16861474.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.03068450039339103,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 9.856146427467469e-06,
|
|
"loss": 0.5826,
|
|
"num_tokens": 17331485.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.03147128245476003,
|
|
"grad_norm": 0.341796875,
|
|
"learning_rate": 9.84344009624807e-06,
|
|
"loss": 0.5965,
|
|
"num_tokens": 17804516.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.03225806451612903,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 9.830206001391627e-06,
|
|
"loss": 0.5905,
|
|
"num_tokens": 18227140.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.03304484657749803,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 9.816445750949336e-06,
|
|
"loss": 0.5782,
|
|
"num_tokens": 18625691.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.03383162863886703,
|
|
"grad_norm": 0.35546875,
|
|
"learning_rate": 9.80216101690461e-06,
|
|
"loss": 0.5623,
|
|
"num_tokens": 18997781.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.03461841070023604,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 9.787353534969936e-06,
|
|
"loss": 0.6028,
|
|
"num_tokens": 19431031.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.03540519276160504,
|
|
"grad_norm": 0.31640625,
|
|
"learning_rate": 9.77202510437596e-06,
|
|
"loss": 0.59,
|
|
"num_tokens": 19904203.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.03619197482297404,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 9.756177587652857e-06,
|
|
"loss": 0.5801,
|
|
"num_tokens": 20363517.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.03697875688434304,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 9.739812910404045e-06,
|
|
"loss": 0.5792,
|
|
"num_tokens": 20827976.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.03776553894571204,
|
|
"grad_norm": 0.322265625,
|
|
"learning_rate": 9.722933061072185e-06,
|
|
"loss": 0.5831,
|
|
"num_tokens": 21250892.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.03855232100708104,
|
|
"grad_norm": 0.349609375,
|
|
"learning_rate": 9.705540090697576e-06,
|
|
"loss": 0.5789,
|
|
"num_tokens": 21650869.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.03933910306845004,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 9.687636112668933e-06,
|
|
"loss": 0.5961,
|
|
"num_tokens": 22076209.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.04012588512981904,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 9.669223302466609e-06,
|
|
"loss": 0.5895,
|
|
"num_tokens": 22470639.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.040912667191188044,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 9.650303897398232e-06,
|
|
"loss": 0.5857,
|
|
"num_tokens": 22930578.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.041699449252557044,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 9.630880196326874e-06,
|
|
"loss": 0.5821,
|
|
"num_tokens": 23369639.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.042486231313926044,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 9.610954559391704e-06,
|
|
"loss": 0.5868,
|
|
"num_tokens": 23836873.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.043273013375295044,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 9.590529407721232e-06,
|
|
"loss": 0.5755,
|
|
"num_tokens": 24277558.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.044059795436664044,
|
|
"grad_norm": 0.326171875,
|
|
"learning_rate": 9.5696072231391e-06,
|
|
"loss": 0.5816,
|
|
"num_tokens": 24662313.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.044846577498033044,
|
|
"grad_norm": 0.333984375,
|
|
"learning_rate": 9.548190547862532e-06,
|
|
"loss": 0.5677,
|
|
"num_tokens": 25128896.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.045633359559402044,
|
|
"grad_norm": 0.33984375,
|
|
"learning_rate": 9.526281984193437e-06,
|
|
"loss": 0.6084,
|
|
"num_tokens": 25535507.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.046420141620771044,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 9.503884194202195e-06,
|
|
"loss": 0.5585,
|
|
"num_tokens": 25950294.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.04720692368214005,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 9.480999899404207e-06,
|
|
"loss": 0.5812,
|
|
"num_tokens": 26394450.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.04799370574350905,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 9.4576318804292e-06,
|
|
"loss": 0.5978,
|
|
"num_tokens": 26895403.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.04878048780487805,
|
|
"grad_norm": 0.31640625,
|
|
"learning_rate": 9.433782976683366e-06,
|
|
"loss": 0.5836,
|
|
"num_tokens": 27334154.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.04956726986624705,
|
|
"grad_norm": 0.326171875,
|
|
"learning_rate": 9.409456086004336e-06,
|
|
"loss": 0.5956,
|
|
"num_tokens": 27755608.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.05035405192761605,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 9.384654164309083e-06,
|
|
"loss": 0.5797,
|
|
"num_tokens": 28204981.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.05114083398898505,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 9.359380225234752e-06,
|
|
"loss": 0.5252,
|
|
"num_tokens": 28587979.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.05192761605035405,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 9.333637339772472e-06,
|
|
"loss": 0.5811,
|
|
"num_tokens": 28974623.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.05271439811172305,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 9.30742863589421e-06,
|
|
"loss": 0.5713,
|
|
"num_tokens": 29397669.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.05350118017309206,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 9.280757298172696e-06,
|
|
"loss": 0.5666,
|
|
"num_tokens": 29838270.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.05428796223446106,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 9.253626567394466e-06,
|
|
"loss": 0.5659,
|
|
"num_tokens": 30321067.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.05507474429583006,
|
|
"grad_norm": 0.318359375,
|
|
"learning_rate": 9.226039740166091e-06,
|
|
"loss": 0.5729,
|
|
"num_tokens": 30751915.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.05586152635719906,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 9.198000168513604e-06,
|
|
"loss": 0.5932,
|
|
"num_tokens": 31282597.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.05664830841856806,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 9.169511259475202e-06,
|
|
"loss": 0.5736,
|
|
"num_tokens": 31733067.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.057435090479937057,
|
|
"grad_norm": 0.3046875,
|
|
"learning_rate": 9.140576474687263e-06,
|
|
"loss": 0.5712,
|
|
"num_tokens": 32130865.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.058221872541306056,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 9.111199329963735e-06,
|
|
"loss": 0.5755,
|
|
"num_tokens": 32548792.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.059008654602675056,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 9.081383394868924e-06,
|
|
"loss": 0.594,
|
|
"num_tokens": 32971975.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.05979543666404406,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 9.051132292283772e-06,
|
|
"loss": 0.5819,
|
|
"num_tokens": 33372811.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.06058221872541306,
|
|
"grad_norm": 0.322265625,
|
|
"learning_rate": 9.020449697965645e-06,
|
|
"loss": 0.5888,
|
|
"num_tokens": 33784941.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.06136900078678206,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 8.989339340101698e-06,
|
|
"loss": 0.5837,
|
|
"num_tokens": 34224127.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.06215578284815106,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 8.957804998855866e-06,
|
|
"loss": 0.5725,
|
|
"num_tokens": 34658551.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.06294256490952006,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 8.92585050590955e-06,
|
|
"loss": 0.5749,
|
|
"num_tokens": 35094129.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.06372934697088907,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 8.893479743996034e-06,
|
|
"loss": 0.5657,
|
|
"num_tokens": 35436410.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.06451612903225806,
|
|
"grad_norm": 0.322265625,
|
|
"learning_rate": 8.860696646428693e-06,
|
|
"loss": 0.5629,
|
|
"num_tokens": 35825073.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.06530291109362707,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 8.827505196623074e-06,
|
|
"loss": 0.5661,
|
|
"num_tokens": 36223791.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.06608969315499606,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 8.793909427612878e-06,
|
|
"loss": 0.5734,
|
|
"num_tokens": 36656241.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.06687647521636507,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 8.759913421559902e-06,
|
|
"loss": 0.5675,
|
|
"num_tokens": 37077658.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.06766325727773406,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 8.725521309258031e-06,
|
|
"loss": 0.5734,
|
|
"num_tokens": 37476774.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.06845003933910307,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 8.690737269631318e-06,
|
|
"loss": 0.5708,
|
|
"num_tokens": 37871004.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.06923682140047208,
|
|
"grad_norm": 0.287109375,
|
|
"learning_rate": 8.655565529226199e-06,
|
|
"loss": 0.5664,
|
|
"num_tokens": 38301817.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.07002360346184107,
|
|
"grad_norm": 0.318359375,
|
|
"learning_rate": 8.62001036169794e-06,
|
|
"loss": 0.5746,
|
|
"num_tokens": 38681899.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.07081038552321008,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 8.58407608729135e-06,
|
|
"loss": 0.5912,
|
|
"num_tokens": 39130089.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.07159716758457907,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 8.547767072315835e-06,
|
|
"loss": 0.5813,
|
|
"num_tokens": 39591218.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.07238394964594808,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 8.511087728614863e-06,
|
|
"loss": 0.574,
|
|
"num_tokens": 40032847.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.07317073170731707,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 8.474042513029876e-06,
|
|
"loss": 0.5803,
|
|
"num_tokens": 40473954.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.07395751376868608,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 8.43663592685876e-06,
|
|
"loss": 0.5779,
|
|
"num_tokens": 40900930.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.07474429583005507,
|
|
"grad_norm": 0.27734375,
|
|
"learning_rate": 8.39887251530889e-06,
|
|
"loss": 0.5835,
|
|
"num_tokens": 41368577.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.07553107789142408,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 8.360756866944858e-06,
|
|
"loss": 0.5785,
|
|
"num_tokens": 41767993.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.07631785995279308,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 8.322293613130917e-06,
|
|
"loss": 0.5932,
|
|
"num_tokens": 42197577.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.07710464201416208,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 8.283487427468244e-06,
|
|
"loss": 0.5848,
|
|
"num_tokens": 42649936.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.07789142407553108,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 8.244343025227041e-06,
|
|
"loss": 0.5812,
|
|
"num_tokens": 43084525.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.07867820613690008,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 8.204865162773613e-06,
|
|
"loss": 0.5629,
|
|
"num_tokens": 43587988.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.07946498819826908,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 8.165058636992411e-06,
|
|
"loss": 0.5931,
|
|
"num_tokens": 43968594.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.08025177025963807,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 8.12492828470318e-06,
|
|
"loss": 0.5556,
|
|
"num_tokens": 44454046.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.08103855232100708,
|
|
"grad_norm": 0.32421875,
|
|
"learning_rate": 8.084478982073247e-06,
|
|
"loss": 0.6005,
|
|
"num_tokens": 44928448.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.08182533438237609,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 8.043715644025025e-06,
|
|
"loss": 0.5685,
|
|
"num_tokens": 45405277.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.08261211644374508,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 8.002643223638803e-06,
|
|
"loss": 0.5731,
|
|
"num_tokens": 45870226.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.08339889850511409,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 7.961266711550922e-06,
|
|
"loss": 0.573,
|
|
"num_tokens": 46308121.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.08418568056648308,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 7.919591135347354e-06,
|
|
"loss": 0.5727,
|
|
"num_tokens": 46727971.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.08497246262785209,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 7.877621558952817e-06,
|
|
"loss": 0.5783,
|
|
"num_tokens": 47180249.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.08575924468922108,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.83536308201547e-06,
|
|
"loss": 0.5669,
|
|
"num_tokens": 47709483.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.08654602675059009,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 7.792820839287257e-06,
|
|
"loss": 0.5861,
|
|
"num_tokens": 48152246.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.08733280881195908,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 7.75e-06,
|
|
"loss": 0.5473,
|
|
"num_tokens": 48563870.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.08811959087332809,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 7.706905767237288e-06,
|
|
"loss": 0.5773,
|
|
"num_tokens": 49025023.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.0889063729346971,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 7.663543377302257e-06,
|
|
"loss": 0.582,
|
|
"num_tokens": 49467162.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.08969315499606609,
|
|
"grad_norm": 0.326171875,
|
|
"learning_rate": 7.6199180990813535e-06,
|
|
"loss": 0.5933,
|
|
"num_tokens": 49856067.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.0904799370574351,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 7.576035233404097e-06,
|
|
"loss": 0.5623,
|
|
"num_tokens": 50334050.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.09126671911880409,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 7.531900112399004e-06,
|
|
"loss": 0.5649,
|
|
"num_tokens": 50811750.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.0920535011801731,
|
|
"grad_norm": 0.267578125,
|
|
"learning_rate": 7.487518098845684e-06,
|
|
"loss": 0.5709,
|
|
"num_tokens": 51318751.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.09284028324154209,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 7.442894585523218e-06,
|
|
"loss": 0.5651,
|
|
"num_tokens": 51726918.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.0936270653029111,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 7.398034994554895e-06,
|
|
"loss": 0.5844,
|
|
"num_tokens": 52184995.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.0944138473642801,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 7.352944776749374e-06,
|
|
"loss": 0.573,
|
|
"num_tokens": 52641746.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.0952006294256491,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 7.307629410938364e-06,
|
|
"loss": 0.5648,
|
|
"num_tokens": 53083720.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.0959874114870181,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 7.262094403310912e-06,
|
|
"loss": 0.5709,
|
|
"num_tokens": 53509360.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.0967741935483871,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 7.216345286744349e-06,
|
|
"loss": 0.5748,
|
|
"num_tokens": 53994487.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.0975609756097561,
|
|
"grad_norm": 0.34375,
|
|
"learning_rate": 7.1703876201319935e-06,
|
|
"loss": 0.5866,
|
|
"num_tokens": 54432027.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.0983477576711251,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 7.124226987707717e-06,
|
|
"loss": 0.6052,
|
|
"num_tokens": 54867083.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.0991345397324941,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 7.0778689983673955e-06,
|
|
"loss": 0.5641,
|
|
"num_tokens": 55248221.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.0999213217938631,
|
|
"grad_norm": 0.27734375,
|
|
"learning_rate": 7.031319284987395e-06,
|
|
"loss": 0.5822,
|
|
"num_tokens": 55746694.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.1007081038552321,
|
|
"grad_norm": 0.267578125,
|
|
"learning_rate": 6.984583503740123e-06,
|
|
"loss": 0.5488,
|
|
"num_tokens": 56213653.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.10149488591660111,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 6.937667333406767e-06,
|
|
"loss": 0.5643,
|
|
"num_tokens": 56638790.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.1022816679779701,
|
|
"grad_norm": 0.27734375,
|
|
"learning_rate": 6.890576474687264e-06,
|
|
"loss": 0.5563,
|
|
"num_tokens": 57062435.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.10306845003933911,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 6.843316649507627e-06,
|
|
"loss": 0.5764,
|
|
"num_tokens": 57474963.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.1038552321007081,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 6.795893600324678e-06,
|
|
"loss": 0.5544,
|
|
"num_tokens": 57881417.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.10464201416207711,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 6.748313089428301e-06,
|
|
"loss": 0.5557,
|
|
"num_tokens": 58314408.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.1054287962234461,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 6.700580898241268e-06,
|
|
"loss": 0.5747,
|
|
"num_tokens": 58739816.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.10621557828481511,
|
|
"grad_norm": 0.33984375,
|
|
"learning_rate": 6.6527028266167515e-06,
|
|
"loss": 0.5608,
|
|
"num_tokens": 59159651.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.10700236034618411,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 6.604684692133597e-06,
|
|
"loss": 0.5736,
|
|
"num_tokens": 59662731.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.1077891424075531,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 6.556532329389435e-06,
|
|
"loss": 0.5666,
|
|
"num_tokens": 60042452.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.10857592446892211,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 6.508251589291732e-06,
|
|
"loss": 0.5424,
|
|
"num_tokens": 60493087.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.1093627065302911,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 6.459848338346861e-06,
|
|
"loss": 0.5746,
|
|
"num_tokens": 60931029.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.11014948859166011,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 6.411328457947264e-06,
|
|
"loss": 0.5777,
|
|
"num_tokens": 61366704.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.1109362706530291,
|
|
"grad_norm": 0.3046875,
|
|
"learning_rate": 6.362697843656823e-06,
|
|
"loss": 0.5656,
|
|
"num_tokens": 61764432.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.11172305271439811,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 6.313962404494496e-06,
|
|
"loss": 0.5826,
|
|
"num_tokens": 62217966.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.1125098347757671,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 6.265128062216319e-06,
|
|
"loss": 0.5523,
|
|
"num_tokens": 62650059.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.11329661683713611,
|
|
"grad_norm": 0.263671875,
|
|
"learning_rate": 6.216200750595878e-06,
|
|
"loss": 0.5631,
|
|
"num_tokens": 63177970.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.11408339889850512,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 6.167186414703289e-06,
|
|
"loss": 0.5898,
|
|
"num_tokens": 63563979.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.11487018095987411,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 6.118091010182837e-06,
|
|
"loss": 0.5582,
|
|
"num_tokens": 64029187.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.11565696302124312,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 6.068920502529309e-06,
|
|
"loss": 0.5884,
|
|
"num_tokens": 64506763.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.11644374508261211,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 6.019680866363139e-06,
|
|
"loss": 0.5653,
|
|
"num_tokens": 64953757.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.11723052714398112,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 5.970378084704441e-06,
|
|
"loss": 0.5931,
|
|
"num_tokens": 65389299.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.11801730920535011,
|
|
"grad_norm": 0.337890625,
|
|
"learning_rate": 5.921018148246031e-06,
|
|
"loss": 0.5773,
|
|
"num_tokens": 65813399.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.11880409126671912,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 5.871607054625497e-06,
|
|
"loss": 0.5549,
|
|
"num_tokens": 66215206.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.11959087332808813,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 5.822150807696443e-06,
|
|
"loss": 0.5653,
|
|
"num_tokens": 66640758.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.12037765538945712,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 5.772655416798972e-06,
|
|
"loss": 0.5904,
|
|
"num_tokens": 67055455.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.12116443745082613,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 5.723126896029501e-06,
|
|
"loss": 0.5743,
|
|
"num_tokens": 67483884.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.12195121951219512,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 5.6735712635099975e-06,
|
|
"loss": 0.568,
|
|
"num_tokens": 67926609.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.12273800157356413,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 5.62399454065673e-06,
|
|
"loss": 0.5768,
|
|
"num_tokens": 68392157.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.12352478363493312,
|
|
"grad_norm": 0.287109375,
|
|
"learning_rate": 5.574402751448614e-06,
|
|
"loss": 0.5595,
|
|
"num_tokens": 68834479.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.12431156569630213,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 5.524801921695253e-06,
|
|
"loss": 0.5762,
|
|
"num_tokens": 69263714.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.12509834775767112,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 5.475198078304749e-06,
|
|
"loss": 0.5703,
|
|
"num_tokens": 69766869.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.12588512981904013,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 5.4255972485513875e-06,
|
|
"loss": 0.5676,
|
|
"num_tokens": 70234072.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.12667191188040913,
|
|
"grad_norm": 0.33984375,
|
|
"learning_rate": 5.376005459343272e-06,
|
|
"loss": 0.5646,
|
|
"num_tokens": 70568089.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.12745869394177814,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 5.326428736490002e-06,
|
|
"loss": 0.5526,
|
|
"num_tokens": 70975207.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.12824547600314712,
|
|
"grad_norm": 0.265625,
|
|
"learning_rate": 5.2768731039705005e-06,
|
|
"loss": 0.5821,
|
|
"num_tokens": 71474447.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.12903225806451613,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 5.227344583201031e-06,
|
|
"loss": 0.5662,
|
|
"num_tokens": 71959059.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.12981904012588513,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 5.17784919230356e-06,
|
|
"loss": 0.5653,
|
|
"num_tokens": 72364485.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.13060582218725414,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 5.1283929453745055e-06,
|
|
"loss": 0.569,
|
|
"num_tokens": 72743904.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.13139260424862312,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 5.0789818517539715e-06,
|
|
"loss": 0.5833,
|
|
"num_tokens": 73214490.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.13217938630999213,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 5.02962191529556e-06,
|
|
"loss": 0.5677,
|
|
"num_tokens": 73626086.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.13296616837136113,
|
|
"grad_norm": 0.318359375,
|
|
"learning_rate": 4.980319133636863e-06,
|
|
"loss": 0.5839,
|
|
"num_tokens": 74019455.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.13375295043273014,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 4.9310794974706926e-06,
|
|
"loss": 0.583,
|
|
"num_tokens": 74481873.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.13453973249409915,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 4.881908989817163e-06,
|
|
"loss": 0.5567,
|
|
"num_tokens": 74879232.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.13532651455546812,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 4.832813585296711e-06,
|
|
"loss": 0.5718,
|
|
"num_tokens": 75325621.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.13611329661683713,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 4.783799249404123e-06,
|
|
"loss": 0.5743,
|
|
"num_tokens": 75719223.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.13690007867820614,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 4.734871937783683e-06,
|
|
"loss": 0.5742,
|
|
"num_tokens": 76143196.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.13768686073957515,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 4.686037595505507e-06,
|
|
"loss": 0.588,
|
|
"num_tokens": 76585451.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.13847364280094415,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 4.637302156343178e-06,
|
|
"loss": 0.5555,
|
|
"num_tokens": 77054043.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.13926042486231313,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 4.588671542052737e-06,
|
|
"loss": 0.5721,
|
|
"num_tokens": 77503285.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.14004720692368214,
|
|
"grad_norm": 0.27734375,
|
|
"learning_rate": 4.54015166165314e-06,
|
|
"loss": 0.5688,
|
|
"num_tokens": 77954347.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.14083398898505115,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 4.491748410708268e-06,
|
|
"loss": 0.5803,
|
|
"num_tokens": 78357424.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.14162077104642015,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 4.4434676706105665e-06,
|
|
"loss": 0.5747,
|
|
"num_tokens": 78784439.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.14240755310778913,
|
|
"grad_norm": 0.3046875,
|
|
"learning_rate": 4.395315307866404e-06,
|
|
"loss": 0.564,
|
|
"num_tokens": 79210285.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.14319433516915814,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 4.347297173383248e-06,
|
|
"loss": 0.5558,
|
|
"num_tokens": 79630366.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.14398111723052714,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 4.299419101758733e-06,
|
|
"loss": 0.5728,
|
|
"num_tokens": 80044690.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.14476789929189615,
|
|
"grad_norm": 0.265625,
|
|
"learning_rate": 4.2516869105717e-06,
|
|
"loss": 0.5898,
|
|
"num_tokens": 80557643.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.14555468135326516,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 4.204106399675324e-06,
|
|
"loss": 0.5807,
|
|
"num_tokens": 80973066.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.14634146341463414,
|
|
"grad_norm": 0.2578125,
|
|
"learning_rate": 4.156683350492376e-06,
|
|
"loss": 0.5606,
|
|
"num_tokens": 81510795.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.14712824547600314,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 4.109423525312738e-06,
|
|
"loss": 0.5899,
|
|
"num_tokens": 81922256.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.14791502753737215,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 4.062332666593234e-06,
|
|
"loss": 0.5633,
|
|
"num_tokens": 82388653.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.14870180959874116,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 4.015416496259878e-06,
|
|
"loss": 0.559,
|
|
"num_tokens": 82787043.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.14948859166011014,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 3.968680715012606e-06,
|
|
"loss": 0.5568,
|
|
"num_tokens": 83289042.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.15027537372147914,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 3.922131001632607e-06,
|
|
"loss": 0.5856,
|
|
"num_tokens": 83677670.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.15106215578284815,
|
|
"grad_norm": 0.341796875,
|
|
"learning_rate": 3.875773012292286e-06,
|
|
"loss": 0.5845,
|
|
"num_tokens": 84134629.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.15184893784421716,
|
|
"grad_norm": 0.263671875,
|
|
"learning_rate": 3.829612379868006e-06,
|
|
"loss": 0.5485,
|
|
"num_tokens": 84652172.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.15263571990558616,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 3.7836547132556534e-06,
|
|
"loss": 0.5595,
|
|
"num_tokens": 85135514.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.15342250196695514,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 3.73790559668909e-06,
|
|
"loss": 0.5513,
|
|
"num_tokens": 85538238.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.15420928402832415,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 3.692370589061639e-06,
|
|
"loss": 0.5643,
|
|
"num_tokens": 85964644.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.15499606608969316,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 3.6470552232506282e-06,
|
|
"loss": 0.5812,
|
|
"num_tokens": 86406088.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.15578284815106216,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 3.601965005445106e-06,
|
|
"loss": 0.58,
|
|
"num_tokens": 86795385.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.15656963021243114,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 3.5571054144767823e-06,
|
|
"loss": 0.5695,
|
|
"num_tokens": 87271627.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.15735641227380015,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 3.5124819011543177e-06,
|
|
"loss": 0.5785,
|
|
"num_tokens": 87610861.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.15814319433516916,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 3.468099887600999e-06,
|
|
"loss": 0.5551,
|
|
"num_tokens": 88074052.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.15892997639653816,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 3.423964766595906e-06,
|
|
"loss": 0.5936,
|
|
"num_tokens": 88524495.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.15971675845790717,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 3.380081900918648e-06,
|
|
"loss": 0.5787,
|
|
"num_tokens": 89019306.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.16050354051927615,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 3.3364566226977414e-06,
|
|
"loss": 0.563,
|
|
"num_tokens": 89502903.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.16129032258064516,
|
|
"grad_norm": 0.6328125,
|
|
"learning_rate": 3.293094232762715e-06,
|
|
"loss": 0.5732,
|
|
"num_tokens": 89911337.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.16207710464201416,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 3.2500000000000015e-06,
|
|
"loss": 0.5682,
|
|
"num_tokens": 90392411.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.16286388670338317,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 3.207179160712744e-06,
|
|
"loss": 0.5702,
|
|
"num_tokens": 90806043.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.16365066876475218,
|
|
"grad_norm": 0.31640625,
|
|
"learning_rate": 3.1646369179845336e-06,
|
|
"loss": 0.5928,
|
|
"num_tokens": 91216707.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.16443745082612116,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 3.1223784410471857e-06,
|
|
"loss": 0.5525,
|
|
"num_tokens": 91655883.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.16522423288749016,
|
|
"grad_norm": 0.34375,
|
|
"learning_rate": 3.0804088646526488e-06,
|
|
"loss": 0.5697,
|
|
"num_tokens": 92157394.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.16601101494885917,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 3.0387332884490806e-06,
|
|
"loss": 0.5829,
|
|
"num_tokens": 92616352.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.16679779701022818,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 2.9973567763611975e-06,
|
|
"loss": 0.5913,
|
|
"num_tokens": 93057214.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.16758457907159716,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 2.9562843559749765e-06,
|
|
"loss": 0.5842,
|
|
"num_tokens": 93505292.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.16837136113296616,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 2.9155210179267546e-06,
|
|
"loss": 0.5798,
|
|
"num_tokens": 93947491.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.16915814319433517,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 2.8750717152968226e-06,
|
|
"loss": 0.5765,
|
|
"num_tokens": 94385854.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.16994492525570418,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 2.8349413630075907e-06,
|
|
"loss": 0.5708,
|
|
"num_tokens": 94879232.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.17073170731707318,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 2.7951348372263875e-06,
|
|
"loss": 0.5776,
|
|
"num_tokens": 95323846.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.17151848937844216,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 2.75565697477296e-06,
|
|
"loss": 0.5687,
|
|
"num_tokens": 95785832.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.17230527143981117,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 2.716512572531759e-06,
|
|
"loss": 0.5851,
|
|
"num_tokens": 96209254.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.17309205350118018,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 2.677706386869083e-06,
|
|
"loss": 0.5794,
|
|
"num_tokens": 96626894.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.17387883556254918,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 2.639243133055145e-06,
|
|
"loss": 0.5668,
|
|
"num_tokens": 97033416.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.17466561762391816,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 2.6011274846911117e-06,
|
|
"loss": 0.5687,
|
|
"num_tokens": 97471843.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.17545239968528717,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 2.5633640731412414e-06,
|
|
"loss": 0.5308,
|
|
"num_tokens": 97935067.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.17623918174665618,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 2.5259574869701252e-06,
|
|
"loss": 0.5587,
|
|
"num_tokens": 98406708.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.17702596380802518,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 2.4889122713851397e-06,
|
|
"loss": 0.5912,
|
|
"num_tokens": 98895234.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.1778127458693942,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 2.4522329276841664e-06,
|
|
"loss": 0.5614,
|
|
"num_tokens": 99347577.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.17859952793076317,
|
|
"grad_norm": 0.265625,
|
|
"learning_rate": 2.415923912708652e-06,
|
|
"loss": 0.5617,
|
|
"num_tokens": 99798396.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.17938630999213218,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 2.379989638302062e-06,
|
|
"loss": 0.5726,
|
|
"num_tokens": 100234238.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.18017309205350118,
|
|
"grad_norm": 0.318359375,
|
|
"learning_rate": 2.3444344707738017e-06,
|
|
"loss": 0.5679,
|
|
"num_tokens": 100582182.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.1809598741148702,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 2.3092627303686827e-06,
|
|
"loss": 0.5707,
|
|
"num_tokens": 101009465.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.18174665617623917,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 2.2744786907419704e-06,
|
|
"loss": 0.5719,
|
|
"num_tokens": 101390878.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.18253343823760818,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 2.2400865784401e-06,
|
|
"loss": 0.579,
|
|
"num_tokens": 101822510.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.18332022029897718,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 2.2060905723871225e-06,
|
|
"loss": 0.5608,
|
|
"num_tokens": 102238479.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.1841070023603462,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 2.1724948033769257e-06,
|
|
"loss": 0.5843,
|
|
"num_tokens": 102714846.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.1848937844217152,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 2.139303353571309e-06,
|
|
"loss": 0.5772,
|
|
"num_tokens": 103153235.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.18568056648308418,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 2.1065202560039678e-06,
|
|
"loss": 0.5757,
|
|
"num_tokens": 103556743.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.18646734854445318,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 2.0741494940904495e-06,
|
|
"loss": 0.5741,
|
|
"num_tokens": 103996580.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.1872541306058222,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 2.0421950011441354e-06,
|
|
"loss": 0.5644,
|
|
"num_tokens": 104425967.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.1880409126671912,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 2.0106606598983036e-06,
|
|
"loss": 0.5633,
|
|
"num_tokens": 104844108.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.1888276947285602,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 1.9795503020343557e-06,
|
|
"loss": 0.5896,
|
|
"num_tokens": 105302805.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.18961447678992918,
|
|
"grad_norm": 0.33203125,
|
|
"learning_rate": 1.94886770771623e-06,
|
|
"loss": 0.5854,
|
|
"num_tokens": 105708319.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.1904012588512982,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.9186166051310772e-06,
|
|
"loss": 0.5613,
|
|
"num_tokens": 106112247.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.1911880409126672,
|
|
"grad_norm": 0.267578125,
|
|
"learning_rate": 1.8888006700362654e-06,
|
|
"loss": 0.5528,
|
|
"num_tokens": 106610177.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.1919748229740362,
|
|
"grad_norm": 0.3125,
|
|
"learning_rate": 1.8594235253127373e-06,
|
|
"loss": 0.5574,
|
|
"num_tokens": 106966568.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.19276160503540518,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.8304887405247986e-06,
|
|
"loss": 0.5738,
|
|
"num_tokens": 107408366.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.1935483870967742,
|
|
"grad_norm": 0.26953125,
|
|
"learning_rate": 1.8019998314863974e-06,
|
|
"loss": 0.5744,
|
|
"num_tokens": 107911855.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.1943351691581432,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 1.77396025983391e-06,
|
|
"loss": 0.5776,
|
|
"num_tokens": 108341668.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.1951219512195122,
|
|
"grad_norm": 0.27734375,
|
|
"learning_rate": 1.7463734326055365e-06,
|
|
"loss": 0.5808,
|
|
"num_tokens": 108836854.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.1959087332808812,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 1.7192427018273066e-06,
|
|
"loss": 0.5761,
|
|
"num_tokens": 109245431.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.1966955153422502,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.6925713641057904e-06,
|
|
"loss": 0.5597,
|
|
"num_tokens": 109703188.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.1974822974036192,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 1.666362660227529e-06,
|
|
"loss": 0.5857,
|
|
"num_tokens": 110128169.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.1982690794649882,
|
|
"grad_norm": 0.287109375,
|
|
"learning_rate": 1.6406197747652485e-06,
|
|
"loss": 0.5891,
|
|
"num_tokens": 110581824.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.1990558615263572,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 1.6153458356909177e-06,
|
|
"loss": 0.5529,
|
|
"num_tokens": 111007577.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.1998426435877262,
|
|
"grad_norm": 0.263671875,
|
|
"learning_rate": 1.590543913995666e-06,
|
|
"loss": 0.5472,
|
|
"num_tokens": 111486780.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.2006294256490952,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 1.5662170233166353e-06,
|
|
"loss": 0.5905,
|
|
"num_tokens": 111908526.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.2014162077104642,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 1.5423681195707997e-06,
|
|
"loss": 0.5717,
|
|
"num_tokens": 112333164.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.2022029897718332,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.5190001005957938e-06,
|
|
"loss": 0.5803,
|
|
"num_tokens": 112818479.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.20298977183320221,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 1.4961158057978064e-06,
|
|
"loss": 0.5808,
|
|
"num_tokens": 113208745.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.2037765538945712,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.4737180158065645e-06,
|
|
"loss": 0.6051,
|
|
"num_tokens": 113612135.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.2045633359559402,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 1.4518094521374682e-06,
|
|
"loss": 0.5936,
|
|
"num_tokens": 114022639.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.2053501180173092,
|
|
"grad_norm": 0.28125,
|
|
"learning_rate": 1.4303927768609016e-06,
|
|
"loss": 0.5694,
|
|
"num_tokens": 114460242.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.20613690007867821,
|
|
"grad_norm": 0.267578125,
|
|
"learning_rate": 1.4094705922787688e-06,
|
|
"loss": 0.5386,
|
|
"num_tokens": 114905828.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.2069236821400472,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.389045440608296e-06,
|
|
"loss": 0.5655,
|
|
"num_tokens": 115362240.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.2077104642014162,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 1.3691198036731285e-06,
|
|
"loss": 0.5915,
|
|
"num_tokens": 115822643.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.2084972462627852,
|
|
"grad_norm": 0.26953125,
|
|
"learning_rate": 1.3496961026017689e-06,
|
|
"loss": 0.5665,
|
|
"num_tokens": 116307039.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.20928402832415421,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 1.3307766975333922e-06,
|
|
"loss": 0.5882,
|
|
"num_tokens": 116705870.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.21007081038552322,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.3123638873310676e-06,
|
|
"loss": 0.5847,
|
|
"num_tokens": 117156035.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.2108575924468922,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.2944599093024268e-06,
|
|
"loss": 0.5662,
|
|
"num_tokens": 117525103.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.2116443745082612,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 1.277066938927816e-06,
|
|
"loss": 0.5949,
|
|
"num_tokens": 117912232.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.21243115656963021,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 1.260187089595956e-06,
|
|
"loss": 0.5724,
|
|
"num_tokens": 118347405.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.21321793863099922,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 1.2438224123471442e-06,
|
|
"loss": 0.6001,
|
|
"num_tokens": 118728218.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.21400472069236823,
|
|
"grad_norm": 0.345703125,
|
|
"learning_rate": 1.2279748956240435e-06,
|
|
"loss": 0.5706,
|
|
"num_tokens": 119114776.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.2147915027537372,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 1.2126464650300652e-06,
|
|
"loss": 0.5783,
|
|
"num_tokens": 119617143.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.2155782848151062,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 1.1978389830953908e-06,
|
|
"loss": 0.5722,
|
|
"num_tokens": 120024450.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.21636506687647522,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.1835542490506658e-06,
|
|
"loss": 0.5742,
|
|
"num_tokens": 120395245.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.21715184893784423,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 1.1697939986083732e-06,
|
|
"loss": 0.565,
|
|
"num_tokens": 120854077.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.2179386309992132,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.1565599037519317e-06,
|
|
"loss": 0.5903,
|
|
"num_tokens": 121277050.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.2187254130605822,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 1.1438535725325342e-06,
|
|
"loss": 0.5744,
|
|
"num_tokens": 121651600.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.21951219512195122,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 1.1316765488737602e-06,
|
|
"loss": 0.5641,
|
|
"num_tokens": 122132040.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.22029897718332023,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.1200303123839744e-06,
|
|
"loss": 0.566,
|
|
"num_tokens": 122653140.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.22108575924468923,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.10891627817654e-06,
|
|
"loss": 0.5721,
|
|
"num_tokens": 123073782.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.2218725413060582,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 1.0983357966978747e-06,
|
|
"loss": 0.6001,
|
|
"num_tokens": 123501000.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.22265932336742722,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 1.088290153563358e-06,
|
|
"loss": 0.5673,
|
|
"num_tokens": 123967087.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.22344610542879623,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 1.0787805694011185e-06,
|
|
"loss": 0.5559,
|
|
"num_tokens": 124432312.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.22423288749016523,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 1.0698081997037178e-06,
|
|
"loss": 0.5893,
|
|
"num_tokens": 124898330.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.2250196695515342,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.0613741346877498e-06,
|
|
"loss": 0.5695,
|
|
"num_tokens": 125305283.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.22580645161290322,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 1.053479399161368e-06,
|
|
"loss": 0.5538,
|
|
"num_tokens": 125723575.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.22659323367427223,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 1.0461249523997647e-06,
|
|
"loss": 0.5838,
|
|
"num_tokens": 126109744.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.22738001573564123,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 1.0393116880286117e-06,
|
|
"loss": 0.5723,
|
|
"num_tokens": 126532859.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.22816679779701024,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 1.0330404339154763e-06,
|
|
"loss": 0.5648,
|
|
"num_tokens": 126984590.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.22895357985837922,
|
|
"grad_norm": 0.291015625,
|
|
"learning_rate": 1.0273119520692274e-06,
|
|
"loss": 0.5862,
|
|
"num_tokens": 127423409.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.22974036191974823,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.0221269385474486e-06,
|
|
"loss": 0.541,
|
|
"num_tokens": 127896042.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.23052714398111723,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.0174860233718585e-06,
|
|
"loss": 0.5795,
|
|
"num_tokens": 128311556.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.23131392604248624,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.0133897704517585e-06,
|
|
"loss": 0.5622,
|
|
"num_tokens": 128702300.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.23210070810385522,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.0098386775155147e-06,
|
|
"loss": 0.5686,
|
|
"num_tokens": 129182359.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.23288749016522423,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.0068331760500773e-06,
|
|
"loss": 0.5527,
|
|
"num_tokens": 129563093.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.23367427222659323,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 1.0043736312485536e-06,
|
|
"loss": 0.5668,
|
|
"num_tokens": 129956060.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.23446105428796224,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 1.0024603419658329e-06,
|
|
"loss": 0.5481,
|
|
"num_tokens": 130445808.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.23524783634933125,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.0010935406822748e-06,
|
|
"loss": 0.567,
|
|
"num_tokens": 130875660.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.23603461841070023,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 1.0002733934754567e-06,
|
|
"loss": 0.5759,
|
|
"num_tokens": 131289617.0,
|
|
"step": 300
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 300,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.862254349520732e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|