17602 lines
405 KiB
JSON
17602 lines
405 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.7135896458142392,
|
|
"eval_steps": 250,
|
|
"global_step": 2500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0002854358583256957,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 3.7737,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.0005708717166513914,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 1.7142857142857142e-05,
|
|
"loss": 3.8253,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.000856307574977087,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 2.571428571428571e-05,
|
|
"loss": 3.8136,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.0011417434333027827,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 3.4285714285714284e-05,
|
|
"loss": 3.7592,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.0014271792916284785,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 4.285714285714285e-05,
|
|
"loss": 3.7806,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.001712615149954174,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 5.142857142857142e-05,
|
|
"loss": 3.7962,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.0019980510082798697,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 5.9999999999999995e-05,
|
|
"loss": 3.7494,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.0022834868666055655,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 6.857142857142857e-05,
|
|
"loss": 3.7721,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.0025689227249312612,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 7.714285714285713e-05,
|
|
"loss": 3.744,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.002854358583256957,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 8.57142857142857e-05,
|
|
"loss": 3.7373,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.003139794441582653,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 9.428571428571427e-05,
|
|
"loss": 3.7021,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.003425230299908348,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 0.00010285714285714284,
|
|
"loss": 3.6838,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.003710666158234044,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.00011142857142857142,
|
|
"loss": 3.7084,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.003996102016559739,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 0.00011999999999999999,
|
|
"loss": 3.6489,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.004281537874885435,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.00012857142857142855,
|
|
"loss": 3.6588,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.004566973733211131,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 0.00013714285714285713,
|
|
"loss": 3.6394,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.004852409591536827,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 0.0001457142857142857,
|
|
"loss": 3.5906,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.0051378454498625225,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.00015428571428571425,
|
|
"loss": 3.5944,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.005423281308188218,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 0.00016285714285714284,
|
|
"loss": 3.5843,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.005708717166513914,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.0001714285714285714,
|
|
"loss": 3.5661,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.00599415302483961,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 0.00017999999999999998,
|
|
"loss": 3.5976,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.006279588883165306,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 0.00018857142857142854,
|
|
"loss": 3.5226,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.006565024741491001,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00019714285714285713,
|
|
"loss": 3.5581,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.006850460599816696,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 0.0002057142857142857,
|
|
"loss": 3.5337,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.007135896458142392,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 0.00021428571428571427,
|
|
"loss": 3.502,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.007421332316468088,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.00022285714285714283,
|
|
"loss": 3.4848,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.007706768174793784,
|
|
"grad_norm": 7.65625,
|
|
"learning_rate": 0.00023142857142857142,
|
|
"loss": 3.5451,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.007992204033119479,
|
|
"grad_norm": 4.96875,
|
|
"learning_rate": 0.00023999999999999998,
|
|
"loss": 3.5235,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.008277639891445174,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 0.00024857142857142857,
|
|
"loss": 3.5061,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.00856307574977087,
|
|
"grad_norm": 4.9375,
|
|
"learning_rate": 0.0002571428571428571,
|
|
"loss": 3.5228,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.008848511608096566,
|
|
"grad_norm": 7.75,
|
|
"learning_rate": 0.0002657142857142857,
|
|
"loss": 3.4963,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.009133947466422262,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 0.00027428571428571427,
|
|
"loss": 3.5074,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.009419383324747958,
|
|
"grad_norm": 5.3125,
|
|
"learning_rate": 0.0002828571428571428,
|
|
"loss": 3.4555,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.009704819183073653,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 0.0002914285714285714,
|
|
"loss": 3.4634,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.00999025504139935,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 0.0003,
|
|
"loss": 3.4516,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.010275690899725045,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 0.00029999993845357924,
|
|
"loss": 3.4341,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.01056112675805074,
|
|
"grad_norm": 5.40625,
|
|
"learning_rate": 0.0002999997538143675,
|
|
"loss": 3.4625,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.010846562616376437,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 0.0002999994460825163,
|
|
"loss": 3.4492,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.011131998474702132,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 0.0002999990152582781,
|
|
"loss": 3.4078,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.011417434333027828,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 0.00029999846134200653,
|
|
"loss": 3.4077,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.011702870191353524,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.0002999977843341562,
|
|
"loss": 3.4062,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.01198830604967922,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 0.0002999969842352825,
|
|
"loss": 3.3895,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.012273741908004916,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 0.0002999960610460421,
|
|
"loss": 3.3762,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.012559177766330611,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.00029999501476719257,
|
|
"loss": 3.3807,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.012844613624656307,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 0.00029999384539959253,
|
|
"loss": 3.3432,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.013130049482982001,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 0.0002999925529442016,
|
|
"loss": 3.3543,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.013415485341307697,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 0.0002999911374020804,
|
|
"loss": 3.3339,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.013700921199633393,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00029998959877439044,
|
|
"loss": 3.3377,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.013986357057959089,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 0.0002999879370623944,
|
|
"loss": 3.4033,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.014271792916284784,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 0.00029998615226745605,
|
|
"loss": 3.3567,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.01455722877461048,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 0.0002999842443910399,
|
|
"loss": 3.3819,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.014842664632936176,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 0.0002999822134347115,
|
|
"loss": 3.3586,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.015128100491261872,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 0.0002999800594001376,
|
|
"loss": 3.3414,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.015413536349587567,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.000299977782289086,
|
|
"loss": 3.3165,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.01569897220791326,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 0.00029997538210342503,
|
|
"loss": 3.3446,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.015984408066238957,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.0002999728588451245,
|
|
"loss": 3.3649,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.016269843924564653,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.000299970212516255,
|
|
"loss": 3.3258,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.01655527978289035,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 0.0002999674431189883,
|
|
"loss": 3.3137,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.016840715641216045,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0002999645506555967,
|
|
"loss": 3.31,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.01712615149954174,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 0.00029996153512845415,
|
|
"loss": 3.3022,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.017411587357867436,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 0.00029995839654003504,
|
|
"loss": 3.3119,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.017697023216193132,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00029995513489291506,
|
|
"loss": 3.306,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.017982459074518828,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.0002999517501897707,
|
|
"loss": 3.2965,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.018267894932844524,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 0.0002999482424333796,
|
|
"loss": 3.3035,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.01855333079117022,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.00029994461162662024,
|
|
"loss": 3.2734,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.018838766649495915,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0002999408577724721,
|
|
"loss": 3.2772,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.01912420250782161,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 0.0002999369808740157,
|
|
"loss": 3.2491,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.019409638366147307,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 0.00029993298093443246,
|
|
"loss": 3.2943,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.019695074224473003,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0002999288579570049,
|
|
"loss": 3.2525,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.0199805100827987,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 0.00029992461194511624,
|
|
"loss": 3.2765,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.020265945941124394,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.000299920242902251,
|
|
"loss": 3.2538,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.02055138179945009,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.00029991575083199455,
|
|
"loss": 3.2407,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.020836817657775786,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 0.00029991113573803294,
|
|
"loss": 3.2537,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.02112225351610148,
|
|
"grad_norm": 4.34375,
|
|
"learning_rate": 0.0002999063976241536,
|
|
"loss": 3.2618,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.021407689374427177,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00029990153649424463,
|
|
"loss": 3.2486,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.021693125232752873,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 0.0002998965523522951,
|
|
"loss": 3.2839,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.02197856109107857,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 0.0002998914452023953,
|
|
"loss": 3.2866,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.022263996949404265,
|
|
"grad_norm": 4.9375,
|
|
"learning_rate": 0.00029988621504873606,
|
|
"loss": 3.3082,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.02254943280772996,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 0.0002998808618956094,
|
|
"loss": 3.2833,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.022834868666055656,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 0.00029987538574740826,
|
|
"loss": 3.2748,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.023120304524381352,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 0.0002998697866086264,
|
|
"loss": 3.2491,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.023405740382707048,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 0.0002998640644838587,
|
|
"loss": 3.2526,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.023691176241032744,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.0002998582193778006,
|
|
"loss": 3.2262,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.02397661209935844,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.000299852251295249,
|
|
"loss": 3.2321,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.024262047957684135,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.0002998461602411013,
|
|
"loss": 3.2485,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.02454748381600983,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.00029983994622035585,
|
|
"loss": 3.2223,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.024832919674335527,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 0.0002998336092381121,
|
|
"loss": 3.2184,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.025118355532661223,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002998271492995702,
|
|
"loss": 3.2204,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.02540379139098692,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 0.00029982056641003147,
|
|
"loss": 3.2185,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.025689227249312614,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00029981386057489776,
|
|
"loss": 3.1942,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.025974663107638307,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 0.00029980703179967213,
|
|
"loss": 3.1724,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.026260098965964002,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.00029980008008995834,
|
|
"loss": 3.2225,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.026545534824289698,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 0.0002997930054514612,
|
|
"loss": 3.2103,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.026830970682615394,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0002997858078899861,
|
|
"loss": 3.1942,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.02711640654094109,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00029977848741143966,
|
|
"loss": 3.1652,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.027401842399266785,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 0.0002997710440218291,
|
|
"loss": 3.186,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.02768727825759248,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002997634777272627,
|
|
"loss": 3.1928,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.027972714115918177,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.0002997557885339494,
|
|
"loss": 3.169,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.028258149974243873,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029974797644819926,
|
|
"loss": 3.174,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.02854358583256957,
|
|
"grad_norm": 3.984375,
|
|
"learning_rate": 0.0002997400414764229,
|
|
"loss": 3.1859,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.028829021690895264,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0002997319836251319,
|
|
"loss": 3.1975,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.02911445754922096,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0002997238029009387,
|
|
"loss": 3.163,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.029399893407546656,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 0.0002997154993105566,
|
|
"loss": 3.1766,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.029685329265872352,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 0.00029970707286079966,
|
|
"loss": 3.1692,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.029970765124198048,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 0.00029969852355858276,
|
|
"loss": 3.1785,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.030256200982523743,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00029968985141092165,
|
|
"loss": 3.1622,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.03054163684084944,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.00029968105642493286,
|
|
"loss": 3.1934,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.030827072699175135,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 0.0002996721386078337,
|
|
"loss": 3.1503,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.03111250855750083,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.00029966309796694226,
|
|
"loss": 3.1415,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.03139794441582652,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0002996539345096776,
|
|
"loss": 3.169,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.03168338027415222,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0002996446482435593,
|
|
"loss": 3.1381,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.031968816132477915,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.0002996352391762079,
|
|
"loss": 3.1506,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.03225425199080361,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.0002996257073153446,
|
|
"loss": 3.1666,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.032539687849129306,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.00029961605266879153,
|
|
"loss": 3.1883,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.032825123707455,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0002996062752444714,
|
|
"loss": 3.1594,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.0331105595657807,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00029959637505040773,
|
|
"loss": 3.1553,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.033395995424106394,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.00029958635209472486,
|
|
"loss": 3.125,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.03368143128243209,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.00029957620638564785,
|
|
"loss": 3.1074,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.033966867140757785,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00029956593793150233,
|
|
"loss": 3.1193,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.03425230299908348,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0002995555467407149,
|
|
"loss": 3.107,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.03453773885740918,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0002995450328218127,
|
|
"loss": 3.1292,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.03482317471573487,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0002995343961834238,
|
|
"loss": 3.1159,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.03510861057406057,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0002995236368342766,
|
|
"loss": 3.1207,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.035394046432386264,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029951275478320056,
|
|
"loss": 3.1056,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.03567948229071196,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 0.00029950175003912573,
|
|
"loss": 3.1206,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.035964918149037656,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0002994906226110827,
|
|
"loss": 3.1213,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.03625035400736335,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.00029947937250820295,
|
|
"loss": 3.1091,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.03653578986568905,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002994679997397185,
|
|
"loss": 3.1071,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.03682122572401474,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 0.000299456504314962,
|
|
"loss": 3.143,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.03710666158234044,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00029944488624336683,
|
|
"loss": 3.1106,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.037392097440666135,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 0.00029943314553446706,
|
|
"loss": 3.1163,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.03767753329899183,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.00029942128219789734,
|
|
"loss": 3.1173,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.037962969157317526,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002994092962433929,
|
|
"loss": 3.1289,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.03824840501564322,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0002993971876807896,
|
|
"loss": 3.1056,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.03853384087396892,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002993849565200241,
|
|
"loss": 3.0896,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.038819276732294614,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002993726027711333,
|
|
"loss": 3.1087,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.03910471259062031,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.00029936012644425517,
|
|
"loss": 3.1059,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.039390148448946005,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 0.00029934752754962783,
|
|
"loss": 3.1265,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.0396755843072717,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00029933480609759027,
|
|
"loss": 3.0987,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.0399610201655974,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.00029932196209858197,
|
|
"loss": 3.1122,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.04024645602392309,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002993089955631429,
|
|
"loss": 3.0887,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.04053189188224879,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0002992959065019136,
|
|
"loss": 3.0815,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.040817327740574484,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 0.00029928269492563537,
|
|
"loss": 3.0889,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.04110276359890018,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00029926936084514967,
|
|
"loss": 3.0793,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.041388199457225876,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.00029925590427139887,
|
|
"loss": 3.0804,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.04167363531555157,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00029924232521542557,
|
|
"loss": 3.0612,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.04195907117387727,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.00029922862368837315,
|
|
"loss": 3.0698,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.04224450703220296,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.00029921479970148517,
|
|
"loss": 3.088,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.04252994289052866,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.00029920085326610595,
|
|
"loss": 3.0765,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.042815378748854355,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 0.00029918678439368017,
|
|
"loss": 3.0926,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.04310081460718005,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.000299172593095753,
|
|
"loss": 3.0821,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.043386250465505746,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 0.00029915827938397017,
|
|
"loss": 3.0682,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.04367168632383144,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 0.0002991438432700777,
|
|
"loss": 3.0657,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.04395712218215714,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.0002991292847659222,
|
|
"loss": 3.0883,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.044242558040482834,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 0.0002991146038834505,
|
|
"loss": 3.0962,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.04452799389880853,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0002990998006347102,
|
|
"loss": 3.0695,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.044813429757134225,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.0002990848750318491,
|
|
"loss": 3.1003,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.04509886561545992,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.00029906982708711533,
|
|
"loss": 3.0733,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.04538430147378562,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 0.0002990546568128576,
|
|
"loss": 3.1179,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.04566973733211131,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 0.00029903936422152487,
|
|
"loss": 3.1125,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.04595517319043701,
|
|
"grad_norm": 4.90625,
|
|
"learning_rate": 0.00029902394932566657,
|
|
"loss": 3.0922,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.046240609048762704,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 0.00029900841213793247,
|
|
"loss": 3.048,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.0465260449070884,
|
|
"grad_norm": 9.5,
|
|
"learning_rate": 0.00029899275267107264,
|
|
"loss": 3.1456,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.046811480765414096,
|
|
"grad_norm": 8.3125,
|
|
"learning_rate": 0.00029897697093793753,
|
|
"loss": 3.1066,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.04709691662373979,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 0.000298961066951478,
|
|
"loss": 3.0876,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.04738235248206549,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 0.0002989450407247451,
|
|
"loss": 3.1259,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.04766778834039118,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 0.0002989288922708902,
|
|
"loss": 3.1248,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.04795322419871688,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 0.0002989126216031652,
|
|
"loss": 3.0802,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.048238660057042575,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 0.00029889622873492195,
|
|
"loss": 3.0777,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.04852409591536827,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 0.0002988797136796128,
|
|
"loss": 3.0904,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.048809531773693966,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 0.0002988630764507904,
|
|
"loss": 3.081,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.04909496763201966,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.0002988463170621074,
|
|
"loss": 3.0743,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.04938040349034536,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.00029882943552731703,
|
|
"loss": 3.0189,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.049665839348671054,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0002988124318602725,
|
|
"loss": 3.0684,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.04995127520699675,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0002987953060749274,
|
|
"loss": 3.0479,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.050236711065322445,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0002987780581853355,
|
|
"loss": 3.0374,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.05052214692364814,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0002987606882056507,
|
|
"loss": 3.0589,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.05080758278197384,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 0.00029874319615012714,
|
|
"loss": 3.0731,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.05109301864029953,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 0.00029872558203311914,
|
|
"loss": 3.0793,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.05137845449862523,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0002987078458690811,
|
|
"loss": 3.0748,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.05166389035695092,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 0.0002986899876725678,
|
|
"loss": 3.0308,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.05194932621527661,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00029867200745823384,
|
|
"loss": 3.0496,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.05223476207360231,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002986539052408343,
|
|
"loss": 3.0577,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.052520197931928005,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0002986356810352241,
|
|
"loss": 3.0357,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.0528056337902537,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00029861733485635834,
|
|
"loss": 3.023,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.053091069648579396,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.00029859886671929233,
|
|
"loss": 3.0768,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.05337650550690509,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00029858027663918135,
|
|
"loss": 3.0272,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.05366194136523079,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0002985615646312807,
|
|
"loss": 3.0348,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.053947377223556484,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00029854273071094596,
|
|
"loss": 3.0245,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.05423281308188218,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00029852377489363247,
|
|
"loss": 3.0558,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.054518248940207875,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.00029850469719489573,
|
|
"loss": 3.0611,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.05480368479853357,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00029848549763039135,
|
|
"loss": 3.0442,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.05508912065685927,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.00029846617621587474,
|
|
"loss": 3.06,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.05537455651518496,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00029844673296720154,
|
|
"loss": 3.0144,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.05565999237351066,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0002984271679003272,
|
|
"loss": 3.0423,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.055945428231836354,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0002984074810313071,
|
|
"loss": 3.0504,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.05623086409016205,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00029838767237629684,
|
|
"loss": 3.0031,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.056516299948487746,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0002983677419515516,
|
|
"loss": 3.0401,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.05680173580681344,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029834768977342677,
|
|
"loss": 3.0359,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.05708717166513914,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0002983275158583775,
|
|
"loss": 3.028,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.05737260752346483,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0002983072202229589,
|
|
"loss": 3.0115,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.05765804338179053,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.000298286802883826,
|
|
"loss": 3.0221,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.057943479240116225,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0002982662638577335,
|
|
"loss": 3.0104,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.05822891509844192,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00029824560316153633,
|
|
"loss": 2.9983,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.058514350956767616,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00029822482081218887,
|
|
"loss": 3.0208,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.05879978681509331,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.00029820391682674563,
|
|
"loss": 3.0206,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.05908522267341901,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029818289122236075,
|
|
"loss": 3.0552,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.059370658531744704,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00029816174401628827,
|
|
"loss": 3.0075,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.0596560943900704,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00029814047522588194,
|
|
"loss": 3.0068,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.059941530248396095,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0002981190848685954,
|
|
"loss": 2.9909,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.06022696610672179,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.00029809757296198194,
|
|
"loss": 2.9962,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.06051240196504749,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00029807593952369465,
|
|
"loss": 3.0294,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.06079783782337318,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.00029805418457148637,
|
|
"loss": 2.9857,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.06108327368169888,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00029803230812320956,
|
|
"loss": 3.0202,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.061368709540024574,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00029801031019681645,
|
|
"loss": 2.9734,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.06165414539835027,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000297988190810359,
|
|
"loss": 2.9859,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.061939581256675966,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 0.0002979659499819888,
|
|
"loss": 3.0128,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.06222501711500166,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0002979435877299571,
|
|
"loss": 3.0178,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.06251045297332736,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0002979211040726147,
|
|
"loss": 2.9779,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.06279588883165305,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.00029789849902841223,
|
|
"loss": 2.9843,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.06308132468997875,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0002978757726158998,
|
|
"loss": 2.9943,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.06336676054830444,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0002978529248537271,
|
|
"loss": 3.0043,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.06365219640663014,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029782995576064337,
|
|
"loss": 2.9729,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.06393763226495583,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00029780686535549756,
|
|
"loss": 2.9874,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.06422306812328153,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002977836536572382,
|
|
"loss": 3.0055,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.06450850398160722,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00029776032068491303,
|
|
"loss": 3.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.06479393983993292,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 0.0002977368664576696,
|
|
"loss": 3.0042,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.06507937569825861,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.000297713290994755,
|
|
"loss": 2.9981,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.06536481155658432,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0002976895943155156,
|
|
"loss": 2.9803,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.06565024741491,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.00029766577643939744,
|
|
"loss": 2.9994,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.0659356832732357,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0002976418373859458,
|
|
"loss": 2.9842,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.0662211191315614,
|
|
"grad_norm": 5.125,
|
|
"learning_rate": 0.00029761777717480554,
|
|
"loss": 3.0053,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.0665065549898871,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 0.00029759359582572103,
|
|
"loss": 2.9906,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.06679199084821279,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.00029756929335853584,
|
|
"loss": 3.0234,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.06707742670653849,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 0.0002975448697931931,
|
|
"loss": 2.9871,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.06736286256486418,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.00029752032514973516,
|
|
"loss": 3.0048,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.06764829842318988,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 0.0002974956594483039,
|
|
"loss": 3.0141,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.06793373428151557,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0002974708727091404,
|
|
"loss": 2.9658,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.06821917013984127,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.00029744596495258525,
|
|
"loss": 3.002,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.06850460599816696,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.0002974209361990781,
|
|
"loss": 2.9831,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.06879004185649266,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0002973957864691581,
|
|
"loss": 2.9823,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.06907547771481835,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.00029737051578346345,
|
|
"loss": 2.9626,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.06936091357314406,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.000297345124162732,
|
|
"loss": 2.9729,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.06964634943146975,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 0.00029731961162780037,
|
|
"loss": 3.0036,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.06993178528979545,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0002972939781996047,
|
|
"loss": 2.9818,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.07021722114812114,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 0.00029726822389918034,
|
|
"loss": 2.9709,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.07050265700644684,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 0.0002972423487476617,
|
|
"loss": 2.9748,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.07078809286477253,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.0002972163527662824,
|
|
"loss": 2.96,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.07107352872309823,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 0.00029719023597637523,
|
|
"loss": 2.9929,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.07135896458142392,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.00029716399839937216,
|
|
"loss": 2.9467,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.07135896458142392,
|
|
"eval_loss": 2.805173873901367,
|
|
"eval_runtime": 5998.7495,
|
|
"eval_samples_per_second": 10.717,
|
|
"eval_steps_per_second": 10.717,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.07164440043974962,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 0.00029713764005680427,
|
|
"loss": 2.9764,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.07192983629807531,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 0.00029711116097030167,
|
|
"loss": 2.9982,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.07221527215640101,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002970845611615935,
|
|
"loss": 2.9649,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.0725007080147267,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 0.00029705784065250826,
|
|
"loss": 2.9516,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.0727861438730524,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.00029703099946497323,
|
|
"loss": 2.9788,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.0730715797313781,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002970040376210148,
|
|
"loss": 2.9737,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.0733570155897038,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00029697695514275824,
|
|
"loss": 2.9806,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.07364245144802949,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.00029694975205242816,
|
|
"loss": 2.9629,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.07392788730635519,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.00029692242837234777,
|
|
"loss": 2.9698,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.07421332316468088,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0002968949841249395,
|
|
"loss": 2.9449,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.07449875902300658,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 0.00029686741933272455,
|
|
"loss": 2.9724,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.07478419488133227,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0002968397340183232,
|
|
"loss": 2.9606,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.07506963073965797,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 0.00029681192820445445,
|
|
"loss": 3.0101,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.07535506659798366,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 0.00029678400191393626,
|
|
"loss": 2.9797,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.07564050245630936,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 0.0002967559551696856,
|
|
"loss": 2.9859,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.07592593831463505,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 0.00029672778799471797,
|
|
"loss": 2.9839,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.07621137417296076,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 0.0002966995004121481,
|
|
"loss": 2.9812,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.07649681003128644,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 0.00029667109244518923,
|
|
"loss": 2.9904,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.07678224588961215,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0002966425641171534,
|
|
"loss": 2.9614,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.07706768174793784,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.00029661391545145156,
|
|
"loss": 2.9671,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.07735311760626354,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00029658514647159335,
|
|
"loss": 2.9646,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.07763855346458923,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.0002965562572011872,
|
|
"loss": 2.9729,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.07792398932291493,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00029652724766394007,
|
|
"loss": 2.9315,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.07820942518124062,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.0002964981178836578,
|
|
"loss": 2.9511,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.07849486103956632,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00029646886788424487,
|
|
"loss": 2.9338,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.07878029689789201,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 0.0002964394976897043,
|
|
"loss": 2.936,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.07906573275621771,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002964100073241379,
|
|
"loss": 2.9335,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.0793511686145434,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.000296380396811746,
|
|
"loss": 2.9638,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.0796366044728691,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00029635066617682754,
|
|
"loss": 2.9612,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.0799220403311948,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00029632081544378003,
|
|
"loss": 2.9579,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.0802074761895205,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00029629084463709957,
|
|
"loss": 2.9506,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.08049291204784619,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0002962607537813808,
|
|
"loss": 2.9479,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.08077834790617189,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0002962305429013168,
|
|
"loss": 2.9124,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.08106378376449758,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002962002120216992,
|
|
"loss": 2.9741,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.08134921962282328,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0002961697611674181,
|
|
"loss": 2.9481,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.08163465548114897,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00029613919036346203,
|
|
"loss": 2.9457,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.08192009133947467,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00029610849963491797,
|
|
"loss": 2.9509,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.08220552719780036,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002960776890069714,
|
|
"loss": 2.9441,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.08249096305612605,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002960467585049059,
|
|
"loss": 2.9625,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.08277639891445175,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0002960157081541039,
|
|
"loss": 2.9183,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.08306183477277744,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0002959845379800457,
|
|
"loss": 2.9312,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.08334727063110314,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00029595324800831024,
|
|
"loss": 2.9224,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.08363270648942883,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002959218382645746,
|
|
"loss": 2.9394,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.08391814234775453,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.00029589030877461426,
|
|
"loss": 2.9493,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.08420357820608022,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00029585865956430283,
|
|
"loss": 2.9385,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.08448901406440593,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.00029582689065961237,
|
|
"loss": 2.9265,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.08477444992273162,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00029579500208661296,
|
|
"loss": 2.9448,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.08505988578105732,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00029576299387147305,
|
|
"loss": 2.9555,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.085345321639383,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00029573086604045904,
|
|
"loss": 2.904,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.08563075749770871,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0002956986186199358,
|
|
"loss": 2.959,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.0859161933560344,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0002956662516363661,
|
|
"loss": 2.9075,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.0862016292143601,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002956337651163109,
|
|
"loss": 2.9521,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.08648706507268579,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00029560115908642924,
|
|
"loss": 2.9425,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.08677250093101149,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0002955684335734783,
|
|
"loss": 2.9626,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.08705793678933718,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00029553558860431317,
|
|
"loss": 2.9293,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.08734337264766288,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0002955026242058872,
|
|
"loss": 2.9332,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.08762880850598857,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002954695404052514,
|
|
"loss": 2.9323,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.08791424436431428,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0002954363372295551,
|
|
"loss": 2.9408,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.08819968022263996,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0002954030147060454,
|
|
"loss": 2.9305,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.08848511608096567,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002953695728620675,
|
|
"loss": 2.9323,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.08877055193929136,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00029533601172506427,
|
|
"loss": 2.9138,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.08905598779761706,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00029530233132257663,
|
|
"loss": 2.9394,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.08934142365594275,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00029526853168224343,
|
|
"loss": 2.8984,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.08962685951426845,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002952346128318013,
|
|
"loss": 2.9322,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.08991229537259414,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00029520057479908465,
|
|
"loss": 2.9164,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.09019773123091984,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0002951664176120257,
|
|
"loss": 2.9167,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.09048316708924553,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00029513214129865456,
|
|
"loss": 2.9398,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.09076860294757123,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00029509774588709896,
|
|
"loss": 2.9395,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.09105403880589692,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00029506323140558445,
|
|
"loss": 2.9478,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.09133947466422263,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0002950285978824343,
|
|
"loss": 2.9216,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.09162491052254831,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00029499384534606936,
|
|
"loss": 2.8959,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.09191034638087402,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00029495897382500827,
|
|
"loss": 2.9072,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.0921957822391997,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00029492398334786727,
|
|
"loss": 2.9121,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.09248121809752541,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0002948888739433602,
|
|
"loss": 2.9344,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.0927666539558511,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0002948536456402985,
|
|
"loss": 2.9211,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.0930520898141768,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.00029481829846759116,
|
|
"loss": 2.9041,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.09333752567250249,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0002947828324542448,
|
|
"loss": 2.9353,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.09362296153082819,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002947472476293634,
|
|
"loss": 2.9037,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.09390839738915388,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00029471154402214864,
|
|
"loss": 2.9166,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.09419383324747958,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00029467572166189956,
|
|
"loss": 2.9074,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.09447926910580527,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029463978057801257,
|
|
"loss": 2.9137,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.09476470496413097,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00029460372079998177,
|
|
"loss": 2.8971,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.09505014082245666,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00029456754235739833,
|
|
"loss": 2.8784,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.09533557668078237,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0002945312452799511,
|
|
"loss": 2.9102,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.09562101253910806,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00029449482959742604,
|
|
"loss": 2.9096,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.09590644839743376,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002944582953397067,
|
|
"loss": 2.8925,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.09619188425575945,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0002944216425367736,
|
|
"loss": 2.9094,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.09647732011408515,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0002943848712187048,
|
|
"loss": 2.9133,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.09676275597241084,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0002943479814156756,
|
|
"loss": 2.9073,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.09704819183073654,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00029431097315795834,
|
|
"loss": 2.8993,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.09733362768906223,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00029427384647592284,
|
|
"loss": 2.8968,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.09761906354738793,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0002942366014000359,
|
|
"loss": 2.9124,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.09790449940571362,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0002941992379608615,
|
|
"loss": 2.8816,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.09818993526403932,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00029416175618906084,
|
|
"loss": 2.9015,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.09847537112236501,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.00029412415611539214,
|
|
"loss": 2.9286,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.09876080698069072,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00029408643777071073,
|
|
"loss": 2.9316,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.0990462428390164,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00029404860118596905,
|
|
"loss": 2.894,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.09933167869734211,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00029401064639221643,
|
|
"loss": 2.8946,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.0996171145556678,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0002939725734205994,
|
|
"loss": 2.9068,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.0999025504139935,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00029393438230236124,
|
|
"loss": 2.8898,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.10018798627231919,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0002938960730688424,
|
|
"loss": 2.8922,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.10047342213064489,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00029385764575148014,
|
|
"loss": 2.8772,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.10075885798897058,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00029381910038180856,
|
|
"loss": 2.8961,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.10104429384729628,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00029378043699145886,
|
|
"loss": 2.9052,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.10132972970562197,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0002937416556121589,
|
|
"loss": 2.8703,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.10161516556394767,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002937027562757334,
|
|
"loss": 2.8967,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.10190060142227336,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00029366373901410387,
|
|
"loss": 2.913,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.10218603728059907,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002936246038592886,
|
|
"loss": 2.8944,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.10247147313892475,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.00029358535084340274,
|
|
"loss": 2.8808,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.10275690899725046,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000293545979998658,
|
|
"loss": 2.9055,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.10304234485557615,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0002935064913573628,
|
|
"loss": 2.8925,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.10332778071390183,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002934668849519223,
|
|
"loss": 2.8751,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.10361321657222754,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029342716081483825,
|
|
"loss": 2.8836,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.10389865243055323,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0002933873189787091,
|
|
"loss": 2.8702,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.10418408828887893,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002933473594762297,
|
|
"loss": 2.8953,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.10446952414720462,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00029330728234019173,
|
|
"loss": 2.8753,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.10475496000553032,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0002932670876034831,
|
|
"loss": 2.8844,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.10504039586385601,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029322677529908844,
|
|
"loss": 2.9018,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.10532583172218171,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0002931863454600888,
|
|
"loss": 2.8967,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.1056112675805074,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002931457981196616,
|
|
"loss": 2.882,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.1058967034388331,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.00029310513331108086,
|
|
"loss": 2.8641,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.10618213929715879,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0002930643510677168,
|
|
"loss": 2.8808,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.1064675751554845,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00029302345142303616,
|
|
"loss": 2.8699,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.10675301101381018,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002929824344106019,
|
|
"loss": 2.8467,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.10703844687213589,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002929413000640735,
|
|
"loss": 2.8674,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.10732388273046158,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0002929000484172064,
|
|
"loss": 2.8897,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.10760931858878728,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00029285867950385255,
|
|
"loss": 2.8601,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.10789475444711297,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.00029281719335796013,
|
|
"loss": 2.89,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.10818019030543867,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00029277559001357343,
|
|
"loss": 2.9044,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.10846562616376436,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029273386950483287,
|
|
"loss": 2.8765,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.10875106202209006,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00029269203186597513,
|
|
"loss": 2.8911,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.10903649788041575,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00029265007713133304,
|
|
"loss": 2.8756,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.10932193373874145,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00029260800533533534,
|
|
"loss": 2.889,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.10960736959706714,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.000292565816512507,
|
|
"loss": 2.8758,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.10989280545539284,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.000292523510697469,
|
|
"loss": 2.8699,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.11017824131371853,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0002924810879249382,
|
|
"loss": 2.8935,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.11046367717204424,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029243854822972763,
|
|
"loss": 2.8723,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.11074911303036993,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0002923958916467461,
|
|
"loss": 2.894,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.11103454888869563,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00029235311821099847,
|
|
"loss": 2.8676,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.11131998474702132,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00029231022795758537,
|
|
"loss": 2.8786,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.11160542060534702,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0002922672209217033,
|
|
"loss": 2.867,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.11189085646367271,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00029222409713864484,
|
|
"loss": 2.8938,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.11217629232199841,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00029218085664379806,
|
|
"loss": 2.8601,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.1124617281803241,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0002921374994726469,
|
|
"loss": 2.8817,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.1127471640386498,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0002920940256607711,
|
|
"loss": 2.8482,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.11303259989697549,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0002920504352438462,
|
|
"loss": 2.8996,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.1133180357553012,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00029200672825764314,
|
|
"loss": 2.8592,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.11360347161362688,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00029196290473802885,
|
|
"loss": 2.8327,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.11388890747195259,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0002919189647209656,
|
|
"loss": 2.8438,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.11417434333027827,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00029187490824251154,
|
|
"loss": 2.884,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.11445977918860398,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00029183073533882025,
|
|
"loss": 2.8601,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.11474521504692967,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.00029178644604614077,
|
|
"loss": 2.8788,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.11503065090525537,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00029174204040081773,
|
|
"loss": 2.8823,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.11531608676358106,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0002916975184392912,
|
|
"loss": 2.8464,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.11560152262190676,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002916528801980969,
|
|
"loss": 2.8377,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.11588695848023245,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00029160812571386575,
|
|
"loss": 2.8409,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.11617239433855815,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00029156325502332413,
|
|
"loss": 2.8581,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.11645783019688384,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.00029151826816329365,
|
|
"loss": 2.865,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.11674326605520954,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00029147316517069157,
|
|
"loss": 2.8527,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.11702870191353523,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00029142794608253016,
|
|
"loss": 2.8494,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.11731413777186094,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 0.0002913826109359171,
|
|
"loss": 2.8461,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.11759957363018662,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00029133715976805525,
|
|
"loss": 2.8565,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.11788500948851233,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 0.0002912915926162427,
|
|
"loss": 2.8667,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.11817044534683802,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00029124590951787267,
|
|
"loss": 2.8504,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.11845588120516372,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 0.0002912001105104337,
|
|
"loss": 2.8719,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.11874131706348941,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00029115419563150916,
|
|
"loss": 2.8702,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.11902675292181511,
|
|
"grad_norm": 4.71875,
|
|
"learning_rate": 0.0002911081649187778,
|
|
"loss": 2.8971,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.1193121887801408,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 0.0002910620184100133,
|
|
"loss": 2.9119,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.1195976246384665,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 0.0002910157561430842,
|
|
"loss": 2.8927,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.11988306049679219,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 0.0002909693781559544,
|
|
"loss": 2.861,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.1201684963551179,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 0.0002909228844866824,
|
|
"loss": 2.8826,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.12045393221344358,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 0.0002908762751734219,
|
|
"loss": 2.8495,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.12073936807176928,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 0.0002908295502544213,
|
|
"loss": 2.8707,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.12102480393009497,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00029078270976802393,
|
|
"loss": 2.8647,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.12131023978842068,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 0.00029073575375266806,
|
|
"loss": 2.8505,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.12159567564674637,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002906886822468867,
|
|
"loss": 2.8821,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.12188111150507207,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 0.0002906414952893075,
|
|
"loss": 2.8788,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.12216654736339776,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 0.00029059419291865314,
|
|
"loss": 2.8715,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.12245198322172346,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 0.0002905467751737407,
|
|
"loss": 2.846,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.12273741908004915,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 0.00029049924209348214,
|
|
"loss": 2.856,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.12302285493837485,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.000290451593716884,
|
|
"loss": 2.8646,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.12330829079670054,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.00029040383008304744,
|
|
"loss": 2.8408,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.12359372665502623,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 0.00029035595123116817,
|
|
"loss": 2.8501,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.12387916251335193,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0002903079572005365,
|
|
"loss": 2.8384,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.12416459837167762,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 0.00029025984803053735,
|
|
"loss": 2.8436,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.12445003423000332,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 0.0002902116237606498,
|
|
"loss": 2.8543,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.12473547008832901,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 0.0002901632844304478,
|
|
"loss": 2.8469,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.12502090594665471,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 0.0002901148300795994,
|
|
"loss": 2.8636,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.1253063418049804,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 0.0002900662607478672,
|
|
"loss": 2.8424,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.1255917776633061,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 0.00029001757647510815,
|
|
"loss": 2.8493,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.1258772135216318,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.0002899687773012734,
|
|
"loss": 2.8214,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.1261626493799575,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0002899198632664086,
|
|
"loss": 2.8492,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.1264480852382832,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 0.0002898708344106533,
|
|
"loss": 2.8111,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.12673352109660888,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 0.0002898216907742418,
|
|
"loss": 2.8513,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.1270189569549346,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0002897724323975021,
|
|
"loss": 2.8602,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.12730439281326028,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0002897230593208567,
|
|
"loss": 2.8462,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.12758982867158597,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00028967357158482196,
|
|
"loss": 2.8422,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.12787526452991166,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.00028962396923000846,
|
|
"loss": 2.8382,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.12816070038823738,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 0.0002895742522971209,
|
|
"loss": 2.8544,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.12844613624656306,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 0.0002895244208269579,
|
|
"loss": 2.8542,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.12873157210488875,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002894744748604121,
|
|
"loss": 2.8417,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.12901700796321444,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002894244144384701,
|
|
"loss": 2.8588,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.12930244382154016,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.0002893742396022125,
|
|
"loss": 2.8388,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.12958787967986585,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002893239503928137,
|
|
"loss": 2.8559,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.12987331553819154,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 0.00028927354685154185,
|
|
"loss": 2.8341,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.13015875139651722,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 0.0002892230290197592,
|
|
"loss": 2.8267,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.13044418725484294,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0002891723969389216,
|
|
"loss": 2.8497,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.13072962311316863,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0002891216506505787,
|
|
"loss": 2.8252,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.13101505897149432,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 0.0002890707901963738,
|
|
"loss": 2.8563,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.13130049482982,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 0.00028901981561804403,
|
|
"loss": 2.861,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.13158593068814572,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002889687269574201,
|
|
"loss": 2.8336,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.1318713665464714,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 0.0002889175242564263,
|
|
"loss": 2.8575,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.1321568024047971,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.00028886620755708045,
|
|
"loss": 2.8301,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.1324422382631228,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0002888147769014942,
|
|
"loss": 2.8299,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.1327276741214485,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 0.0002887632323318723,
|
|
"loss": 2.8261,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.1330131099797742,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0002887115738905134,
|
|
"loss": 2.8398,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.13329854583809989,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.0002886598016198093,
|
|
"loss": 2.8414,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.13358398169642557,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.00028860791556224524,
|
|
"loss": 2.8286,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.1338694175547513,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.00028855591576040004,
|
|
"loss": 2.8641,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.13415485341307698,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0002885038022569457,
|
|
"loss": 2.8478,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.13444028927140267,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0002884515750946474,
|
|
"loss": 2.8215,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.13472572512972836,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0002883992343163639,
|
|
"loss": 2.8004,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.13501116098805407,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.00028834677996504696,
|
|
"loss": 2.8395,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.13529659684637976,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.00028829421208374166,
|
|
"loss": 2.8313,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.13558203270470545,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.0002882415307155862,
|
|
"loss": 2.841,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.13586746856303114,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.00028818873590381183,
|
|
"loss": 2.8614,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.13615290442135686,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.000288135827691743,
|
|
"loss": 2.8482,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.13643834027968255,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002880828061227973,
|
|
"loss": 2.8532,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.13672377613800824,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0002880296712404851,
|
|
"loss": 2.8337,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.13700921199633392,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0002879764230884099,
|
|
"loss": 2.8183,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.13729464785465964,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.00028792306171026823,
|
|
"loss": 2.8161,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.13758008371298533,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.00028786958714984936,
|
|
"loss": 2.8174,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.13786551957131102,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0002878159994510356,
|
|
"loss": 2.8075,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.1381509554296367,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.00028776229865780205,
|
|
"loss": 2.8157,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.13843639128796242,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0002877084848142165,
|
|
"loss": 2.8291,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.1387218271462881,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002876545579644396,
|
|
"loss": 2.8247,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.1390072630046138,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0002876005181527249,
|
|
"loss": 2.8366,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.1392926988629395,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0002875463654234183,
|
|
"loss": 2.8679,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.1395781347212652,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0002874920998209587,
|
|
"loss": 2.8432,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.1398635705795909,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00028743772138987745,
|
|
"loss": 2.8366,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.14014900643791658,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0002873832301747985,
|
|
"loss": 2.8279,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.14043444229624227,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00028732862622043835,
|
|
"loss": 2.7933,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.140719878154568,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.000287273909571606,
|
|
"loss": 2.8563,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.14100531401289368,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.00028721908027320314,
|
|
"loss": 2.858,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.14129074987121937,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.00028716413837022355,
|
|
"loss": 2.7946,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.14157618572954506,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0002871090839077537,
|
|
"loss": 2.7874,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.14186162158787077,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0002870539169309723,
|
|
"loss": 2.8255,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.14214705744619646,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0002869986374851504,
|
|
"loss": 2.8218,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.14243249330452215,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.00028694324561565136,
|
|
"loss": 2.8197,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.14271792916284784,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00028688774136793085,
|
|
"loss": 2.8208,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.14271792916284784,
|
|
"eval_loss": 2.6708414554595947,
|
|
"eval_runtime": 6008.9725,
|
|
"eval_samples_per_second": 10.698,
|
|
"eval_steps_per_second": 10.698,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.14300336502117356,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.00028683212478753663,
|
|
"loss": 2.8263,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.14328880087949925,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00028677639592010874,
|
|
"loss": 2.8395,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.14357423673782493,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00028672055481137937,
|
|
"loss": 2.815,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.14385967259615062,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0002866646015071728,
|
|
"loss": 2.8157,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.1441451084544763,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0002866085360534053,
|
|
"loss": 2.8449,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.14443054431280203,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00028655235849608533,
|
|
"loss": 2.7893,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.14471598017112772,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.00028649606888131327,
|
|
"loss": 2.8099,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.1450014160294534,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00028643966725528134,
|
|
"loss": 2.8032,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.1452868518877791,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 0.0002863831536642739,
|
|
"loss": 2.8453,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.1455722877461048,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0002863265281546669,
|
|
"loss": 2.7995,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.1458577236044305,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002862697907729285,
|
|
"loss": 2.8297,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.1461431594627562,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00028621294156561843,
|
|
"loss": 2.7948,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.14642859532108188,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002861559805793881,
|
|
"loss": 2.8182,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.1467140311794076,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0002860989078609809,
|
|
"loss": 2.8126,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.14699946703773328,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00028604172345723174,
|
|
"loss": 2.8018,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.14728490289605897,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.00028598442741506724,
|
|
"loss": 2.8455,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.14757033875438466,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0002859270197815056,
|
|
"loss": 2.82,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.14785577461271038,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002858695006036566,
|
|
"loss": 2.8428,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.14814121047103607,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.0002858118699287216,
|
|
"loss": 2.8128,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.14842664632936176,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.00028575412780399345,
|
|
"loss": 2.8563,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.14871208218768744,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 0.00028569627427685627,
|
|
"loss": 2.8428,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.14899751804601316,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.000285638309394786,
|
|
"loss": 2.8274,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.14928295390433885,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 0.0002855802332053496,
|
|
"loss": 2.8169,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.14956838976266454,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 0.00028552204575620543,
|
|
"loss": 2.828,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.14985382562099023,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0002854637470951033,
|
|
"loss": 2.8265,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.15013926147931594,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.00028540533726988414,
|
|
"loss": 2.853,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.15042469733764163,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.00028534681632848025,
|
|
"loss": 2.8193,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.15071013319596732,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0002852881843189149,
|
|
"loss": 2.8112,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.150995569054293,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.0002852294412893027,
|
|
"loss": 2.8376,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.15128100491261873,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.00028517058728784933,
|
|
"loss": 2.8126,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.15156644077094442,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0002851116223628514,
|
|
"loss": 2.8375,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.1518518766292701,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00028505254656269673,
|
|
"loss": 2.8186,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.1521373124875958,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.00028499335993586403,
|
|
"loss": 2.8437,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.1524227483459215,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0002849340625309229,
|
|
"loss": 2.7927,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.1527081842042472,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.000284874654396534,
|
|
"loss": 2.8123,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.1529936200625729,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0002848151355814487,
|
|
"loss": 2.8459,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.15327905592089858,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 0.0002847555061345093,
|
|
"loss": 2.8225,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.1535644917792243,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0002846957661046488,
|
|
"loss": 2.8028,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.15384992763754998,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002846359155408911,
|
|
"loss": 2.8167,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.15413536349587567,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002845759544923507,
|
|
"loss": 2.83,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.15442079935420136,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.00028451588300823266,
|
|
"loss": 2.8233,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 0.15470623521252708,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002844557011378328,
|
|
"loss": 2.8076,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 0.15499167107085277,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.00028439540893053766,
|
|
"loss": 2.8473,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 0.15527710692917845,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.000284335006435824,
|
|
"loss": 2.8175,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 0.15556254278750414,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00028427449370325937,
|
|
"loss": 2.8237,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.15584797864582986,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0002842138707825015,
|
|
"loss": 2.8176,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 0.15613341450415555,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0002841531377232989,
|
|
"loss": 2.8295,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 0.15641885036248124,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0002840922945754901,
|
|
"loss": 2.8035,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 0.15670428622080693,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 0.00028403134138900427,
|
|
"loss": 2.8217,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 0.15698972207913264,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0002839702782138607,
|
|
"loss": 2.8093,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.15727515793745833,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00028390910510016896,
|
|
"loss": 2.8026,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 0.15756059379578402,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.00028384782209812893,
|
|
"loss": 2.8124,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 0.1578460296541097,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0002837864292580305,
|
|
"loss": 2.8342,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 0.15813146551243543,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00028372492663025393,
|
|
"loss": 2.7897,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 0.15841690137076112,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002836633142652693,
|
|
"loss": 2.8149,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.1587023372290868,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00028360159221363704,
|
|
"loss": 2.8298,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 0.1589877730874125,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.00028353976052600727,
|
|
"loss": 2.8108,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 0.1592732089457382,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002834778192531204,
|
|
"loss": 2.7943,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 0.1595586448040639,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.00028341576844580647,
|
|
"loss": 2.8394,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 0.1598440806623896,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.00028335360815498565,
|
|
"loss": 2.8056,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.16012951652071528,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00028329133843166786,
|
|
"loss": 2.8123,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 0.160414952379041,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0002832289593269527,
|
|
"loss": 2.8239,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 0.16070038823736668,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00028316647089202975,
|
|
"loss": 2.8298,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 0.16098582409569237,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.0002831038731781782,
|
|
"loss": 2.839,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 0.16127125995401806,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.00028304116623676685,
|
|
"loss": 2.8498,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.16155669581234378,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0002829783501192542,
|
|
"loss": 2.8228,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 0.16184213167066946,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0002829154248771885,
|
|
"loss": 2.8171,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 0.16212756752899515,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00028285239056220724,
|
|
"loss": 2.7826,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 0.16241300338732084,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0002827892472260376,
|
|
"loss": 2.8087,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 0.16269843924564656,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00028272599492049625,
|
|
"loss": 2.7997,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.16298387510397225,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00028266263369748916,
|
|
"loss": 2.8093,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 0.16326931096229794,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0002825991636090118,
|
|
"loss": 2.7765,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 0.16355474682062363,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0002825355847071489,
|
|
"loss": 2.8033,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 0.16384018267894934,
|
|
"grad_norm": 50.75,
|
|
"learning_rate": 0.00028247189704407456,
|
|
"loss": 2.8378,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 0.16412561853727503,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.000282408100672052,
|
|
"loss": 2.8366,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.16441105439560072,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.0002823441956434338,
|
|
"loss": 2.8565,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 0.1646964902539264,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 0.0002822801820106617,
|
|
"loss": 2.8216,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 0.1649819261122521,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0002822160598262663,
|
|
"loss": 2.8249,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 0.16526736197057781,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00028215182914286766,
|
|
"loss": 2.8343,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 0.1655527978289035,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0002820874900131746,
|
|
"loss": 2.8027,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.1658382336872292,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00028202304248998506,
|
|
"loss": 2.8204,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 0.16612366954555488,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0002819584866261859,
|
|
"loss": 2.8122,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 0.1664091054038806,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0002818938224747529,
|
|
"loss": 2.816,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 0.16669454126220629,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002818290500887506,
|
|
"loss": 2.8286,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 0.16697997712053197,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002817641695213327,
|
|
"loss": 2.8046,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.16726541297885766,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00028169918082574105,
|
|
"loss": 2.8249,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 0.16755084883718338,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0002816340840553069,
|
|
"loss": 2.8051,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 0.16783628469550907,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00028156887926344975,
|
|
"loss": 2.8328,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 0.16812172055383476,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.00028150356650367796,
|
|
"loss": 2.8087,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 0.16840715641216045,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00028143814582958827,
|
|
"loss": 2.7976,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.16869259227048616,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0002813726172948664,
|
|
"loss": 2.8238,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 0.16897802812881185,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.000281306980953286,
|
|
"loss": 2.8243,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 0.16926346398713754,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.0002812412368587097,
|
|
"loss": 2.8078,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 0.16954889984546323,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0002811753850650883,
|
|
"loss": 2.7899,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 0.16983433570378895,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.000281109425626461,
|
|
"loss": 2.8176,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.17011977156211464,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.00028104335859695543,
|
|
"loss": 2.8235,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 0.17040520742044032,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0002809771840307873,
|
|
"loss": 2.7986,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 0.170690643278766,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002809109019822609,
|
|
"loss": 2.7848,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 0.17097607913709173,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00028084451250576844,
|
|
"loss": 2.7914,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 0.17126151499541742,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00028077801565579033,
|
|
"loss": 2.8036,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.1715469508537431,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0002807114114868953,
|
|
"loss": 2.8006,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 0.1718323867120688,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002806447000537398,
|
|
"loss": 2.7898,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 0.1721178225703945,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 0.00028057788141106865,
|
|
"loss": 2.7905,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 0.1724032584287202,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0002805109556137144,
|
|
"loss": 2.8129,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 0.1726886942870459,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002804439227165977,
|
|
"loss": 2.8151,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.17297413014537158,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00028037678277472697,
|
|
"loss": 2.7888,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 0.1732595660036973,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0002803095358431985,
|
|
"loss": 2.7996,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 0.17354500186202299,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00028024218197719643,
|
|
"loss": 2.7932,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 0.17383043772034867,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0002801747212319926,
|
|
"loss": 2.7972,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 0.17411587357867436,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0002801071536629466,
|
|
"loss": 2.8141,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.17440130943700008,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002800394793255056,
|
|
"loss": 2.8014,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 0.17468674529532577,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00027997169827520454,
|
|
"loss": 2.8036,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 0.17497218115365146,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002799038105676658,
|
|
"loss": 2.8235,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 0.17525761701197715,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00027983581625859927,
|
|
"loss": 2.7849,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 0.17554305287030286,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002797677154038024,
|
|
"loss": 2.7964,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.17582848872862855,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00027969950805916,
|
|
"loss": 2.8027,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 0.17611392458695424,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0002796311942806444,
|
|
"loss": 2.783,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 0.17639936044527993,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00027956277412431507,
|
|
"loss": 2.7981,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 0.17668479630360565,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.00027949424764631896,
|
|
"loss": 2.8145,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 0.17697023216193133,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0002794256149028902,
|
|
"loss": 2.83,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.17725566802025702,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.00027935687595035015,
|
|
"loss": 2.811,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 0.1775411038785827,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00027928803084510716,
|
|
"loss": 2.8016,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 0.17782653973690843,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.000279219079643657,
|
|
"loss": 2.7996,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 0.17811197559523412,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0002791500224025822,
|
|
"loss": 2.817,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 0.1783974114535598,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.00027908085917855243,
|
|
"loss": 2.8096,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.1786828473118855,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0002790115900283245,
|
|
"loss": 2.7852,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 0.1789682831702112,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00027894221500874184,
|
|
"loss": 2.8088,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 0.1792537190285369,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0002788727341767349,
|
|
"loss": 2.767,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 0.1795391548868626,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0002788031475893211,
|
|
"loss": 2.7955,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 0.17982459074518828,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00027873345530360436,
|
|
"loss": 2.8143,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.180110026603514,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.00027866365737677564,
|
|
"loss": 2.777,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 0.18039546246183968,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00027859375386611227,
|
|
"loss": 2.8,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 0.18068089832016537,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0002785237448289786,
|
|
"loss": 2.7796,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 0.18096633417849106,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.00027845363032282514,
|
|
"loss": 2.8042,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 0.18125177003681678,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0002783834104051893,
|
|
"loss": 2.8206,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.18153720589514247,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00027831308513369494,
|
|
"loss": 2.812,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 0.18182264175346816,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00027824265456605224,
|
|
"loss": 2.7804,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 0.18210807761179384,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00027817211876005786,
|
|
"loss": 2.7941,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 0.18239351347011956,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0002781014777735948,
|
|
"loss": 2.7842,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 0.18267894932844525,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00027803073166463244,
|
|
"loss": 2.7955,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.18296438518677094,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00027795988049122625,
|
|
"loss": 2.7597,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 0.18324982104509663,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002778889243115183,
|
|
"loss": 2.811,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 0.18353525690342234,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00027781786318373627,
|
|
"loss": 2.7948,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 0.18382069276174803,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002777466971661945,
|
|
"loss": 2.7811,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 0.18410612862007372,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00027767542631729306,
|
|
"loss": 2.7838,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.1843915644783994,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002776040506955182,
|
|
"loss": 2.7958,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 0.18467700033672513,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002775325703594421,
|
|
"loss": 2.7798,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 0.18496243619505082,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002774609853677229,
|
|
"loss": 2.7891,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 0.1852478720533765,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0002773892957791045,
|
|
"loss": 2.8067,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 0.1855333079117022,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0002773175016524169,
|
|
"loss": 2.7842,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.18581874377002788,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.00027724560304657553,
|
|
"loss": 2.7706,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 0.1861041796283536,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002771736000205819,
|
|
"loss": 2.7912,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 0.1863896154866793,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.000277101492633523,
|
|
"loss": 2.7859,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 0.18667505134500498,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0002770292809445715,
|
|
"loss": 2.7637,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 0.18696048720333067,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0002769569650129857,
|
|
"loss": 2.7884,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.18724592306165638,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00027688454489810946,
|
|
"loss": 2.7858,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 0.18753135891998207,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.00027681202065937203,
|
|
"loss": 2.7677,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 0.18781679477830776,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00027673939235628827,
|
|
"loss": 2.7883,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 0.18810223063663345,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00027666666004845823,
|
|
"loss": 2.7624,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 0.18838766649495917,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0002765938237955674,
|
|
"loss": 2.8089,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.18867310235328486,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0002765208836573868,
|
|
"loss": 2.7795,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 0.18895853821161054,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0002764478396937722,
|
|
"loss": 2.7722,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 0.18924397406993623,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00027637469196466506,
|
|
"loss": 2.7653,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 0.18952940992826195,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00027630144053009174,
|
|
"loss": 2.7717,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 0.18981484578658764,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0002762280854501638,
|
|
"loss": 2.762,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.19010028164491333,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00027615462678507775,
|
|
"loss": 2.7989,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 0.19038571750323902,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00027608106459511513,
|
|
"loss": 2.7851,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 0.19067115336156473,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0002760073989406425,
|
|
"loss": 2.7428,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 0.19095658921989042,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.00027593362988211133,
|
|
"loss": 2.7699,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 0.1912420250782161,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00027585975748005783,
|
|
"loss": 2.7797,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.1915274609365418,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0002757857817951032,
|
|
"loss": 2.7656,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 0.19181289679486752,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00027571170288795323,
|
|
"loss": 2.7674,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 0.1920983326531932,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0002756375208193985,
|
|
"loss": 2.7576,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 0.1923837685115189,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002755632356503141,
|
|
"loss": 2.7844,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 0.19266920436984458,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00027548884744166,
|
|
"loss": 2.7817,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.1929546402281703,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0002754143562544805,
|
|
"loss": 2.7589,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 0.193240076086496,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0002753397621499045,
|
|
"loss": 2.7841,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 0.19352551194482168,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00027526506518914533,
|
|
"loss": 2.7945,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 0.19381094780314737,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00027519026543350067,
|
|
"loss": 2.7896,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 0.19409638366147308,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0002751153629443528,
|
|
"loss": 2.7839,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.19438181951979877,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0002750403577831679,
|
|
"loss": 2.7684,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 0.19466725537812446,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00027496525001149676,
|
|
"loss": 2.7598,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 0.19495269123645015,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00027489003969097416,
|
|
"loss": 2.7652,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 0.19523812709477587,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00027481472688331923,
|
|
"loss": 2.7909,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 0.19552356295310155,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00027473931165033496,
|
|
"loss": 2.7535,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.19580899881142724,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00027466379405390864,
|
|
"loss": 2.763,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 0.19609443466975293,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0002745881741560113,
|
|
"loss": 2.8034,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 0.19637987052807865,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0002745124520186981,
|
|
"loss": 2.7538,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 0.19666530638640434,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0002744366277041082,
|
|
"loss": 2.7494,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 0.19695074224473003,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002743607012744643,
|
|
"loss": 2.7578,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.19723617810305571,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00027428467279207316,
|
|
"loss": 2.7845,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 0.19752161396138143,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00027420854231932515,
|
|
"loss": 2.7833,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 0.19780704981970712,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0002741323099186944,
|
|
"loss": 2.7835,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 0.1980924856780328,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00027405597565273866,
|
|
"loss": 2.7663,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 0.1983779215363585,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00027397953958409923,
|
|
"loss": 2.7737,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.19866335739468421,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00027390300177550106,
|
|
"loss": 2.7501,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 0.1989487932530099,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0002738263622897525,
|
|
"loss": 2.7862,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 0.1992342291113356,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0002737496211897453,
|
|
"loss": 2.7629,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 0.19951966496966128,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0002736727785384548,
|
|
"loss": 2.7394,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 0.199805100827987,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00027359583439893944,
|
|
"loss": 2.7867,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.2000905366863127,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00027351878883434105,
|
|
"loss": 2.7564,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 0.20037597254463838,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002734416419078847,
|
|
"loss": 2.7623,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 0.20066140840296406,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00027336439368287857,
|
|
"loss": 2.7678,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 0.20094684426128978,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0002732870442227141,
|
|
"loss": 2.7727,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 0.20123228011961547,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00027320959359086565,
|
|
"loss": 2.7808,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.20151771597794116,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002731320418508907,
|
|
"loss": 2.7509,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 0.20180315183626685,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002730543890664297,
|
|
"loss": 2.7839,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 0.20208858769459256,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002729766353012059,
|
|
"loss": 2.7573,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 0.20237402355291825,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0002728987806190257,
|
|
"loss": 2.7872,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 0.20265945941124394,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.00027282082508377795,
|
|
"loss": 2.7727,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.20294489526956963,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0002727427687594345,
|
|
"loss": 2.7632,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 0.20323033112789535,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00027266461171004985,
|
|
"loss": 2.7631,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 0.20351576698622104,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00027258635399976115,
|
|
"loss": 2.768,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 0.20380120284454672,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00027250799569278816,
|
|
"loss": 2.7666,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 0.2040866387028724,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00027242953685343327,
|
|
"loss": 2.7794,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.20437207456119813,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0002723509775460811,
|
|
"loss": 2.7449,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 0.20465751041952382,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00027227231783519913,
|
|
"loss": 2.7529,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 0.2049429462778495,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002721935577853368,
|
|
"loss": 2.7785,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 0.2052283821361752,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00027211469746112624,
|
|
"loss": 2.7653,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 0.2055138179945009,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00027203573692728174,
|
|
"loss": 2.7664,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.2057992538528266,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002719566762485997,
|
|
"loss": 2.7677,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 0.2060846897111523,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0002718775154899589,
|
|
"loss": 2.7667,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 0.20637012556947798,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0002717982547163201,
|
|
"loss": 2.7674,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 0.20665556142780367,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0002717188939927262,
|
|
"loss": 2.7747,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 0.20694099728612939,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00027163943338430214,
|
|
"loss": 2.7299,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.20722643314445507,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0002715598729562548,
|
|
"loss": 2.7672,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 0.20751186900278076,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.000271480212773873,
|
|
"loss": 2.7847,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 0.20779730486110645,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0002714004529025273,
|
|
"loss": 2.7886,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 0.20808274071943217,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00027132059340767025,
|
|
"loss": 2.7586,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 0.20836817657775786,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00027124063435483603,
|
|
"loss": 2.779,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.20865361243608355,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002711605758096406,
|
|
"loss": 2.7593,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 0.20893904829440924,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0002710804178377814,
|
|
"loss": 2.7684,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 0.20922448415273495,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0002710001605050377,
|
|
"loss": 2.7542,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 0.20950992001106064,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00027091980387727014,
|
|
"loss": 2.7644,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 0.20979535586938633,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00027083934802042084,
|
|
"loss": 2.7772,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.21008079172771202,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0002707587930005136,
|
|
"loss": 2.7419,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 0.21036622758603774,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0002706781388836531,
|
|
"loss": 2.7889,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 0.21065166344436342,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00027059738573602583,
|
|
"loss": 2.768,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 0.2109370993026891,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00027051653362389935,
|
|
"loss": 2.8016,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 0.2112225351610148,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0002704355826136224,
|
|
"loss": 2.758,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.21150797101934052,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0002703545327716249,
|
|
"loss": 2.7658,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 0.2117934068776662,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00027027338416441785,
|
|
"loss": 2.7693,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 0.2120788427359919,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0002701921368585934,
|
|
"loss": 2.7948,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 0.21236427859431758,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0002701107909208246,
|
|
"loss": 2.7832,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 0.2126497144526433,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00027002934641786545,
|
|
"loss": 2.7851,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.212935150310969,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00026994780341655093,
|
|
"loss": 2.7461,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 0.21322058616929468,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0002698661619837967,
|
|
"loss": 2.7511,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 0.21350602202762037,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002697844221865993,
|
|
"loss": 2.7562,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 0.21379145788594608,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00026970258409203594,
|
|
"loss": 2.729,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 0.21407689374427177,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00026962064776726445,
|
|
"loss": 2.7467,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.21407689374427177,
|
|
"eval_loss": 2.6212494373321533,
|
|
"eval_runtime": 5936.0633,
|
|
"eval_samples_per_second": 10.83,
|
|
"eval_steps_per_second": 10.83,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.21436232960259746,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002695386132795234,
|
|
"loss": 2.7875,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 0.21464776546092315,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0002694564806961319,
|
|
"loss": 2.7879,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 0.21493320131924887,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00026937425008448937,
|
|
"loss": 2.7634,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 0.21521863717757456,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002692919215120759,
|
|
"loss": 2.7563,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 0.21550407303590025,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0002692094950464519,
|
|
"loss": 2.7836,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.21578950889422593,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000269126970755258,
|
|
"loss": 2.7366,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 0.21607494475255165,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00026904434870621524,
|
|
"loss": 2.7813,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 0.21636038061087734,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00026896162896712476,
|
|
"loss": 2.7718,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 0.21664581646920303,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00026887881160586813,
|
|
"loss": 2.7536,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 0.21693125232752872,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002687958966904067,
|
|
"loss": 2.7619,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.21721668818585443,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00026871288428878206,
|
|
"loss": 2.7672,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 0.21750212404418012,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0002686297744691158,
|
|
"loss": 2.7571,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 0.2177875599025058,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0002685465672996093,
|
|
"loss": 2.7652,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 0.2180729957608315,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000268463262848544,
|
|
"loss": 2.7748,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 0.21835843161915722,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0002683798611842812,
|
|
"loss": 2.7583,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.2186438674774829,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0002682963623752617,
|
|
"loss": 2.7586,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 0.2189293033358086,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0002682127664900064,
|
|
"loss": 2.7338,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 0.21921473919413428,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0002681290735971156,
|
|
"loss": 2.752,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 0.21950017505246,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0002680452837652691,
|
|
"loss": 2.7629,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 0.2197856109107857,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002679613970632267,
|
|
"loss": 2.7652,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.22007104676911138,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0002678774135598272,
|
|
"loss": 2.7537,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 0.22035648262743707,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00026779333332398923,
|
|
"loss": 2.7141,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 0.22064191848576278,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0002677091564247105,
|
|
"loss": 2.757,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 0.22092735434408847,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0002676248829310682,
|
|
"loss": 2.7454,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 0.22121279020241416,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0002675405129122188,
|
|
"loss": 2.7545,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.22149822606073985,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0002674560464373979,
|
|
"loss": 2.7331,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 0.22178366191906557,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0002673714835759202,
|
|
"loss": 2.7603,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 0.22206909777739126,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00026728682439717974,
|
|
"loss": 2.7551,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 0.22235453363571694,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0002672020689706493,
|
|
"loss": 2.7814,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 0.22263996949404263,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00026711721736588103,
|
|
"loss": 2.7604,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.22292540535236835,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00026703226965250546,
|
|
"loss": 2.7551,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 0.22321084121069404,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00026694722590023246,
|
|
"loss": 2.7357,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 0.22349627706901973,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00026686208617885055,
|
|
"loss": 2.7532,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 0.22378171292734542,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0002667768505582269,
|
|
"loss": 2.7388,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 0.22406714878567113,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0002666915191083076,
|
|
"loss": 2.7594,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.22435258464399682,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00026660609189911724,
|
|
"loss": 2.7504,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 0.2246380205023225,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00026652056900075885,
|
|
"loss": 2.7631,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 0.2249234563606482,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002664349504834143,
|
|
"loss": 2.7534,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 0.22520889221897392,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00026634923641734374,
|
|
"loss": 2.7584,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 0.2254943280772996,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00026626342687288576,
|
|
"loss": 2.7519,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.2257797639356253,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002661775219204572,
|
|
"loss": 2.7477,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 0.22606519979395098,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0002660915216305534,
|
|
"loss": 2.7484,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 0.22635063565227667,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0002660054260737478,
|
|
"loss": 2.7718,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 0.2266360715106024,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.000265919235320692,
|
|
"loss": 2.7437,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 0.22692150736892808,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00026583294944211583,
|
|
"loss": 2.7564,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.22720694322725377,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00026574656850882706,
|
|
"loss": 2.7322,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 0.22749237908557945,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0002656600925917116,
|
|
"loss": 2.7623,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 0.22777781494390517,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00026557352176173317,
|
|
"loss": 2.7294,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 0.22806325080223086,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.00026548685608993337,
|
|
"loss": 2.7457,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 0.22834868666055655,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0002654000956474318,
|
|
"loss": 2.7512,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.22863412251888224,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002653132405054257,
|
|
"loss": 2.7251,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 0.22891955837720795,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00026522629073519,
|
|
"loss": 2.7645,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 0.22920499423553364,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00026513924640807733,
|
|
"loss": 2.7856,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 0.22949043009385933,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000265052107595518,
|
|
"loss": 2.7234,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 0.22977586595218502,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00026496487436901964,
|
|
"loss": 2.7626,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.23006130181051074,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00026487754680016765,
|
|
"loss": 2.7252,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 0.23034673766883643,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0002647901249606245,
|
|
"loss": 2.7371,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 0.23063217352716212,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00026470260892213034,
|
|
"loss": 2.7533,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 0.2309176093854878,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00026461499875650245,
|
|
"loss": 2.7512,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 0.23120304524381352,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0002645272945356354,
|
|
"loss": 2.7423,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.2314884811021392,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0002644394963315009,
|
|
"loss": 2.7495,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 0.2317739169604649,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00026435160421614784,
|
|
"loss": 2.7378,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 0.2320593528187906,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0002642636182617022,
|
|
"loss": 2.7887,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 0.2323447886771163,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002641755385403669,
|
|
"loss": 2.7452,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 0.232630224535442,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0002640873651244217,
|
|
"loss": 2.7407,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.23291566039376768,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0002639990980862236,
|
|
"loss": 2.7571,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 0.23320109625209337,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00026391073749820607,
|
|
"loss": 2.7219,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 0.2334865321104191,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00026382228343287947,
|
|
"loss": 2.7314,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 0.23377196796874478,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0002637337359628309,
|
|
"loss": 2.7363,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 0.23405740382707046,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00026364509516072415,
|
|
"loss": 2.7455,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.23434283968539615,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.00026355636109929946,
|
|
"loss": 2.7301,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 0.23462827554372187,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0002634675338513738,
|
|
"loss": 2.733,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 0.23491371140204756,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00026337861348984024,
|
|
"loss": 2.7564,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 0.23519914726037325,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00026328960008766884,
|
|
"loss": 2.7489,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 0.23548458311869894,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0002632004937179055,
|
|
"loss": 2.7493,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.23577001897702465,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00026311129445367255,
|
|
"loss": 2.7289,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 0.23605545483535034,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0002630220023681687,
|
|
"loss": 2.7193,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 0.23634089069367603,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0002629326175346687,
|
|
"loss": 2.738,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 0.23662632655200172,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0002628431400265235,
|
|
"loss": 2.7497,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 0.23691176241032744,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00026275356991715986,
|
|
"loss": 2.7239,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.23719719826865313,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0002626639072800809,
|
|
"loss": 2.7372,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 0.23748263412697881,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00026257415218886536,
|
|
"loss": 2.7284,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 0.2377680699853045,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00026248430471716795,
|
|
"loss": 2.7515,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 0.23805350584363022,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0002623943649387194,
|
|
"loss": 2.7412,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 0.2383389417019559,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0002623043329273257,
|
|
"loss": 2.7339,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.2386243775602816,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002622142087568691,
|
|
"loss": 2.7482,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 0.2389098134186073,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00026212399250130706,
|
|
"loss": 2.7411,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 0.239195249276933,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0002620336842346728,
|
|
"loss": 2.7394,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 0.2394806851352587,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0002619432840310749,
|
|
"loss": 2.6938,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 0.23976612099358438,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00026185279196469757,
|
|
"loss": 2.7298,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.24005155685191007,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00026176220810980035,
|
|
"loss": 2.7237,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 0.2403369927102358,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00026167153254071795,
|
|
"loss": 2.742,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 0.24062242856856147,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0002615807653318605,
|
|
"loss": 2.7514,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 0.24090786442688716,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002614899065577133,
|
|
"loss": 2.7606,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 0.24119330028521285,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0002613989562928369,
|
|
"loss": 2.7474,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.24147873614353857,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00026130791461186656,
|
|
"loss": 2.7309,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 0.24176417200186426,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000261216781589513,
|
|
"loss": 2.726,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 0.24204960786018995,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002611255573005617,
|
|
"loss": 2.7471,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 0.24233504371851564,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00026103424181987293,
|
|
"loss": 2.7328,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 0.24262047957684135,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00026094283522238204,
|
|
"loss": 2.755,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.24290591543516704,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00026085133758309883,
|
|
"loss": 2.7581,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 0.24319135129349273,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00026075974897710815,
|
|
"loss": 2.7312,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 0.24347678715181842,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0002606680694795693,
|
|
"loss": 2.7274,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 0.24376222301014414,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0002605762991657163,
|
|
"loss": 2.7208,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 0.24404765886846982,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00026048443811085744,
|
|
"loss": 2.7326,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.2443330947267955,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00026039248639037575,
|
|
"loss": 2.7559,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 0.2446185305851212,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.00026030044407972854,
|
|
"loss": 2.7389,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 0.24490396644344692,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00026020831125444745,
|
|
"loss": 2.7434,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 0.2451894023017726,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0002601160879901384,
|
|
"loss": 2.745,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 0.2454748381600983,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002600237743624816,
|
|
"loss": 2.74,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.24576027401842399,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00025993137044723135,
|
|
"loss": 2.736,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 0.2460457098767497,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0002598388763202161,
|
|
"loss": 2.7447,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 0.2463311457350754,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002597462920573381,
|
|
"loss": 2.7457,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 0.24661658159340108,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.000259653617734574,
|
|
"loss": 2.7256,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 0.24690201745172677,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00025956085342797395,
|
|
"loss": 2.7233,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.24718745331005246,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00025946799921366205,
|
|
"loss": 2.7471,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 0.24747288916837817,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0002593750551678364,
|
|
"loss": 2.7426,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 0.24775832502670386,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00025928202136676855,
|
|
"loss": 2.6968,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 0.24804376088502955,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0002591888978868038,
|
|
"loss": 2.7192,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 0.24832919674335524,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.000259095684804361,
|
|
"loss": 2.7436,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.24861463260168096,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0002590023821959326,
|
|
"loss": 2.7627,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 0.24890006846000665,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00025890899013808455,
|
|
"loss": 2.7603,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 0.24918550431833233,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002588155087074561,
|
|
"loss": 2.7315,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 0.24947094017665802,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00025872193798075985,
|
|
"loss": 2.7302,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 0.24975637603498374,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0002586282780347818,
|
|
"loss": 2.7236,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.25004181189330943,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00025853452894638093,
|
|
"loss": 2.7152,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 0.2503272477516351,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00025844069079248964,
|
|
"loss": 2.7169,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 0.2506126836099608,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00025834676365011326,
|
|
"loss": 2.7202,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 0.2508981194682865,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00025825274759633016,
|
|
"loss": 2.7239,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 0.2511835553266122,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002581586427082918,
|
|
"loss": 2.7023,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.25146899118493793,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0002580644490632222,
|
|
"loss": 2.7203,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 0.2517544270432636,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0002579701667384187,
|
|
"loss": 2.7288,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 0.2520398629015893,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00025787579581125107,
|
|
"loss": 2.7284,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 0.252325298759915,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00025778133635916183,
|
|
"loss": 2.7377,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 0.2526107346182407,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0002576867884596663,
|
|
"loss": 2.7267,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.2528961704765664,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00025759215219035213,
|
|
"loss": 2.723,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 0.25318160633489206,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00025749742762887977,
|
|
"loss": 2.7178,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 0.25346704219321775,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00025740261485298195,
|
|
"loss": 2.7387,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 0.2537524780515435,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0002573077139404638,
|
|
"loss": 2.7513,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 0.2540379139098692,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0002572127249692028,
|
|
"loss": 2.7288,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.2543233497681949,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00025711764801714874,
|
|
"loss": 2.7322,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 0.25460878562652056,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00025702248316232355,
|
|
"loss": 2.7598,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 0.25489422148484625,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0002569272304828213,
|
|
"loss": 2.7304,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 0.25517965734317194,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00025683189005680827,
|
|
"loss": 2.7288,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 0.25546509320149763,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0002567364619625224,
|
|
"loss": 2.753,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.2557505290598233,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00025664094627827393,
|
|
"loss": 2.7233,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 0.25603596491814906,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00025654534308244484,
|
|
"loss": 2.731,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 0.25632140077647475,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0002564496524534888,
|
|
"loss": 2.7177,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 0.25660683663480044,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00025635387446993154,
|
|
"loss": 2.7327,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 0.25689227249312613,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0002562580092103702,
|
|
"loss": 2.7251,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.2571777083514518,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00025616205675347355,
|
|
"loss": 2.7005,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 0.2574631442097775,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00025606601717798207,
|
|
"loss": 2.7263,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 0.2577485800681032,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002559698905627077,
|
|
"loss": 2.6863,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 0.2580340159264289,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00025587367698653367,
|
|
"loss": 2.718,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 0.25831945178475463,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002557773765284148,
|
|
"loss": 2.7263,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.2586048876430803,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002556809892673769,
|
|
"loss": 2.7485,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 0.258890323501406,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0002555845152825173,
|
|
"loss": 2.6922,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 0.2591757593597317,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00025548795465300426,
|
|
"loss": 2.7269,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 0.2594611952180574,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0002553913074580774,
|
|
"loss": 2.7466,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 0.25974663107638307,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00025529457377704713,
|
|
"loss": 2.728,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.26003206693470876,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002551977536892951,
|
|
"loss": 2.7171,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 0.26031750279303445,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002551008472742735,
|
|
"loss": 2.7028,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 0.2606029386513602,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00025500385461150565,
|
|
"loss": 2.7107,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 0.2608883745096859,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002549067757805856,
|
|
"loss": 2.7452,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 0.26117381036801157,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00025480961086117815,
|
|
"loss": 2.7045,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.26145924622633726,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002547123599330185,
|
|
"loss": 2.72,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 0.26174468208466295,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00025461502307591274,
|
|
"loss": 2.7136,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 0.26203011794298864,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0002545176003697372,
|
|
"loss": 2.7097,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 0.2623155538013143,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000254420091894439,
|
|
"loss": 2.7218,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 0.26260098965964,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002543224977300352,
|
|
"loss": 2.6923,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.26288642551796576,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0002542248179566137,
|
|
"loss": 2.735,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 0.26317186137629145,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0002541270526543321,
|
|
"loss": 2.7211,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 0.26345729723461714,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00025402920190341864,
|
|
"loss": 2.73,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 0.2637427330929428,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0002539312657841714,
|
|
"loss": 2.7038,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 0.2640281689512685,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002538332443769587,
|
|
"loss": 2.7209,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.2643136048095942,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.0002537351377622187,
|
|
"loss": 2.7053,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 0.2645990406679199,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.00025363694602045957,
|
|
"loss": 2.7378,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 0.2648844765262456,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0002535386692322593,
|
|
"loss": 2.7339,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 0.2651699123845713,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0002534403074782657,
|
|
"loss": 2.7474,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 0.265455348242897,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00025334186083919623,
|
|
"loss": 2.7283,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.2657407841012227,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00025324332939583813,
|
|
"loss": 2.7195,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 0.2660262199595484,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002531447132290482,
|
|
"loss": 2.7133,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 0.2663116558178741,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00025304601241975266,
|
|
"loss": 2.737,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 0.26659709167619977,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0002529472270489473,
|
|
"loss": 2.7129,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 0.26688252753452546,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0002528483571976973,
|
|
"loss": 2.7195,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.26716796339285115,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00025274940294713706,
|
|
"loss": 2.694,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 0.2674533992511769,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00025265036437847036,
|
|
"loss": 2.739,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 0.2677388351095026,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0002525512415729701,
|
|
"loss": 2.7,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 0.26802427096782827,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00025245203461197834,
|
|
"loss": 2.7329,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 0.26830970682615396,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002523527435769062,
|
|
"loss": 2.7321,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.26859514268447965,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0002522533685492338,
|
|
"loss": 2.7274,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 0.26888057854280534,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0002521539096105101,
|
|
"loss": 2.719,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 0.269166014401131,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00025205436684235313,
|
|
"loss": 2.7257,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 0.2694514502594567,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002519547403264494,
|
|
"loss": 2.7126,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 0.2697368861177824,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00025185503014455443,
|
|
"loss": 2.7297,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.27002232197610815,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00025175523637849224,
|
|
"loss": 2.7324,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 0.27030775783443384,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0002516553591101555,
|
|
"loss": 2.7367,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 0.2705931936927595,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00025155539842150535,
|
|
"loss": 2.6977,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 0.2708786295510852,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002514553543945715,
|
|
"loss": 2.6864,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 0.2711640654094109,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00025135522711145197,
|
|
"loss": 2.7111,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.2714495012677366,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000251255016654313,
|
|
"loss": 2.7124,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 0.2717349371260623,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0002511547231053893,
|
|
"loss": 2.6945,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 0.27202037298438797,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00025105434654698356,
|
|
"loss": 2.7364,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 0.2723058088427137,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00025095388706146676,
|
|
"loss": 2.7086,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 0.2725912447010394,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00025085334473127786,
|
|
"loss": 2.7037,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.2728766805593651,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0002507527196389238,
|
|
"loss": 2.7295,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 0.2731621164176908,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0002506520118669794,
|
|
"loss": 2.6829,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 0.27344755227601647,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002505512214980873,
|
|
"loss": 2.6869,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 0.27373298813434216,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0002504503486149581,
|
|
"loss": 2.6919,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 0.27401842399266785,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00025034939330037,
|
|
"loss": 2.6851,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.27430385985099354,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002502483556371688,
|
|
"loss": 2.7326,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 0.2745892957093193,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00025014723570826794,
|
|
"loss": 2.7369,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 0.27487473156764497,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00025004603359664833,
|
|
"loss": 2.7398,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 0.27516016742597066,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0002499447493853583,
|
|
"loss": 2.7145,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 0.27544560328429635,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00024984338315751366,
|
|
"loss": 2.733,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.27573103914262204,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00024974193499629745,
|
|
"loss": 2.707,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 0.2760164750009477,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00024964040498496,
|
|
"loss": 2.7282,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 0.2763019108592734,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00024953879320681853,
|
|
"loss": 2.7208,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 0.2765873467175991,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00024943709974525793,
|
|
"loss": 2.7021,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 0.27687278257592485,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00024933532468372955,
|
|
"loss": 2.7056,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.27715821843425054,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00024923346810575193,
|
|
"loss": 2.7342,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 0.2774436542925762,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0002491315300949106,
|
|
"loss": 2.7258,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 0.2777290901509019,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00024902951073485784,
|
|
"loss": 2.7053,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 0.2780145260092276,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00024892741010931264,
|
|
"loss": 2.7111,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 0.2782999618675533,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0002488252283020606,
|
|
"loss": 2.6961,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.278585397725879,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00024872296539695427,
|
|
"loss": 2.7148,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 0.27887083358420467,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00024862062147791233,
|
|
"loss": 2.7192,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 0.2791562694425304,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00024851819662892016,
|
|
"loss": 2.725,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 0.2794417053008561,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002484156909340296,
|
|
"loss": 2.7303,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 0.2797271411591818,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00024831310447735874,
|
|
"loss": 2.6735,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.2800125770175075,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00024821043734309204,
|
|
"loss": 2.6935,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 0.28029801287583317,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0002481076896154799,
|
|
"loss": 2.7103,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 0.28058344873415886,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00024800486137883926,
|
|
"loss": 2.7239,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 0.28086888459248455,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00024790195271755277,
|
|
"loss": 2.7289,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 0.28115432045081024,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0002477989637160694,
|
|
"loss": 2.7095,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.281439756309136,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0002476958944589037,
|
|
"loss": 2.6648,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 0.28172519216746167,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0002475927450306363,
|
|
"loss": 2.666,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 0.28201062802578736,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00024748951551591364,
|
|
"loss": 2.7152,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 0.28229606388411305,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00024738620599944774,
|
|
"loss": 2.7102,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 0.28258149974243874,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002472828165660164,
|
|
"loss": 2.7055,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.2828669356007644,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002471793473004629,
|
|
"loss": 2.7004,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 0.2831523714590901,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0002470757982876961,
|
|
"loss": 2.6998,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 0.2834378073174158,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00024697216961269035,
|
|
"loss": 2.7259,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 0.28372324317574155,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0002468684613604852,
|
|
"loss": 2.6939,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 0.28400867903406724,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00024676467361618563,
|
|
"loss": 2.7005,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.2842941148923929,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00024666080646496187,
|
|
"loss": 2.7153,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 0.2845795507507186,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002465568599920493,
|
|
"loss": 2.7052,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 0.2848649866090443,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0002464528342827482,
|
|
"loss": 2.7191,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 0.28515042246737,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00024634872942242423,
|
|
"loss": 2.7117,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 0.2854358583256957,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0002462445454965077,
|
|
"loss": 2.6923,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.2854358583256957,
|
|
"eval_loss": 2.571556806564331,
|
|
"eval_runtime": 5980.855,
|
|
"eval_samples_per_second": 10.749,
|
|
"eval_steps_per_second": 10.749,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.28572129418402137,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00024614028259049397,
|
|
"loss": 2.6922,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 0.2860067300423471,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002460359407899431,
|
|
"loss": 2.7178,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 0.2862921659006728,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00024593152018048,
|
|
"loss": 2.696,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 0.2865776017589985,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00024582702084779414,
|
|
"loss": 2.6841,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 0.2868630376173242,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00024572244287763976,
|
|
"loss": 2.6869,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.28714847347564987,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0002456177863558354,
|
|
"loss": 2.7185,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 0.28743390933397556,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00024551305136826424,
|
|
"loss": 2.69,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 0.28771934519230125,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00024540823800087386,
|
|
"loss": 2.6593,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 0.28800478105062693,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00024530334633967595,
|
|
"loss": 2.6818,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 0.2882902169089526,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00024519837647074674,
|
|
"loss": 2.7043,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.28857565276727837,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00024509332848022636,
|
|
"loss": 2.7057,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 0.28886108862560406,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0002449882024543193,
|
|
"loss": 2.6855,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 0.28914652448392975,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00024488299847929385,
|
|
"loss": 2.7012,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 0.28943196034225543,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002447777166414825,
|
|
"loss": 2.7178,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 0.2897173962005811,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002446723570272814,
|
|
"loss": 2.6926,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.2900028320589068,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00024456691972315076,
|
|
"loss": 2.6914,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 0.2902882679172325,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0002444614048156144,
|
|
"loss": 2.6794,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 0.2905737037755582,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00024435581239125987,
|
|
"loss": 2.7046,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 0.29085913963388393,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002442501425367382,
|
|
"loss": 2.6849,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 0.2911445754922096,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0002441443953387642,
|
|
"loss": 2.6808,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.2914300113505353,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000244038570884116,
|
|
"loss": 2.6968,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 0.291715447208861,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00024393266925963505,
|
|
"loss": 2.6755,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 0.2920008830671867,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00024382669055222634,
|
|
"loss": 2.7195,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 0.2922863189255124,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000243720634848858,
|
|
"loss": 2.6943,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 0.29257175478383807,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0002436145022365613,
|
|
"loss": 2.7172,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.29285719064216376,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00024350829280243074,
|
|
"loss": 2.7061,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 0.2931426265004895,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00024340200663362368,
|
|
"loss": 2.6897,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 0.2934280623588152,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00024329564381736068,
|
|
"loss": 2.691,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 0.2937134982171409,
|
|
"grad_norm": 0.8828125,
|
|
"learning_rate": 0.000243189204440925,
|
|
"loss": 2.7367,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 0.29399893407546657,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0002430826885916629,
|
|
"loss": 2.6964,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.29428436993379226,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0002429760963569832,
|
|
"loss": 2.7204,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 0.29456980579211794,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00024286942782435753,
|
|
"loss": 2.7186,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 0.29485524165044363,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002427626830813202,
|
|
"loss": 2.6901,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 0.2951406775087693,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002426558622154679,
|
|
"loss": 2.7291,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 0.29542611336709507,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0002425489653144598,
|
|
"loss": 2.717,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.29571154922542076,
|
|
"grad_norm": 0.71484375,
|
|
"learning_rate": 0.0002424419924660176,
|
|
"loss": 2.7074,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 0.29599698508374644,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00024233494375792524,
|
|
"loss": 2.7174,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 0.29628242094207213,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00024222781927802888,
|
|
"loss": 2.6859,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 0.2965678568003978,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0002421206191142369,
|
|
"loss": 2.6916,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 0.2968532926587235,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00024201334335451988,
|
|
"loss": 2.7098,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.2971387285170492,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0002419059920869102,
|
|
"loss": 2.7105,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 0.2974241643753749,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002417985653995024,
|
|
"loss": 2.7329,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 0.29770960023370063,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0002416910633804529,
|
|
"loss": 2.6864,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 0.2979950360920263,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00024158348611797985,
|
|
"loss": 2.6915,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 0.298280471950352,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0002414758337003632,
|
|
"loss": 2.71,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.2985659078086777,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.00024136810621594454,
|
|
"loss": 2.7174,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 0.2988513436670034,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002412603037531271,
|
|
"loss": 2.7106,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 0.2991367795253291,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00024115242640037569,
|
|
"loss": 2.7032,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 0.29942221538365477,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0002410444742462164,
|
|
"loss": 2.6975,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 0.29970765124198046,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00024093644737923682,
|
|
"loss": 2.6909,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.2999930871003062,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00024082834588808592,
|
|
"loss": 2.7097,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 0.3002785229586319,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0002407201698614738,
|
|
"loss": 2.7031,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 0.3005639588169576,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002406119193881718,
|
|
"loss": 2.6834,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 0.30084939467528327,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00024050359455701217,
|
|
"loss": 2.7092,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 0.30113483053360895,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00024039519545688846,
|
|
"loss": 2.6838,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.30142026639193464,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00024028672217675493,
|
|
"loss": 2.7051,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 0.30170570225026033,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00024017817480562686,
|
|
"loss": 2.698,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 0.301991138108586,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00024006955343258032,
|
|
"loss": 2.6918,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 0.30227657396691177,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00023996085814675198,
|
|
"loss": 2.7027,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 0.30256200982523745,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0002398520890373393,
|
|
"loss": 2.6585,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.30284744568356314,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00023974324619360028,
|
|
"loss": 2.7134,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 0.30313288154188883,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00023963432970485333,
|
|
"loss": 2.7017,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 0.3034183174002145,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002395253396604775,
|
|
"loss": 2.7121,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 0.3037037532585402,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00023941627614991205,
|
|
"loss": 2.6666,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 0.3039891891168659,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00023930713926265652,
|
|
"loss": 2.6927,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.3042746249751916,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00023919792908827072,
|
|
"loss": 2.6844,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 0.30456006083351733,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00023908864571637464,
|
|
"loss": 2.6666,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 0.304845496691843,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00023897928923664825,
|
|
"loss": 2.6676,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 0.3051309325501687,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00023886985973883157,
|
|
"loss": 2.7065,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 0.3054163684084944,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00023876035731272444,
|
|
"loss": 2.6579,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.3057018042668201,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00023865078204818676,
|
|
"loss": 2.6919,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 0.3059872401251458,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002385411340351379,
|
|
"loss": 2.6779,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 0.30627267598347147,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00023843141336355725,
|
|
"loss": 2.6798,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 0.30655811184179715,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0002383216201234836,
|
|
"loss": 2.6775,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 0.3068435477001229,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00023821175440501535,
|
|
"loss": 2.693,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.3071289835584486,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00023810181629831042,
|
|
"loss": 2.6807,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 0.3074144194167743,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002379918058935861,
|
|
"loss": 2.6583,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 0.30769985527509996,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00023788172328111903,
|
|
"loss": 2.6784,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 0.30798529113342565,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00023777156855124505,
|
|
"loss": 2.6992,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 0.30827072699175134,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00023766134179435921,
|
|
"loss": 2.7007,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.30855616285007703,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002375510431009157,
|
|
"loss": 2.698,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 0.3088415987084027,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00023744067256142775,
|
|
"loss": 2.6982,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 0.3091270345667284,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00023733023026646744,
|
|
"loss": 2.732,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 0.30941247042505415,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00023721971630666589,
|
|
"loss": 2.7234,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 0.30969790628337984,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00023710913077271286,
|
|
"loss": 2.6996,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.30998334214170553,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00023699847375535698,
|
|
"loss": 2.7038,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 0.3102687780000312,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00023688774534540554,
|
|
"loss": 2.6705,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 0.3105542138583569,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002367769456337243,
|
|
"loss": 2.6632,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 0.3108396497166826,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00023666607471123767,
|
|
"loss": 2.6572,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 0.3111250855750083,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002365551326689283,
|
|
"loss": 2.68,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.311410521433334,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0002364441195978375,
|
|
"loss": 2.6704,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 0.3116959572916597,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0002363330355890646,
|
|
"loss": 2.6514,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 0.3119813931499854,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00023622188073376728,
|
|
"loss": 2.6773,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 0.3122668290083111,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00023611065512316127,
|
|
"loss": 2.6896,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 0.3125522648666368,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00023599935884852045,
|
|
"loss": 2.7068,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.3128377007249625,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00023588799200117662,
|
|
"loss": 2.6837,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 0.31312313658328816,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00023577655467251963,
|
|
"loss": 2.6873,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 0.31340857244161385,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002356650469539969,
|
|
"loss": 2.6891,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 0.31369400829993954,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0002355534689371139,
|
|
"loss": 2.6888,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 0.3139794441582653,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00023544182071343363,
|
|
"loss": 2.6745,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.314264880016591,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00023533010237457674,
|
|
"loss": 2.6668,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 0.31455031587491666,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00023521831401222132,
|
|
"loss": 2.6679,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 0.31483575173324235,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00023510645571810316,
|
|
"loss": 2.693,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 0.31512118759156804,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00023499452758401525,
|
|
"loss": 2.6966,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 0.31540662344989373,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00023488252970180792,
|
|
"loss": 2.6786,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.3156920593082194,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00023477046216338875,
|
|
"loss": 2.6579,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 0.3159774951665451,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0002346583250607225,
|
|
"loss": 2.6717,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 0.31626293102487085,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00023454611848583104,
|
|
"loss": 2.6939,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 0.31654836688319654,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00023443384253079308,
|
|
"loss": 2.658,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 0.31683380274152223,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00023432149728774455,
|
|
"loss": 2.6733,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.3171192385998479,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000234209082848878,
|
|
"loss": 2.6814,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 0.3174046744581736,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00023409659930644287,
|
|
"loss": 2.67,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 0.3176901103164993,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00023398404675274522,
|
|
"loss": 2.6662,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 0.317975546174825,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00023387142528014798,
|
|
"loss": 2.6935,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 0.3182609820331507,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00023375873498107026,
|
|
"loss": 2.6746,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.3185464178914764,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00023364597594798802,
|
|
"loss": 2.6977,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 0.3188318537498021,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0002335331482734333,
|
|
"loss": 2.6889,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 0.3191172896081278,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00023342025204999472,
|
|
"loss": 2.6725,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 0.3194027254664535,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0002333072873703171,
|
|
"loss": 2.669,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 0.3196881613247792,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00023319425432710136,
|
|
"loss": 2.691,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.31997359718310486,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0002330811530131045,
|
|
"loss": 2.6734,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 0.32025903304143055,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002329679835211397,
|
|
"loss": 2.6915,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 0.32054446889975624,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00023285474594407585,
|
|
"loss": 2.6766,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 0.320829904758082,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000232741440374838,
|
|
"loss": 2.6737,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 0.3211153406164077,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00023262806690640673,
|
|
"loss": 2.6618,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.32140077647473336,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00023251462563181853,
|
|
"loss": 2.7,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 0.32168621233305905,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00023240111664416544,
|
|
"loss": 2.6777,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 0.32197164819138474,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0002322875400365951,
|
|
"loss": 2.6749,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 0.32225708404971043,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00023217389590231058,
|
|
"loss": 2.6936,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 0.3225425199080361,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00023206018433457045,
|
|
"loss": 2.6419,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.3228279557663618,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00023194640542668855,
|
|
"loss": 2.6704,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 0.32311339162468755,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00023183255927203405,
|
|
"loss": 2.7011,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 0.32339882748301324,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00023171864596403116,
|
|
"loss": 2.683,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 0.32368426334133893,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00023160466559615946,
|
|
"loss": 2.7078,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 0.3239696991996646,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00023149061826195327,
|
|
"loss": 2.6919,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.3242551350579903,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00023137650405500202,
|
|
"loss": 2.6554,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 0.324540570916316,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00023126232306895,
|
|
"loss": 2.6734,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 0.3248260067746417,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002311480753974963,
|
|
"loss": 2.6794,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 0.3251114426329674,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00023103376113439472,
|
|
"loss": 2.6802,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 0.3253968784912931,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0002309193803734537,
|
|
"loss": 2.6811,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.3256823143496188,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00023080493320853628,
|
|
"loss": 2.671,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 0.3259677502079445,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00023069041973355992,
|
|
"loss": 2.6759,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 0.3262531860662702,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00023057584004249662,
|
|
"loss": 2.682,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 0.3265386219245959,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00023046119422937258,
|
|
"loss": 2.6591,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 0.32682405778292156,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00023034648238826836,
|
|
"loss": 2.6607,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.32710949364124725,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00023023170461331863,
|
|
"loss": 2.6512,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 0.32739492949957294,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0002301168609987123,
|
|
"loss": 2.6913,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 0.3276803653578987,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00023000195163869216,
|
|
"loss": 2.6783,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 0.3279658012162244,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0002298869766275549,
|
|
"loss": 2.6467,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 0.32825123707455006,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00022977193605965143,
|
|
"loss": 2.7,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.32853667293287575,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000229656830029386,
|
|
"loss": 2.6604,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 0.32882210879120144,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002295416586312169,
|
|
"loss": 2.6538,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 0.32910754464952713,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00022942642195965596,
|
|
"loss": 2.69,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 0.3293929805078528,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002293111201092686,
|
|
"loss": 2.6806,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 0.3296784163661785,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00022919575317467358,
|
|
"loss": 2.6815,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.3299638522245042,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0002290803212505433,
|
|
"loss": 2.6887,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 0.33024928808282994,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00022896482443160335,
|
|
"loss": 2.6799,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 0.33053472394115563,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00022884926281263265,
|
|
"loss": 2.6802,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 0.3308201597994813,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00022873363648846318,
|
|
"loss": 2.6585,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 0.331105595657807,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00022861794555398016,
|
|
"loss": 2.6746,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.3313910315161327,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0002285021901041217,
|
|
"loss": 2.6856,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 0.3316764673744584,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000228386370233879,
|
|
"loss": 2.6456,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 0.3319619032327841,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00022827048603829596,
|
|
"loss": 2.6973,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 0.33224733909110976,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0002281545376124694,
|
|
"loss": 2.665,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 0.3325327749494355,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00022803852505154867,
|
|
"loss": 2.666,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.3328182108077612,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00022792244845073608,
|
|
"loss": 2.6748,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 0.3331036466660869,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00022780630790528617,
|
|
"loss": 2.6593,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 0.33338908252441257,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00022769010351050606,
|
|
"loss": 2.6485,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 0.33367451838273826,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00022757383536175529,
|
|
"loss": 2.6684,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 0.33395995424106395,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00022745750355444573,
|
|
"loss": 2.6508,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.33424539009938964,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00022734110818404144,
|
|
"loss": 2.6546,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 0.3345308259577153,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00022722464934605869,
|
|
"loss": 2.6864,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 0.33481626181604107,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00022710812713606582,
|
|
"loss": 2.6611,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 0.33510169767436676,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00022699154164968307,
|
|
"loss": 2.6822,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 0.33538713353269245,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0002268748929825828,
|
|
"loss": 2.6522,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.33567256939101814,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0002267581812304891,
|
|
"loss": 2.6546,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 0.3359580052493438,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00022664140648917782,
|
|
"loss": 2.6711,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 0.3362434411076695,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00022652456885447652,
|
|
"loss": 2.6533,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 0.3365288769659952,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002264076684222644,
|
|
"loss": 2.6659,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 0.3368143128243209,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00022629070528847216,
|
|
"loss": 2.6843,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.33709974868264664,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00022617367954908194,
|
|
"loss": 2.6654,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 0.3373851845409723,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00022605659130012733,
|
|
"loss": 2.6624,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 0.337670620399298,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00022593944063769314,
|
|
"loss": 2.6839,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 0.3379560562576237,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002258222276579154,
|
|
"loss": 2.6787,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 0.3382414921159494,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00022570495245698128,
|
|
"loss": 2.6928,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.3385269279742751,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00022558761513112913,
|
|
"loss": 2.6999,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 0.33881236383260077,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00022547021577664814,
|
|
"loss": 2.6904,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 0.33909779969092646,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00022535275448987832,
|
|
"loss": 2.6623,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 0.3393832355492522,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00022523523136721085,
|
|
"loss": 2.6658,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 0.3396686714075779,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00022511764650508728,
|
|
"loss": 2.6547,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.3399541072659036,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000225,
|
|
"loss": 2.6677,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 0.34023954312422927,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00022488229194849192,
|
|
"loss": 2.6869,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 0.34052497898255496,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00022476452244715663,
|
|
"loss": 2.6773,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 0.34081041484088065,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00022464669159263793,
|
|
"loss": 2.6669,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 0.34109585069920634,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00022452879948162998,
|
|
"loss": 2.64,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.341381286557532,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0002244108462108774,
|
|
"loss": 2.6452,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 0.34166672241585777,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00022429283187717485,
|
|
"loss": 2.6339,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"epoch": 0.34195215827418346,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00022417475657736705,
|
|
"loss": 2.6572,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"epoch": 0.34223759413250915,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00022405662040834895,
|
|
"loss": 2.646,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"epoch": 0.34252302999083484,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00022393842346706523,
|
|
"loss": 2.6676,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.3428084658491605,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00022382016585051058,
|
|
"loss": 2.6574,
|
|
"step": 1201
|
|
},
|
|
{
|
|
"epoch": 0.3430939017074862,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00022370184765572944,
|
|
"loss": 2.6481,
|
|
"step": 1202
|
|
},
|
|
{
|
|
"epoch": 0.3433793375658119,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00022358346897981596,
|
|
"loss": 2.675,
|
|
"step": 1203
|
|
},
|
|
{
|
|
"epoch": 0.3436647734241376,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002234650299199139,
|
|
"loss": 2.6475,
|
|
"step": 1204
|
|
},
|
|
{
|
|
"epoch": 0.34395020928246334,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00022334653057321663,
|
|
"loss": 2.6372,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.344235645140789,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.00022322797103696692,
|
|
"loss": 2.657,
|
|
"step": 1206
|
|
},
|
|
{
|
|
"epoch": 0.3445210809991147,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00022310935140845706,
|
|
"loss": 2.6606,
|
|
"step": 1207
|
|
},
|
|
{
|
|
"epoch": 0.3448065168574404,
|
|
"grad_norm": 0.8515625,
|
|
"learning_rate": 0.0002229906717850284,
|
|
"loss": 2.6751,
|
|
"step": 1208
|
|
},
|
|
{
|
|
"epoch": 0.3450919527157661,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.00022287193226407185,
|
|
"loss": 2.6703,
|
|
"step": 1209
|
|
},
|
|
{
|
|
"epoch": 0.3453773885740918,
|
|
"grad_norm": 0.81640625,
|
|
"learning_rate": 0.00022275313294302726,
|
|
"loss": 2.6554,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.34566282443241747,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00022263427391938358,
|
|
"loss": 2.6401,
|
|
"step": 1211
|
|
},
|
|
{
|
|
"epoch": 0.34594826029074316,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 0.00022251535529067877,
|
|
"loss": 2.6659,
|
|
"step": 1212
|
|
},
|
|
{
|
|
"epoch": 0.3462336961490689,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00022239637715449977,
|
|
"loss": 2.6972,
|
|
"step": 1213
|
|
},
|
|
{
|
|
"epoch": 0.3465191320073946,
|
|
"grad_norm": 0.82421875,
|
|
"learning_rate": 0.0002222773396084822,
|
|
"loss": 2.6545,
|
|
"step": 1214
|
|
},
|
|
{
|
|
"epoch": 0.3468045678657203,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.0002221582427503106,
|
|
"loss": 2.6515,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.34709000372404597,
|
|
"grad_norm": 0.6953125,
|
|
"learning_rate": 0.00022203908667771808,
|
|
"loss": 2.6517,
|
|
"step": 1216
|
|
},
|
|
{
|
|
"epoch": 0.34737543958237166,
|
|
"grad_norm": 0.73828125,
|
|
"learning_rate": 0.00022191987148848636,
|
|
"loss": 2.6596,
|
|
"step": 1217
|
|
},
|
|
{
|
|
"epoch": 0.34766087544069735,
|
|
"grad_norm": 0.6640625,
|
|
"learning_rate": 0.0002218005972804457,
|
|
"loss": 2.6795,
|
|
"step": 1218
|
|
},
|
|
{
|
|
"epoch": 0.34794631129902304,
|
|
"grad_norm": 0.73828125,
|
|
"learning_rate": 0.00022168126415147478,
|
|
"loss": 2.6416,
|
|
"step": 1219
|
|
},
|
|
{
|
|
"epoch": 0.3482317471573487,
|
|
"grad_norm": 0.71875,
|
|
"learning_rate": 0.00022156187219950059,
|
|
"loss": 2.6384,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.34851718301567447,
|
|
"grad_norm": 0.69140625,
|
|
"learning_rate": 0.0002214424215224985,
|
|
"loss": 2.6574,
|
|
"step": 1221
|
|
},
|
|
{
|
|
"epoch": 0.34880261887400016,
|
|
"grad_norm": 0.77734375,
|
|
"learning_rate": 0.0002213229122184919,
|
|
"loss": 2.6864,
|
|
"step": 1222
|
|
},
|
|
{
|
|
"epoch": 0.34908805473232585,
|
|
"grad_norm": 0.796875,
|
|
"learning_rate": 0.0002212033443855525,
|
|
"loss": 2.6457,
|
|
"step": 1223
|
|
},
|
|
{
|
|
"epoch": 0.34937349059065154,
|
|
"grad_norm": 0.7265625,
|
|
"learning_rate": 0.0002210837181217998,
|
|
"loss": 2.6441,
|
|
"step": 1224
|
|
},
|
|
{
|
|
"epoch": 0.3496589264489772,
|
|
"grad_norm": 0.8203125,
|
|
"learning_rate": 0.0002209640335254015,
|
|
"loss": 2.6643,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.3499443623073029,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 0.00022084429069457297,
|
|
"loss": 2.6436,
|
|
"step": 1226
|
|
},
|
|
{
|
|
"epoch": 0.3502297981656286,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.0002207244897275775,
|
|
"loss": 2.6485,
|
|
"step": 1227
|
|
},
|
|
{
|
|
"epoch": 0.3505152340239543,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.00022060463072272595,
|
|
"loss": 2.6534,
|
|
"step": 1228
|
|
},
|
|
{
|
|
"epoch": 0.35080066988228,
|
|
"grad_norm": 0.8203125,
|
|
"learning_rate": 0.00022048471377837697,
|
|
"loss": 2.6605,
|
|
"step": 1229
|
|
},
|
|
{
|
|
"epoch": 0.3510861057406057,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0002203647389929367,
|
|
"loss": 2.6603,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.3513715415989314,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00022024470646485862,
|
|
"loss": 2.6937,
|
|
"step": 1231
|
|
},
|
|
{
|
|
"epoch": 0.3516569774572571,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0002201246162926437,
|
|
"loss": 2.6643,
|
|
"step": 1232
|
|
},
|
|
{
|
|
"epoch": 0.3519424133155828,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.00022000446857484035,
|
|
"loss": 2.6523,
|
|
"step": 1233
|
|
},
|
|
{
|
|
"epoch": 0.3522278491739085,
|
|
"grad_norm": 0.7578125,
|
|
"learning_rate": 0.0002198842634100439,
|
|
"loss": 2.6739,
|
|
"step": 1234
|
|
},
|
|
{
|
|
"epoch": 0.35251328503223417,
|
|
"grad_norm": 0.59765625,
|
|
"learning_rate": 0.00021976400089689712,
|
|
"loss": 2.6605,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.35279872089055986,
|
|
"grad_norm": 0.7109375,
|
|
"learning_rate": 0.00021964368113408959,
|
|
"loss": 2.6868,
|
|
"step": 1236
|
|
},
|
|
{
|
|
"epoch": 0.35308415674888555,
|
|
"grad_norm": 0.828125,
|
|
"learning_rate": 0.00021952330422035803,
|
|
"loss": 2.6759,
|
|
"step": 1237
|
|
},
|
|
{
|
|
"epoch": 0.3533695926072113,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0002194028702544861,
|
|
"loss": 2.6735,
|
|
"step": 1238
|
|
},
|
|
{
|
|
"epoch": 0.353655028465537,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00021928237933530403,
|
|
"loss": 2.661,
|
|
"step": 1239
|
|
},
|
|
{
|
|
"epoch": 0.35394046432386267,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00021916183156168908,
|
|
"loss": 2.6457,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.35422590018218836,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00021904122703256498,
|
|
"loss": 2.6761,
|
|
"step": 1241
|
|
},
|
|
{
|
|
"epoch": 0.35451133604051405,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.00021892056584690213,
|
|
"loss": 2.6441,
|
|
"step": 1242
|
|
},
|
|
{
|
|
"epoch": 0.35479677189883974,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00021879984810371734,
|
|
"loss": 2.6453,
|
|
"step": 1243
|
|
},
|
|
{
|
|
"epoch": 0.3550822077571654,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00021867907390207394,
|
|
"loss": 2.6208,
|
|
"step": 1244
|
|
},
|
|
{
|
|
"epoch": 0.3553676436154911,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00021855824334108143,
|
|
"loss": 2.6572,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.35565307947381686,
|
|
"grad_norm": 0.85546875,
|
|
"learning_rate": 0.00021843735651989575,
|
|
"loss": 2.6826,
|
|
"step": 1246
|
|
},
|
|
{
|
|
"epoch": 0.35593851533214255,
|
|
"grad_norm": 0.8125,
|
|
"learning_rate": 0.00021831641353771885,
|
|
"loss": 2.6611,
|
|
"step": 1247
|
|
},
|
|
{
|
|
"epoch": 0.35622395119046824,
|
|
"grad_norm": 0.83203125,
|
|
"learning_rate": 0.00021819541449379892,
|
|
"loss": 2.6597,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"epoch": 0.3565093870487939,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00021807435948742994,
|
|
"loss": 2.635,
|
|
"step": 1249
|
|
},
|
|
{
|
|
"epoch": 0.3567948229071196,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00021795324861795208,
|
|
"loss": 2.6526,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.3567948229071196,
|
|
"eval_loss": 2.5330393314361572,
|
|
"eval_runtime": 5928.9133,
|
|
"eval_samples_per_second": 10.843,
|
|
"eval_steps_per_second": 10.843,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.3570802587654453,
|
|
"grad_norm": 0.84375,
|
|
"learning_rate": 0.00021783208198475107,
|
|
"loss": 2.6512,
|
|
"step": 1251
|
|
},
|
|
{
|
|
"epoch": 0.357365694623771,
|
|
"grad_norm": 0.7890625,
|
|
"learning_rate": 0.00021771085968725864,
|
|
"loss": 2.6381,
|
|
"step": 1252
|
|
},
|
|
{
|
|
"epoch": 0.3576511304820967,
|
|
"grad_norm": 0.7265625,
|
|
"learning_rate": 0.00021758958182495214,
|
|
"loss": 2.6498,
|
|
"step": 1253
|
|
},
|
|
{
|
|
"epoch": 0.3579365663404224,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00021746824849735435,
|
|
"loss": 2.6614,
|
|
"step": 1254
|
|
},
|
|
{
|
|
"epoch": 0.3582220021987481,
|
|
"grad_norm": 0.72265625,
|
|
"learning_rate": 0.00021734685980403376,
|
|
"loss": 2.6483,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.3585074380570738,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.0002172254158446043,
|
|
"loss": 2.6365,
|
|
"step": 1256
|
|
},
|
|
{
|
|
"epoch": 0.3587928739153995,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 0.00021710391671872514,
|
|
"loss": 2.6484,
|
|
"step": 1257
|
|
},
|
|
{
|
|
"epoch": 0.3590783097737252,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.00021698236252610072,
|
|
"loss": 2.6372,
|
|
"step": 1258
|
|
},
|
|
{
|
|
"epoch": 0.35936374563205087,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.00021686075336648075,
|
|
"loss": 2.6554,
|
|
"step": 1259
|
|
},
|
|
{
|
|
"epoch": 0.35964918149037656,
|
|
"grad_norm": 0.8359375,
|
|
"learning_rate": 0.00021673908933965996,
|
|
"loss": 2.6511,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.35993461734870225,
|
|
"grad_norm": 0.81640625,
|
|
"learning_rate": 0.00021661737054547826,
|
|
"loss": 2.6473,
|
|
"step": 1261
|
|
},
|
|
{
|
|
"epoch": 0.360220053207028,
|
|
"grad_norm": 0.7890625,
|
|
"learning_rate": 0.00021649559708382027,
|
|
"loss": 2.6396,
|
|
"step": 1262
|
|
},
|
|
{
|
|
"epoch": 0.3605054890653537,
|
|
"grad_norm": 0.87890625,
|
|
"learning_rate": 0.0002163737690546157,
|
|
"loss": 2.6517,
|
|
"step": 1263
|
|
},
|
|
{
|
|
"epoch": 0.36079092492367937,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00021625188655783893,
|
|
"loss": 2.6126,
|
|
"step": 1264
|
|
},
|
|
{
|
|
"epoch": 0.36107636078200506,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.000216129949693509,
|
|
"loss": 2.6551,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.36136179664033075,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 0.0002160079585616896,
|
|
"loss": 2.6316,
|
|
"step": 1266
|
|
},
|
|
{
|
|
"epoch": 0.36164723249865643,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.000215885913262489,
|
|
"loss": 2.6376,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"epoch": 0.3619326683569821,
|
|
"grad_norm": 0.78125,
|
|
"learning_rate": 0.00021576381389605992,
|
|
"loss": 2.6378,
|
|
"step": 1268
|
|
},
|
|
{
|
|
"epoch": 0.3622181042153078,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 0.00021564166056259936,
|
|
"loss": 2.6742,
|
|
"step": 1269
|
|
},
|
|
{
|
|
"epoch": 0.36250354007363356,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00021551945336234867,
|
|
"loss": 2.6676,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.36278897593195925,
|
|
"grad_norm": 0.7578125,
|
|
"learning_rate": 0.00021539719239559336,
|
|
"loss": 2.6604,
|
|
"step": 1271
|
|
},
|
|
{
|
|
"epoch": 0.36307441179028493,
|
|
"grad_norm": 0.734375,
|
|
"learning_rate": 0.00021527487776266317,
|
|
"loss": 2.6459,
|
|
"step": 1272
|
|
},
|
|
{
|
|
"epoch": 0.3633598476486106,
|
|
"grad_norm": 0.74609375,
|
|
"learning_rate": 0.0002151525095639318,
|
|
"loss": 2.6323,
|
|
"step": 1273
|
|
},
|
|
{
|
|
"epoch": 0.3636452835069363,
|
|
"grad_norm": 0.80859375,
|
|
"learning_rate": 0.0002150300878998168,
|
|
"loss": 2.6476,
|
|
"step": 1274
|
|
},
|
|
{
|
|
"epoch": 0.363930719365262,
|
|
"grad_norm": 0.73046875,
|
|
"learning_rate": 0.0002149076128707798,
|
|
"loss": 2.6378,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.3642161552235877,
|
|
"grad_norm": 0.73828125,
|
|
"learning_rate": 0.00021478508457732615,
|
|
"loss": 2.654,
|
|
"step": 1276
|
|
},
|
|
{
|
|
"epoch": 0.3645015910819134,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.00021466250312000482,
|
|
"loss": 2.6398,
|
|
"step": 1277
|
|
},
|
|
{
|
|
"epoch": 0.3647870269402391,
|
|
"grad_norm": 0.74609375,
|
|
"learning_rate": 0.00021453986859940852,
|
|
"loss": 2.6306,
|
|
"step": 1278
|
|
},
|
|
{
|
|
"epoch": 0.3650724627985648,
|
|
"grad_norm": 0.84765625,
|
|
"learning_rate": 0.00021441718111617344,
|
|
"loss": 2.6299,
|
|
"step": 1279
|
|
},
|
|
{
|
|
"epoch": 0.3653578986568905,
|
|
"grad_norm": 0.8125,
|
|
"learning_rate": 0.00021429444077097928,
|
|
"loss": 2.6466,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.3656433345152162,
|
|
"grad_norm": 0.75390625,
|
|
"learning_rate": 0.00021417164766454903,
|
|
"loss": 2.6788,
|
|
"step": 1281
|
|
},
|
|
{
|
|
"epoch": 0.3659287703735419,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.00021404880189764913,
|
|
"loss": 2.6416,
|
|
"step": 1282
|
|
},
|
|
{
|
|
"epoch": 0.36621420623186757,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 0.00021392590357108905,
|
|
"loss": 2.6469,
|
|
"step": 1283
|
|
},
|
|
{
|
|
"epoch": 0.36649964209019326,
|
|
"grad_norm": 0.65625,
|
|
"learning_rate": 0.00021380295278572155,
|
|
"loss": 2.6422,
|
|
"step": 1284
|
|
},
|
|
{
|
|
"epoch": 0.36678507794851894,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 0.00021367994964244236,
|
|
"loss": 2.6202,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.3670705138068447,
|
|
"grad_norm": 0.640625,
|
|
"learning_rate": 0.00021355689424219023,
|
|
"loss": 2.6281,
|
|
"step": 1286
|
|
},
|
|
{
|
|
"epoch": 0.3673559496651704,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00021343378668594662,
|
|
"loss": 2.6181,
|
|
"step": 1287
|
|
},
|
|
{
|
|
"epoch": 0.36764138552349607,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.00021331062707473605,
|
|
"loss": 2.6632,
|
|
"step": 1288
|
|
},
|
|
{
|
|
"epoch": 0.36792682138182176,
|
|
"grad_norm": 0.59765625,
|
|
"learning_rate": 0.00021318741550962556,
|
|
"loss": 2.6296,
|
|
"step": 1289
|
|
},
|
|
{
|
|
"epoch": 0.36821225724014744,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00021306415209172502,
|
|
"loss": 2.654,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.36849769309847313,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00021294083692218653,
|
|
"loss": 2.6375,
|
|
"step": 1291
|
|
},
|
|
{
|
|
"epoch": 0.3687831289567988,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.00021281747010220496,
|
|
"loss": 2.6488,
|
|
"step": 1292
|
|
},
|
|
{
|
|
"epoch": 0.3690685648151245,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.0002126940517330175,
|
|
"loss": 2.6565,
|
|
"step": 1293
|
|
},
|
|
{
|
|
"epoch": 0.36935400067345026,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00021257058191590354,
|
|
"loss": 2.6622,
|
|
"step": 1294
|
|
},
|
|
{
|
|
"epoch": 0.36963943653177594,
|
|
"grad_norm": 0.59765625,
|
|
"learning_rate": 0.00021244706075218472,
|
|
"loss": 2.6498,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.36992487239010163,
|
|
"grad_norm": 0.72265625,
|
|
"learning_rate": 0.00021232348834322495,
|
|
"loss": 2.6525,
|
|
"step": 1296
|
|
},
|
|
{
|
|
"epoch": 0.3702103082484273,
|
|
"grad_norm": 0.75,
|
|
"learning_rate": 0.00021219986479043001,
|
|
"loss": 2.6365,
|
|
"step": 1297
|
|
},
|
|
{
|
|
"epoch": 0.370495744106753,
|
|
"grad_norm": 0.6484375,
|
|
"learning_rate": 0.00021207619019524777,
|
|
"loss": 2.6502,
|
|
"step": 1298
|
|
},
|
|
{
|
|
"epoch": 0.3707811799650787,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00021195246465916792,
|
|
"loss": 2.6183,
|
|
"step": 1299
|
|
},
|
|
{
|
|
"epoch": 0.3710666158234044,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00021182868828372196,
|
|
"loss": 2.6646,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.3713520516817301,
|
|
"grad_norm": 0.6484375,
|
|
"learning_rate": 0.00021170486117048315,
|
|
"loss": 2.6203,
|
|
"step": 1301
|
|
},
|
|
{
|
|
"epoch": 0.37163748754005577,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.0002115809834210664,
|
|
"loss": 2.625,
|
|
"step": 1302
|
|
},
|
|
{
|
|
"epoch": 0.3719229233983815,
|
|
"grad_norm": 0.6171875,
|
|
"learning_rate": 0.0002114570551371281,
|
|
"loss": 2.671,
|
|
"step": 1303
|
|
},
|
|
{
|
|
"epoch": 0.3722083592567072,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00021133307642036615,
|
|
"loss": 2.6239,
|
|
"step": 1304
|
|
},
|
|
{
|
|
"epoch": 0.3724937951150329,
|
|
"grad_norm": 0.64453125,
|
|
"learning_rate": 0.0002112090473725198,
|
|
"loss": 2.643,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.3727792309733586,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00021108496809536974,
|
|
"loss": 2.627,
|
|
"step": 1306
|
|
},
|
|
{
|
|
"epoch": 0.37306466683168427,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00021096083869073765,
|
|
"loss": 2.6038,
|
|
"step": 1307
|
|
},
|
|
{
|
|
"epoch": 0.37335010269000996,
|
|
"grad_norm": 0.640625,
|
|
"learning_rate": 0.0002108366592604866,
|
|
"loss": 2.6223,
|
|
"step": 1308
|
|
},
|
|
{
|
|
"epoch": 0.37363553854833564,
|
|
"grad_norm": 0.7109375,
|
|
"learning_rate": 0.00021071242990652043,
|
|
"loss": 2.6492,
|
|
"step": 1309
|
|
},
|
|
{
|
|
"epoch": 0.37392097440666133,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00021058815073078422,
|
|
"loss": 2.6534,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.3742064102649871,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00021046382183526378,
|
|
"loss": 2.6197,
|
|
"step": 1311
|
|
},
|
|
{
|
|
"epoch": 0.37449184612331277,
|
|
"grad_norm": 0.7265625,
|
|
"learning_rate": 0.0002103394433219858,
|
|
"loss": 2.632,
|
|
"step": 1312
|
|
},
|
|
{
|
|
"epoch": 0.37477728198163845,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.00021021501529301756,
|
|
"loss": 2.639,
|
|
"step": 1313
|
|
},
|
|
{
|
|
"epoch": 0.37506271783996414,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 0.00021009053785046706,
|
|
"loss": 2.6138,
|
|
"step": 1314
|
|
},
|
|
{
|
|
"epoch": 0.37534815369828983,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.0002099660110964829,
|
|
"loss": 2.647,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.3756335895566155,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00020984143513325416,
|
|
"loss": 2.6299,
|
|
"step": 1316
|
|
},
|
|
{
|
|
"epoch": 0.3759190254149412,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.0002097168100630101,
|
|
"loss": 2.6422,
|
|
"step": 1317
|
|
},
|
|
{
|
|
"epoch": 0.3762044612732669,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.0002095921359880204,
|
|
"loss": 2.6092,
|
|
"step": 1318
|
|
},
|
|
{
|
|
"epoch": 0.37648989713159264,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.00020946741301059514,
|
|
"loss": 2.6118,
|
|
"step": 1319
|
|
},
|
|
{
|
|
"epoch": 0.37677533298991833,
|
|
"grad_norm": 0.64453125,
|
|
"learning_rate": 0.0002093426412330842,
|
|
"loss": 2.6348,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.377060768848244,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 0.00020921782075787777,
|
|
"loss": 2.6552,
|
|
"step": 1321
|
|
},
|
|
{
|
|
"epoch": 0.3773462047065697,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00020909295168740577,
|
|
"loss": 2.6427,
|
|
"step": 1322
|
|
},
|
|
{
|
|
"epoch": 0.3776316405648954,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00020896803412413824,
|
|
"loss": 2.626,
|
|
"step": 1323
|
|
},
|
|
{
|
|
"epoch": 0.3779170764232211,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.00020884306817058482,
|
|
"loss": 2.6509,
|
|
"step": 1324
|
|
},
|
|
{
|
|
"epoch": 0.3782025122815468,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00020871805392929502,
|
|
"loss": 2.6215,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.37848794813987247,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00020859299150285786,
|
|
"loss": 2.6605,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"epoch": 0.3787733839981982,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00020846788099390188,
|
|
"loss": 2.6488,
|
|
"step": 1327
|
|
},
|
|
{
|
|
"epoch": 0.3790588198565239,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00020834272250509523,
|
|
"loss": 2.6607,
|
|
"step": 1328
|
|
},
|
|
{
|
|
"epoch": 0.3793442557148496,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00020821751613914525,
|
|
"loss": 2.6426,
|
|
"step": 1329
|
|
},
|
|
{
|
|
"epoch": 0.3796296915731753,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.0002080922619987987,
|
|
"loss": 2.6458,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.37991512743150097,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00020796696018684152,
|
|
"loss": 2.6278,
|
|
"step": 1331
|
|
},
|
|
{
|
|
"epoch": 0.38020056328982665,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00020784161080609868,
|
|
"loss": 2.6603,
|
|
"step": 1332
|
|
},
|
|
{
|
|
"epoch": 0.38048599914815234,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00020771621395943436,
|
|
"loss": 2.6395,
|
|
"step": 1333
|
|
},
|
|
{
|
|
"epoch": 0.38077143500647803,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00020759076974975144,
|
|
"loss": 2.6346,
|
|
"step": 1334
|
|
},
|
|
{
|
|
"epoch": 0.3810568708648038,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00020746527827999195,
|
|
"loss": 2.6412,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.38134230672312946,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00020733973965313655,
|
|
"loss": 2.6311,
|
|
"step": 1336
|
|
},
|
|
{
|
|
"epoch": 0.38162774258145515,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.0002072141539722046,
|
|
"loss": 2.6174,
|
|
"step": 1337
|
|
},
|
|
{
|
|
"epoch": 0.38191317843978084,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00020708852134025397,
|
|
"loss": 2.6192,
|
|
"step": 1338
|
|
},
|
|
{
|
|
"epoch": 0.38219861429810653,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.0002069628418603814,
|
|
"loss": 2.6467,
|
|
"step": 1339
|
|
},
|
|
{
|
|
"epoch": 0.3824840501564322,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00020683711563572167,
|
|
"loss": 2.6369,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.3827694860147579,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00020671134276944815,
|
|
"loss": 2.6372,
|
|
"step": 1341
|
|
},
|
|
{
|
|
"epoch": 0.3830549218730836,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0002065855233647725,
|
|
"loss": 2.6436,
|
|
"step": 1342
|
|
},
|
|
{
|
|
"epoch": 0.38334035773140934,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00020645965752494444,
|
|
"loss": 2.6342,
|
|
"step": 1343
|
|
},
|
|
{
|
|
"epoch": 0.38362579358973503,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.0002063337453532519,
|
|
"loss": 2.637,
|
|
"step": 1344
|
|
},
|
|
{
|
|
"epoch": 0.3839112294480607,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.0002062077869530207,
|
|
"loss": 2.6444,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.3841966653063864,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00020608178242761483,
|
|
"loss": 2.6339,
|
|
"step": 1346
|
|
},
|
|
{
|
|
"epoch": 0.3844821011647121,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00020595573188043594,
|
|
"loss": 2.6422,
|
|
"step": 1347
|
|
},
|
|
{
|
|
"epoch": 0.3847675370230378,
|
|
"grad_norm": 0.6484375,
|
|
"learning_rate": 0.00020582963541492343,
|
|
"loss": 2.6472,
|
|
"step": 1348
|
|
},
|
|
{
|
|
"epoch": 0.3850529728813635,
|
|
"grad_norm": 0.65625,
|
|
"learning_rate": 0.00020570349313455452,
|
|
"loss": 2.6081,
|
|
"step": 1349
|
|
},
|
|
{
|
|
"epoch": 0.38533840873968916,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00020557730514284396,
|
|
"loss": 2.6214,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.3856238445980149,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00020545107154334397,
|
|
"loss": 2.6263,
|
|
"step": 1351
|
|
},
|
|
{
|
|
"epoch": 0.3859092804563406,
|
|
"grad_norm": 0.6328125,
|
|
"learning_rate": 0.0002053247924396442,
|
|
"loss": 2.6092,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"epoch": 0.3861947163146663,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.0002051984679353718,
|
|
"loss": 2.6329,
|
|
"step": 1353
|
|
},
|
|
{
|
|
"epoch": 0.386480152172992,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.0002050720981341909,
|
|
"loss": 2.6087,
|
|
"step": 1354
|
|
},
|
|
{
|
|
"epoch": 0.38676558803131766,
|
|
"grad_norm": 0.70703125,
|
|
"learning_rate": 0.00020494568313980305,
|
|
"loss": 2.6249,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.38705102388964335,
|
|
"grad_norm": 0.7578125,
|
|
"learning_rate": 0.00020481922305594678,
|
|
"loss": 2.6385,
|
|
"step": 1356
|
|
},
|
|
{
|
|
"epoch": 0.38733645974796904,
|
|
"grad_norm": 0.70703125,
|
|
"learning_rate": 0.0002046927179863976,
|
|
"loss": 2.632,
|
|
"step": 1357
|
|
},
|
|
{
|
|
"epoch": 0.38762189560629473,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00020456616803496796,
|
|
"loss": 2.642,
|
|
"step": 1358
|
|
},
|
|
{
|
|
"epoch": 0.3879073314646205,
|
|
"grad_norm": 0.7109375,
|
|
"learning_rate": 0.00020443957330550718,
|
|
"loss": 2.6268,
|
|
"step": 1359
|
|
},
|
|
{
|
|
"epoch": 0.38819276732294616,
|
|
"grad_norm": 0.6640625,
|
|
"learning_rate": 0.0002043129339019013,
|
|
"loss": 2.6379,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.38847820318127185,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00020418624992807295,
|
|
"loss": 2.6577,
|
|
"step": 1361
|
|
},
|
|
{
|
|
"epoch": 0.38876363903959754,
|
|
"grad_norm": 0.67578125,
|
|
"learning_rate": 0.00020405952148798144,
|
|
"loss": 2.6331,
|
|
"step": 1362
|
|
},
|
|
{
|
|
"epoch": 0.38904907489792323,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00020393274868562254,
|
|
"loss": 2.6376,
|
|
"step": 1363
|
|
},
|
|
{
|
|
"epoch": 0.3893345107562489,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.00020380593162502844,
|
|
"loss": 2.6041,
|
|
"step": 1364
|
|
},
|
|
{
|
|
"epoch": 0.3896199466145746,
|
|
"grad_norm": 0.6796875,
|
|
"learning_rate": 0.00020367907041026755,
|
|
"loss": 2.6439,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.3899053824729003,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00020355216514544462,
|
|
"loss": 2.6405,
|
|
"step": 1366
|
|
},
|
|
{
|
|
"epoch": 0.39019081833122604,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.0002034252159347005,
|
|
"loss": 2.6451,
|
|
"step": 1367
|
|
},
|
|
{
|
|
"epoch": 0.39047625418955173,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00020329822288221218,
|
|
"loss": 2.637,
|
|
"step": 1368
|
|
},
|
|
{
|
|
"epoch": 0.3907616900478774,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00020317118609219253,
|
|
"loss": 2.5896,
|
|
"step": 1369
|
|
},
|
|
{
|
|
"epoch": 0.3910471259062031,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.00020304410566889027,
|
|
"loss": 2.641,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.3913325617645288,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.0002029169817165901,
|
|
"loss": 2.6245,
|
|
"step": 1371
|
|
},
|
|
{
|
|
"epoch": 0.3916179976228545,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.0002027898143396123,
|
|
"loss": 2.6347,
|
|
"step": 1372
|
|
},
|
|
{
|
|
"epoch": 0.3919034334811802,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00020266260364231286,
|
|
"loss": 2.6158,
|
|
"step": 1373
|
|
},
|
|
{
|
|
"epoch": 0.39218886933950586,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.00020253534972908326,
|
|
"loss": 2.6349,
|
|
"step": 1374
|
|
},
|
|
{
|
|
"epoch": 0.39247430519783155,
|
|
"grad_norm": 0.7421875,
|
|
"learning_rate": 0.00020240805270435044,
|
|
"loss": 2.6329,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.3927597410561573,
|
|
"grad_norm": 0.6875,
|
|
"learning_rate": 0.00020228071267257687,
|
|
"loss": 2.6633,
|
|
"step": 1376
|
|
},
|
|
{
|
|
"epoch": 0.393045176914483,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 0.00020215332973826003,
|
|
"loss": 2.6117,
|
|
"step": 1377
|
|
},
|
|
{
|
|
"epoch": 0.3933306127728087,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.00020202590400593285,
|
|
"loss": 2.6286,
|
|
"step": 1378
|
|
},
|
|
{
|
|
"epoch": 0.39361604863113436,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.00020189843558016338,
|
|
"loss": 2.6105,
|
|
"step": 1379
|
|
},
|
|
{
|
|
"epoch": 0.39390148448946005,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0002017709245655545,
|
|
"loss": 2.6128,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.39418692034778574,
|
|
"grad_norm": 0.82421875,
|
|
"learning_rate": 0.00020164337106674417,
|
|
"loss": 2.6243,
|
|
"step": 1381
|
|
},
|
|
{
|
|
"epoch": 0.39447235620611143,
|
|
"grad_norm": 0.69140625,
|
|
"learning_rate": 0.0002015157751884053,
|
|
"loss": 2.6557,
|
|
"step": 1382
|
|
},
|
|
{
|
|
"epoch": 0.3947577920644371,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0002013881370352454,
|
|
"loss": 2.624,
|
|
"step": 1383
|
|
},
|
|
{
|
|
"epoch": 0.39504322792276286,
|
|
"grad_norm": 0.76171875,
|
|
"learning_rate": 0.00020126045671200682,
|
|
"loss": 2.6279,
|
|
"step": 1384
|
|
},
|
|
{
|
|
"epoch": 0.39532866378108855,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00020113273432346632,
|
|
"loss": 2.6363,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.39561409963941424,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 0.00020100496997443553,
|
|
"loss": 2.6274,
|
|
"step": 1386
|
|
},
|
|
{
|
|
"epoch": 0.39589953549773993,
|
|
"grad_norm": 0.6484375,
|
|
"learning_rate": 0.00020087716376976014,
|
|
"loss": 2.6191,
|
|
"step": 1387
|
|
},
|
|
{
|
|
"epoch": 0.3961849713560656,
|
|
"grad_norm": 0.640625,
|
|
"learning_rate": 0.00020074931581432035,
|
|
"loss": 2.6355,
|
|
"step": 1388
|
|
},
|
|
{
|
|
"epoch": 0.3964704072143913,
|
|
"grad_norm": 0.6875,
|
|
"learning_rate": 0.0002006214262130307,
|
|
"loss": 2.6386,
|
|
"step": 1389
|
|
},
|
|
{
|
|
"epoch": 0.396755843072717,
|
|
"grad_norm": 0.6171875,
|
|
"learning_rate": 0.0002004934950708397,
|
|
"loss": 2.6345,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.3970412789310427,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00020036552249273014,
|
|
"loss": 2.6081,
|
|
"step": 1391
|
|
},
|
|
{
|
|
"epoch": 0.39732671478936843,
|
|
"grad_norm": 0.65625,
|
|
"learning_rate": 0.00020023750858371876,
|
|
"loss": 2.6243,
|
|
"step": 1392
|
|
},
|
|
{
|
|
"epoch": 0.3976121506476941,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00020010945344885615,
|
|
"loss": 2.6405,
|
|
"step": 1393
|
|
},
|
|
{
|
|
"epoch": 0.3978975865060198,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.0001999813571932268,
|
|
"loss": 2.5995,
|
|
"step": 1394
|
|
},
|
|
{
|
|
"epoch": 0.3981830223643455,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00019985321992194892,
|
|
"loss": 2.6225,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.3984684582226712,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00019972504174017446,
|
|
"loss": 2.6077,
|
|
"step": 1396
|
|
},
|
|
{
|
|
"epoch": 0.3987538940809969,
|
|
"grad_norm": 0.59765625,
|
|
"learning_rate": 0.00019959682275308869,
|
|
"loss": 2.6165,
|
|
"step": 1397
|
|
},
|
|
{
|
|
"epoch": 0.39903932993932256,
|
|
"grad_norm": 0.66796875,
|
|
"learning_rate": 0.0001994685630659107,
|
|
"loss": 2.601,
|
|
"step": 1398
|
|
},
|
|
{
|
|
"epoch": 0.39932476579764825,
|
|
"grad_norm": 0.6171875,
|
|
"learning_rate": 0.00019934026278389274,
|
|
"loss": 2.6332,
|
|
"step": 1399
|
|
},
|
|
{
|
|
"epoch": 0.399610201655974,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019921192201232047,
|
|
"loss": 2.6224,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.3998956375142997,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.0001990835408565127,
|
|
"loss": 2.5961,
|
|
"step": 1401
|
|
},
|
|
{
|
|
"epoch": 0.4001810733726254,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.0001989551194218216,
|
|
"loss": 2.6291,
|
|
"step": 1402
|
|
},
|
|
{
|
|
"epoch": 0.40046650923095106,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00019882665781363208,
|
|
"loss": 2.6164,
|
|
"step": 1403
|
|
},
|
|
{
|
|
"epoch": 0.40075194508927675,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019869815613736224,
|
|
"loss": 2.6452,
|
|
"step": 1404
|
|
},
|
|
{
|
|
"epoch": 0.40103738094760244,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019856961449846294,
|
|
"loss": 2.6502,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.40132281680592813,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.0001984410330024179,
|
|
"loss": 2.6174,
|
|
"step": 1406
|
|
},
|
|
{
|
|
"epoch": 0.4016082526642538,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.0001983124117547436,
|
|
"loss": 2.5982,
|
|
"step": 1407
|
|
},
|
|
{
|
|
"epoch": 0.40189368852257956,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00019818375086098897,
|
|
"loss": 2.5949,
|
|
"step": 1408
|
|
},
|
|
{
|
|
"epoch": 0.40217912438090525,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00019805505042673564,
|
|
"loss": 2.6337,
|
|
"step": 1409
|
|
},
|
|
{
|
|
"epoch": 0.40246456023923094,
|
|
"grad_norm": 0.64453125,
|
|
"learning_rate": 0.00019792631055759764,
|
|
"loss": 2.6204,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.40274999609755663,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.00019779753135922126,
|
|
"loss": 2.6416,
|
|
"step": 1411
|
|
},
|
|
{
|
|
"epoch": 0.4030354319558823,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00019766871293728524,
|
|
"loss": 2.6037,
|
|
"step": 1412
|
|
},
|
|
{
|
|
"epoch": 0.403320867814208,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 0.00019753985539750036,
|
|
"loss": 2.6191,
|
|
"step": 1413
|
|
},
|
|
{
|
|
"epoch": 0.4036063036725337,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00019741095884560957,
|
|
"loss": 2.6103,
|
|
"step": 1414
|
|
},
|
|
{
|
|
"epoch": 0.4038917395308594,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00019728202338738785,
|
|
"loss": 2.6346,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.40417717538918513,
|
|
"grad_norm": 0.6796875,
|
|
"learning_rate": 0.0001971530491286421,
|
|
"loss": 2.6142,
|
|
"step": 1416
|
|
},
|
|
{
|
|
"epoch": 0.4044626112475108,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00019702403617521093,
|
|
"loss": 2.612,
|
|
"step": 1417
|
|
},
|
|
{
|
|
"epoch": 0.4047480471058365,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00019689498463296487,
|
|
"loss": 2.6237,
|
|
"step": 1418
|
|
},
|
|
{
|
|
"epoch": 0.4050334829641622,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00019676589460780616,
|
|
"loss": 2.6104,
|
|
"step": 1419
|
|
},
|
|
{
|
|
"epoch": 0.4053189188224879,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00019663676620566836,
|
|
"loss": 2.6246,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.4056043546808136,
|
|
"grad_norm": 0.6328125,
|
|
"learning_rate": 0.00019650759953251677,
|
|
"loss": 2.6212,
|
|
"step": 1421
|
|
},
|
|
{
|
|
"epoch": 0.40588979053913926,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00019637839469434804,
|
|
"loss": 2.6268,
|
|
"step": 1422
|
|
},
|
|
{
|
|
"epoch": 0.40617522639746495,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00019624915179719004,
|
|
"loss": 2.6045,
|
|
"step": 1423
|
|
},
|
|
{
|
|
"epoch": 0.4064606622557907,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00019611987094710192,
|
|
"loss": 2.5961,
|
|
"step": 1424
|
|
},
|
|
{
|
|
"epoch": 0.4067460981141164,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00019599055225017408,
|
|
"loss": 2.5987,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.40703153397244207,
|
|
"grad_norm": 0.6328125,
|
|
"learning_rate": 0.00019586119581252781,
|
|
"loss": 2.6394,
|
|
"step": 1426
|
|
},
|
|
{
|
|
"epoch": 0.40731696983076776,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00019573180174031556,
|
|
"loss": 2.5998,
|
|
"step": 1427
|
|
},
|
|
{
|
|
"epoch": 0.40760240568909345,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019560237013972046,
|
|
"loss": 2.6149,
|
|
"step": 1428
|
|
},
|
|
{
|
|
"epoch": 0.40788784154741914,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.0001954729011169565,
|
|
"loss": 2.6389,
|
|
"step": 1429
|
|
},
|
|
{
|
|
"epoch": 0.4081732774057448,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.00019534339477826854,
|
|
"loss": 2.6498,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.4084587132640705,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00019521385122993185,
|
|
"loss": 2.6256,
|
|
"step": 1431
|
|
},
|
|
{
|
|
"epoch": 0.40874414912239626,
|
|
"grad_norm": 0.66015625,
|
|
"learning_rate": 0.00019508427057825237,
|
|
"loss": 2.614,
|
|
"step": 1432
|
|
},
|
|
{
|
|
"epoch": 0.40902958498072195,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.0001949546529295664,
|
|
"loss": 2.5803,
|
|
"step": 1433
|
|
},
|
|
{
|
|
"epoch": 0.40931502083904764,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00019482499839024062,
|
|
"loss": 2.6267,
|
|
"step": 1434
|
|
},
|
|
{
|
|
"epoch": 0.4096004566973733,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00019469530706667205,
|
|
"loss": 2.627,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.409885892555699,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.0001945655790652878,
|
|
"loss": 2.6262,
|
|
"step": 1436
|
|
},
|
|
{
|
|
"epoch": 0.4101713284140247,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.00019443581449254515,
|
|
"loss": 2.6189,
|
|
"step": 1437
|
|
},
|
|
{
|
|
"epoch": 0.4104567642723504,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00019430601345493136,
|
|
"loss": 2.6023,
|
|
"step": 1438
|
|
},
|
|
{
|
|
"epoch": 0.4107422001306761,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.0001941761760589637,
|
|
"loss": 2.6085,
|
|
"step": 1439
|
|
},
|
|
{
|
|
"epoch": 0.4110276359890018,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00019404630241118902,
|
|
"loss": 2.6117,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.4113130718473275,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00019391639261818428,
|
|
"loss": 2.6289,
|
|
"step": 1441
|
|
},
|
|
{
|
|
"epoch": 0.4115985077056532,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00019378644678655582,
|
|
"loss": 2.6221,
|
|
"step": 1442
|
|
},
|
|
{
|
|
"epoch": 0.4118839435639789,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00019365646502293962,
|
|
"loss": 2.6028,
|
|
"step": 1443
|
|
},
|
|
{
|
|
"epoch": 0.4121693794223046,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00019352644743400124,
|
|
"loss": 2.599,
|
|
"step": 1444
|
|
},
|
|
{
|
|
"epoch": 0.41245481528063027,
|
|
"grad_norm": 0.68359375,
|
|
"learning_rate": 0.0001933963941264356,
|
|
"loss": 2.6002,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.41274025113895596,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.0001932663052069668,
|
|
"loss": 2.6078,
|
|
"step": 1446
|
|
},
|
|
{
|
|
"epoch": 0.41302568699728165,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.00019313618078234843,
|
|
"loss": 2.6375,
|
|
"step": 1447
|
|
},
|
|
{
|
|
"epoch": 0.41331112285560734,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00019300602095936287,
|
|
"loss": 2.6145,
|
|
"step": 1448
|
|
},
|
|
{
|
|
"epoch": 0.4135965587139331,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00019287582584482193,
|
|
"loss": 2.6075,
|
|
"step": 1449
|
|
},
|
|
{
|
|
"epoch": 0.41388199457225877,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00019274559554556604,
|
|
"loss": 2.5988,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.41416743043058446,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00019261533016846468,
|
|
"loss": 2.6142,
|
|
"step": 1451
|
|
},
|
|
{
|
|
"epoch": 0.41445286628891015,
|
|
"grad_norm": 0.65234375,
|
|
"learning_rate": 0.00019248502982041613,
|
|
"loss": 2.5849,
|
|
"step": 1452
|
|
},
|
|
{
|
|
"epoch": 0.41473830214723584,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 0.00019235469460834732,
|
|
"loss": 2.6181,
|
|
"step": 1453
|
|
},
|
|
{
|
|
"epoch": 0.4150237380055615,
|
|
"grad_norm": 0.61328125,
|
|
"learning_rate": 0.00019222432463921374,
|
|
"loss": 2.5999,
|
|
"step": 1454
|
|
},
|
|
{
|
|
"epoch": 0.4153091738638872,
|
|
"grad_norm": 0.75,
|
|
"learning_rate": 0.0001920939200199995,
|
|
"loss": 2.6166,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.4155946097222129,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 0.00019196348085771713,
|
|
"loss": 2.6053,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"epoch": 0.41588004558053865,
|
|
"grad_norm": 0.69140625,
|
|
"learning_rate": 0.0001918330072594074,
|
|
"loss": 2.6113,
|
|
"step": 1457
|
|
},
|
|
{
|
|
"epoch": 0.41616548143886434,
|
|
"grad_norm": 0.6484375,
|
|
"learning_rate": 0.00019170249933213947,
|
|
"loss": 2.6028,
|
|
"step": 1458
|
|
},
|
|
{
|
|
"epoch": 0.41645091729719,
|
|
"grad_norm": 0.765625,
|
|
"learning_rate": 0.00019157195718301067,
|
|
"loss": 2.6048,
|
|
"step": 1459
|
|
},
|
|
{
|
|
"epoch": 0.4167363531555157,
|
|
"grad_norm": 0.7421875,
|
|
"learning_rate": 0.00019144138091914617,
|
|
"loss": 2.6143,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.4170217890138414,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00019131077064769953,
|
|
"loss": 2.6159,
|
|
"step": 1461
|
|
},
|
|
{
|
|
"epoch": 0.4173072248721671,
|
|
"grad_norm": 0.76171875,
|
|
"learning_rate": 0.00019118012647585192,
|
|
"loss": 2.5989,
|
|
"step": 1462
|
|
},
|
|
{
|
|
"epoch": 0.4175926607304928,
|
|
"grad_norm": 0.7421875,
|
|
"learning_rate": 0.00019104944851081244,
|
|
"loss": 2.6203,
|
|
"step": 1463
|
|
},
|
|
{
|
|
"epoch": 0.41787809658881847,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019091873685981786,
|
|
"loss": 2.596,
|
|
"step": 1464
|
|
},
|
|
{
|
|
"epoch": 0.4181635324471442,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.00019078799163013273,
|
|
"loss": 2.5961,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.4184489683054699,
|
|
"grad_norm": 0.8125,
|
|
"learning_rate": 0.000190657212929049,
|
|
"loss": 2.6254,
|
|
"step": 1466
|
|
},
|
|
{
|
|
"epoch": 0.4187344041637956,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.0001905264008638861,
|
|
"loss": 2.616,
|
|
"step": 1467
|
|
},
|
|
{
|
|
"epoch": 0.4190198400221213,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 0.00019039555554199099,
|
|
"loss": 2.635,
|
|
"step": 1468
|
|
},
|
|
{
|
|
"epoch": 0.41930527588044697,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.0001902646770707378,
|
|
"loss": 2.5834,
|
|
"step": 1469
|
|
},
|
|
{
|
|
"epoch": 0.41959071173877266,
|
|
"grad_norm": 0.58203125,
|
|
"learning_rate": 0.00019013376555752782,
|
|
"loss": 2.61,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.41987614759709835,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.00019000282110978958,
|
|
"loss": 2.6072,
|
|
"step": 1471
|
|
},
|
|
{
|
|
"epoch": 0.42016158345542404,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00018987184383497855,
|
|
"loss": 2.5803,
|
|
"step": 1472
|
|
},
|
|
{
|
|
"epoch": 0.4204470193137498,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00018974083384057713,
|
|
"loss": 2.639,
|
|
"step": 1473
|
|
},
|
|
{
|
|
"epoch": 0.42073245517207547,
|
|
"grad_norm": 0.64453125,
|
|
"learning_rate": 0.00018960979123409466,
|
|
"loss": 2.5955,
|
|
"step": 1474
|
|
},
|
|
{
|
|
"epoch": 0.42101789103040116,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001894787161230672,
|
|
"loss": 2.6356,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.42130332688872685,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.0001893476086150574,
|
|
"loss": 2.6224,
|
|
"step": 1476
|
|
},
|
|
{
|
|
"epoch": 0.42158876274705254,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.00018921646881765456,
|
|
"loss": 2.6103,
|
|
"step": 1477
|
|
},
|
|
{
|
|
"epoch": 0.4218741986053782,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.0001890852968384746,
|
|
"loss": 2.6162,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"epoch": 0.4221596344637039,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001889540927851596,
|
|
"loss": 2.628,
|
|
"step": 1479
|
|
},
|
|
{
|
|
"epoch": 0.4224450703220296,
|
|
"grad_norm": 0.65625,
|
|
"learning_rate": 0.0001888228567653781,
|
|
"loss": 2.6217,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.42273050618035535,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00018869158888682494,
|
|
"loss": 2.613,
|
|
"step": 1481
|
|
},
|
|
{
|
|
"epoch": 0.42301594203868104,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00018856028925722104,
|
|
"loss": 2.608,
|
|
"step": 1482
|
|
},
|
|
{
|
|
"epoch": 0.4233013778970067,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00018842895798431327,
|
|
"loss": 2.6083,
|
|
"step": 1483
|
|
},
|
|
{
|
|
"epoch": 0.4235868137553324,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00018829759517587457,
|
|
"loss": 2.6065,
|
|
"step": 1484
|
|
},
|
|
{
|
|
"epoch": 0.4238722496136581,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.00018816620093970387,
|
|
"loss": 2.6158,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.4241576854719838,
|
|
"grad_norm": 0.625,
|
|
"learning_rate": 0.00018803477538362562,
|
|
"loss": 2.628,
|
|
"step": 1486
|
|
},
|
|
{
|
|
"epoch": 0.4244431213303095,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00018790331861549023,
|
|
"loss": 2.6095,
|
|
"step": 1487
|
|
},
|
|
{
|
|
"epoch": 0.42472855718863517,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00018777183074317349,
|
|
"loss": 2.5987,
|
|
"step": 1488
|
|
},
|
|
{
|
|
"epoch": 0.4250139930469609,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.000187640311874577,
|
|
"loss": 2.5805,
|
|
"step": 1489
|
|
},
|
|
{
|
|
"epoch": 0.4252994289052866,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018750876211762752,
|
|
"loss": 2.6163,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.4255848647636123,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00018737718158027734,
|
|
"loss": 2.596,
|
|
"step": 1491
|
|
},
|
|
{
|
|
"epoch": 0.425870300621938,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00018724557037050384,
|
|
"loss": 2.6397,
|
|
"step": 1492
|
|
},
|
|
{
|
|
"epoch": 0.42615573648026367,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.0001871139285963098,
|
|
"loss": 2.6378,
|
|
"step": 1493
|
|
},
|
|
{
|
|
"epoch": 0.42644117233858936,
|
|
"grad_norm": 0.546875,
|
|
"learning_rate": 0.00018698225636572285,
|
|
"loss": 2.6063,
|
|
"step": 1494
|
|
},
|
|
{
|
|
"epoch": 0.42672660819691505,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001868505537867958,
|
|
"loss": 2.6003,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.42701204405524074,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00018671882096760623,
|
|
"loss": 2.595,
|
|
"step": 1496
|
|
},
|
|
{
|
|
"epoch": 0.4272974799135665,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00018658705801625656,
|
|
"loss": 2.5969,
|
|
"step": 1497
|
|
},
|
|
{
|
|
"epoch": 0.42758291577189217,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018645526504087402,
|
|
"loss": 2.6158,
|
|
"step": 1498
|
|
},
|
|
{
|
|
"epoch": 0.42786835163021786,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00018632344214961045,
|
|
"loss": 2.6027,
|
|
"step": 1499
|
|
},
|
|
{
|
|
"epoch": 0.42815378748854355,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0001861915894506421,
|
|
"loss": 2.6258,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.42815378748854355,
|
|
"eval_loss": 2.498450517654419,
|
|
"eval_runtime": 5960.8882,
|
|
"eval_samples_per_second": 10.785,
|
|
"eval_steps_per_second": 10.785,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.42843922334686924,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00018605970705216988,
|
|
"loss": 2.5927,
|
|
"step": 1501
|
|
},
|
|
{
|
|
"epoch": 0.4287246592051949,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00018592779506241902,
|
|
"loss": 2.5965,
|
|
"step": 1502
|
|
},
|
|
{
|
|
"epoch": 0.4290100950635206,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00018579585358963885,
|
|
"loss": 2.6102,
|
|
"step": 1503
|
|
},
|
|
{
|
|
"epoch": 0.4292955309218463,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00018566388274210316,
|
|
"loss": 2.5903,
|
|
"step": 1504
|
|
},
|
|
{
|
|
"epoch": 0.42958096678017205,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018553188262810974,
|
|
"loss": 2.6056,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.42986640263849774,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00018539985335598033,
|
|
"loss": 2.6157,
|
|
"step": 1506
|
|
},
|
|
{
|
|
"epoch": 0.4301518384968234,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00018526779503406059,
|
|
"loss": 2.5769,
|
|
"step": 1507
|
|
},
|
|
{
|
|
"epoch": 0.4304372743551491,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00018513570777072024,
|
|
"loss": 2.6171,
|
|
"step": 1508
|
|
},
|
|
{
|
|
"epoch": 0.4307227102134748,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001850035916743525,
|
|
"loss": 2.5859,
|
|
"step": 1509
|
|
},
|
|
{
|
|
"epoch": 0.4310081460718005,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00018487144685337432,
|
|
"loss": 2.5976,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.4312935819301262,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00018473927341622627,
|
|
"loss": 2.6144,
|
|
"step": 1511
|
|
},
|
|
{
|
|
"epoch": 0.43157901778845187,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.0001846070714713724,
|
|
"loss": 2.6233,
|
|
"step": 1512
|
|
},
|
|
{
|
|
"epoch": 0.4318644536467776,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.0001844748411273001,
|
|
"loss": 2.6009,
|
|
"step": 1513
|
|
},
|
|
{
|
|
"epoch": 0.4321498895051033,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00018434258249252008,
|
|
"loss": 2.6117,
|
|
"step": 1514
|
|
},
|
|
{
|
|
"epoch": 0.432435325363429,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00018421029567556633,
|
|
"loss": 2.6089,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.4327207612217547,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00018407798078499588,
|
|
"loss": 2.5967,
|
|
"step": 1516
|
|
},
|
|
{
|
|
"epoch": 0.43300619708008037,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.0001839456379293889,
|
|
"loss": 2.6026,
|
|
"step": 1517
|
|
},
|
|
{
|
|
"epoch": 0.43329163293840606,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018381326721734833,
|
|
"loss": 2.6104,
|
|
"step": 1518
|
|
},
|
|
{
|
|
"epoch": 0.43357706879673175,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00018368086875750013,
|
|
"loss": 2.6096,
|
|
"step": 1519
|
|
},
|
|
{
|
|
"epoch": 0.43386250465505743,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00018354844265849307,
|
|
"loss": 2.6035,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.4341479405133831,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001834159890289984,
|
|
"loss": 2.6119,
|
|
"step": 1521
|
|
},
|
|
{
|
|
"epoch": 0.43443337637170887,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.00018328350797771018,
|
|
"loss": 2.6295,
|
|
"step": 1522
|
|
},
|
|
{
|
|
"epoch": 0.43471881223003456,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.0001831509996133447,
|
|
"loss": 2.5938,
|
|
"step": 1523
|
|
},
|
|
{
|
|
"epoch": 0.43500424808836025,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.000183018464044641,
|
|
"loss": 2.6174,
|
|
"step": 1524
|
|
},
|
|
{
|
|
"epoch": 0.43528968394668593,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00018288590138036028,
|
|
"loss": 2.6166,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.4355751198050116,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00018275331172928587,
|
|
"loss": 2.6148,
|
|
"step": 1526
|
|
},
|
|
{
|
|
"epoch": 0.4358605556633373,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00018262069520022338,
|
|
"loss": 2.5973,
|
|
"step": 1527
|
|
},
|
|
{
|
|
"epoch": 0.436145991521663,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00018248805190200048,
|
|
"loss": 2.5931,
|
|
"step": 1528
|
|
},
|
|
{
|
|
"epoch": 0.4364314273799887,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001823553819434668,
|
|
"loss": 2.5844,
|
|
"step": 1529
|
|
},
|
|
{
|
|
"epoch": 0.43671686323831443,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018222268543349374,
|
|
"loss": 2.6187,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.4370022990966401,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00018208996248097458,
|
|
"loss": 2.5919,
|
|
"step": 1531
|
|
},
|
|
{
|
|
"epoch": 0.4372877349549658,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00018195721319482438,
|
|
"loss": 2.6071,
|
|
"step": 1532
|
|
},
|
|
{
|
|
"epoch": 0.4375731708132915,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00018182443768397963,
|
|
"loss": 2.6021,
|
|
"step": 1533
|
|
},
|
|
{
|
|
"epoch": 0.4378586066716172,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00018169163605739845,
|
|
"loss": 2.5948,
|
|
"step": 1534
|
|
},
|
|
{
|
|
"epoch": 0.4381440425299429,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0001815588084240604,
|
|
"loss": 2.6145,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.43842947838826857,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.0001814259548929663,
|
|
"loss": 2.5996,
|
|
"step": 1536
|
|
},
|
|
{
|
|
"epoch": 0.43871491424659426,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.0001812930755731383,
|
|
"loss": 2.6011,
|
|
"step": 1537
|
|
},
|
|
{
|
|
"epoch": 0.43900035010492,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00018116017057361972,
|
|
"loss": 2.6185,
|
|
"step": 1538
|
|
},
|
|
{
|
|
"epoch": 0.4392857859632457,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.00018102724000347488,
|
|
"loss": 2.5761,
|
|
"step": 1539
|
|
},
|
|
{
|
|
"epoch": 0.4395712218215714,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00018089428397178908,
|
|
"loss": 2.6193,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.43985665767989707,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.0001807613025876687,
|
|
"loss": 2.6,
|
|
"step": 1541
|
|
},
|
|
{
|
|
"epoch": 0.44014209353822276,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00018062829596024067,
|
|
"loss": 2.5964,
|
|
"step": 1542
|
|
},
|
|
{
|
|
"epoch": 0.44042752939654845,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001804952641986527,
|
|
"loss": 2.5884,
|
|
"step": 1543
|
|
},
|
|
{
|
|
"epoch": 0.44071296525487413,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00018036220741207332,
|
|
"loss": 2.5893,
|
|
"step": 1544
|
|
},
|
|
{
|
|
"epoch": 0.4409984011131998,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.0001802291257096914,
|
|
"loss": 2.5842,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.44128383697152557,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00018009601920071624,
|
|
"loss": 2.6291,
|
|
"step": 1546
|
|
},
|
|
{
|
|
"epoch": 0.44156927282985126,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00017996288799437758,
|
|
"loss": 2.6153,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"epoch": 0.44185470868817694,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00017982973219992548,
|
|
"loss": 2.5752,
|
|
"step": 1548
|
|
},
|
|
{
|
|
"epoch": 0.44214014454650263,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00017969655192663007,
|
|
"loss": 2.5856,
|
|
"step": 1549
|
|
},
|
|
{
|
|
"epoch": 0.4424255804048283,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00017956334728378158,
|
|
"loss": 2.5989,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.442711016263154,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00017943011838069021,
|
|
"loss": 2.621,
|
|
"step": 1551
|
|
},
|
|
{
|
|
"epoch": 0.4429964521214797,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001792968653266863,
|
|
"loss": 2.6003,
|
|
"step": 1552
|
|
},
|
|
{
|
|
"epoch": 0.4432818879798054,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00017916358823111972,
|
|
"loss": 2.6094,
|
|
"step": 1553
|
|
},
|
|
{
|
|
"epoch": 0.44356732383813113,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.0001790302872033601,
|
|
"loss": 2.6167,
|
|
"step": 1554
|
|
},
|
|
{
|
|
"epoch": 0.4438527596964568,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00017889696235279693,
|
|
"loss": 2.576,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.4441381955547825,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00017876361378883903,
|
|
"loss": 2.5914,
|
|
"step": 1556
|
|
},
|
|
{
|
|
"epoch": 0.4444236314131082,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017863024162091478,
|
|
"loss": 2.591,
|
|
"step": 1557
|
|
},
|
|
{
|
|
"epoch": 0.4447090672714339,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.0001784968459584719,
|
|
"loss": 2.6002,
|
|
"step": 1558
|
|
},
|
|
{
|
|
"epoch": 0.4449945031297596,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00017836342691097742,
|
|
"loss": 2.5826,
|
|
"step": 1559
|
|
},
|
|
{
|
|
"epoch": 0.44527993898808527,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.0001782299845879175,
|
|
"loss": 2.5972,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.44556537484641096,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00017809651909879749,
|
|
"loss": 2.5984,
|
|
"step": 1561
|
|
},
|
|
{
|
|
"epoch": 0.4458508107047367,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00017796303055314164,
|
|
"loss": 2.5803,
|
|
"step": 1562
|
|
},
|
|
{
|
|
"epoch": 0.4461362465630624,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017782951906049316,
|
|
"loss": 2.6079,
|
|
"step": 1563
|
|
},
|
|
{
|
|
"epoch": 0.4464216824213881,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00017769598473041422,
|
|
"loss": 2.5998,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"epoch": 0.44670711827971377,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00017756242767248557,
|
|
"loss": 2.5921,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.44699255413803946,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001774288479963066,
|
|
"loss": 2.5799,
|
|
"step": 1566
|
|
},
|
|
{
|
|
"epoch": 0.44727798999636514,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00017729524581149537,
|
|
"loss": 2.639,
|
|
"step": 1567
|
|
},
|
|
{
|
|
"epoch": 0.44756342585469083,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00017716162122768836,
|
|
"loss": 2.613,
|
|
"step": 1568
|
|
},
|
|
{
|
|
"epoch": 0.4478488617130165,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.0001770279743545405,
|
|
"loss": 2.6075,
|
|
"step": 1569
|
|
},
|
|
{
|
|
"epoch": 0.44813429757134227,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017689430530172482,
|
|
"loss": 2.5834,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.44841973342966795,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00017676061417893274,
|
|
"loss": 2.607,
|
|
"step": 1571
|
|
},
|
|
{
|
|
"epoch": 0.44870516928799364,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00017662690109587382,
|
|
"loss": 2.5996,
|
|
"step": 1572
|
|
},
|
|
{
|
|
"epoch": 0.44899060514631933,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00017649316616227538,
|
|
"loss": 2.5941,
|
|
"step": 1573
|
|
},
|
|
{
|
|
"epoch": 0.449276041004645,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001763594094878829,
|
|
"loss": 2.5961,
|
|
"step": 1574
|
|
},
|
|
{
|
|
"epoch": 0.4495614768629707,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00017622563118245972,
|
|
"loss": 2.5923,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.4498469127212964,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00017609183135578675,
|
|
"loss": 2.5981,
|
|
"step": 1576
|
|
},
|
|
{
|
|
"epoch": 0.4501323485796221,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00017595801011766274,
|
|
"loss": 2.6039,
|
|
"step": 1577
|
|
},
|
|
{
|
|
"epoch": 0.45041778443794783,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00017582416757790388,
|
|
"loss": 2.587,
|
|
"step": 1578
|
|
},
|
|
{
|
|
"epoch": 0.4507032202962735,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001756903038463439,
|
|
"loss": 2.5729,
|
|
"step": 1579
|
|
},
|
|
{
|
|
"epoch": 0.4509886561545992,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.0001755564190328339,
|
|
"loss": 2.6028,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.4512740920129249,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00017542251324724237,
|
|
"loss": 2.5784,
|
|
"step": 1581
|
|
},
|
|
{
|
|
"epoch": 0.4515595278712506,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00017528858659945486,
|
|
"loss": 2.6228,
|
|
"step": 1582
|
|
},
|
|
{
|
|
"epoch": 0.4518449637295763,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00017515463919937413,
|
|
"loss": 2.6181,
|
|
"step": 1583
|
|
},
|
|
{
|
|
"epoch": 0.45213039958790197,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00017502067115691996,
|
|
"loss": 2.5915,
|
|
"step": 1584
|
|
},
|
|
{
|
|
"epoch": 0.45241583544622765,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.0001748866825820291,
|
|
"loss": 2.6104,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.45270127130455334,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00017475267358465504,
|
|
"loss": 2.5913,
|
|
"step": 1586
|
|
},
|
|
{
|
|
"epoch": 0.4529867071628791,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.00017461864427476814,
|
|
"loss": 2.6017,
|
|
"step": 1587
|
|
},
|
|
{
|
|
"epoch": 0.4532721430212048,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.0001744845947623554,
|
|
"loss": 2.6186,
|
|
"step": 1588
|
|
},
|
|
{
|
|
"epoch": 0.45355757887953047,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00017435052515742038,
|
|
"loss": 2.5961,
|
|
"step": 1589
|
|
},
|
|
{
|
|
"epoch": 0.45384301473785615,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00017421643556998312,
|
|
"loss": 2.5929,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.45412845059618184,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.0001740823261100801,
|
|
"loss": 2.5902,
|
|
"step": 1591
|
|
},
|
|
{
|
|
"epoch": 0.45441388645450753,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.0001739481968877641,
|
|
"loss": 2.5817,
|
|
"step": 1592
|
|
},
|
|
{
|
|
"epoch": 0.4546993223128332,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00017381404801310404,
|
|
"loss": 2.5856,
|
|
"step": 1593
|
|
},
|
|
{
|
|
"epoch": 0.4549847581711589,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017367987959618505,
|
|
"loss": 2.5742,
|
|
"step": 1594
|
|
},
|
|
{
|
|
"epoch": 0.45527019402948465,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00017354569174710834,
|
|
"loss": 2.5916,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.45555562988781034,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017341148457599096,
|
|
"loss": 2.5964,
|
|
"step": 1596
|
|
},
|
|
{
|
|
"epoch": 0.45584106574613603,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00017327725819296576,
|
|
"loss": 2.597,
|
|
"step": 1597
|
|
},
|
|
{
|
|
"epoch": 0.4561265016044617,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.0001731430127081816,
|
|
"loss": 2.5921,
|
|
"step": 1598
|
|
},
|
|
{
|
|
"epoch": 0.4564119374627874,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00017300874823180282,
|
|
"loss": 2.61,
|
|
"step": 1599
|
|
},
|
|
{
|
|
"epoch": 0.4566973733211131,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00017287446487400935,
|
|
"loss": 2.5985,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.4569828091794388,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00017274016274499665,
|
|
"loss": 2.6079,
|
|
"step": 1601
|
|
},
|
|
{
|
|
"epoch": 0.4572682450377645,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00017260584195497567,
|
|
"loss": 2.5797,
|
|
"step": 1602
|
|
},
|
|
{
|
|
"epoch": 0.4575536808960902,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00017247150261417255,
|
|
"loss": 2.6106,
|
|
"step": 1603
|
|
},
|
|
{
|
|
"epoch": 0.4578391167544159,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.0001723371448328287,
|
|
"loss": 2.5846,
|
|
"step": 1604
|
|
},
|
|
{
|
|
"epoch": 0.4581245526127416,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00017220276872120072,
|
|
"loss": 2.5763,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.4584099884710673,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00017206837438956004,
|
|
"loss": 2.5878,
|
|
"step": 1606
|
|
},
|
|
{
|
|
"epoch": 0.458695424329393,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00017193396194819328,
|
|
"loss": 2.5931,
|
|
"step": 1607
|
|
},
|
|
{
|
|
"epoch": 0.45898086018771866,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00017179953150740193,
|
|
"loss": 2.5835,
|
|
"step": 1608
|
|
},
|
|
{
|
|
"epoch": 0.45926629604604435,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.000171665083177502,
|
|
"loss": 2.6094,
|
|
"step": 1609
|
|
},
|
|
{
|
|
"epoch": 0.45955173190437004,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017153061706882443,
|
|
"loss": 2.6024,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.4598371677626958,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0001713961332917146,
|
|
"loss": 2.618,
|
|
"step": 1611
|
|
},
|
|
{
|
|
"epoch": 0.4601226036210215,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00017126163195653254,
|
|
"loss": 2.6115,
|
|
"step": 1612
|
|
},
|
|
{
|
|
"epoch": 0.46040803947934716,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017112711317365247,
|
|
"loss": 2.5529,
|
|
"step": 1613
|
|
},
|
|
{
|
|
"epoch": 0.46069347533767285,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00017099257705346314,
|
|
"loss": 2.6051,
|
|
"step": 1614
|
|
},
|
|
{
|
|
"epoch": 0.46097891119599854,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00017085802370636743,
|
|
"loss": 2.6073,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.46126434705432423,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00017072345324278232,
|
|
"loss": 2.5969,
|
|
"step": 1616
|
|
},
|
|
{
|
|
"epoch": 0.4615497829126499,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00017058886577313892,
|
|
"loss": 2.6139,
|
|
"step": 1617
|
|
},
|
|
{
|
|
"epoch": 0.4618352187709756,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00017045426140788224,
|
|
"loss": 2.5696,
|
|
"step": 1618
|
|
},
|
|
{
|
|
"epoch": 0.46212065462930135,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00017031964025747117,
|
|
"loss": 2.5835,
|
|
"step": 1619
|
|
},
|
|
{
|
|
"epoch": 0.46240609048762704,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00017018500243237838,
|
|
"loss": 2.5731,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.46269152634595273,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00017005034804309027,
|
|
"loss": 2.6096,
|
|
"step": 1621
|
|
},
|
|
{
|
|
"epoch": 0.4629769622042784,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00016991567720010668,
|
|
"loss": 2.6063,
|
|
"step": 1622
|
|
},
|
|
{
|
|
"epoch": 0.4632623980626041,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00016978099001394112,
|
|
"loss": 2.6002,
|
|
"step": 1623
|
|
},
|
|
{
|
|
"epoch": 0.4635478339209298,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00016964628659512046,
|
|
"loss": 2.5955,
|
|
"step": 1624
|
|
},
|
|
{
|
|
"epoch": 0.4638332697792555,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00016951156705418484,
|
|
"loss": 2.5975,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.4641187056375812,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.00016937683150168765,
|
|
"loss": 2.5944,
|
|
"step": 1626
|
|
},
|
|
{
|
|
"epoch": 0.4644041414959069,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0001692420800481955,
|
|
"loss": 2.5734,
|
|
"step": 1627
|
|
},
|
|
{
|
|
"epoch": 0.4646895773542326,
|
|
"grad_norm": 0.73828125,
|
|
"learning_rate": 0.000169107312804288,
|
|
"loss": 2.6232,
|
|
"step": 1628
|
|
},
|
|
{
|
|
"epoch": 0.4649750132125583,
|
|
"grad_norm": 0.8046875,
|
|
"learning_rate": 0.0001689725298805576,
|
|
"loss": 2.5985,
|
|
"step": 1629
|
|
},
|
|
{
|
|
"epoch": 0.465260449070884,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00016883773138760976,
|
|
"loss": 2.578,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.4655458849292097,
|
|
"grad_norm": 0.83203125,
|
|
"learning_rate": 0.00016870291743606273,
|
|
"loss": 2.5762,
|
|
"step": 1631
|
|
},
|
|
{
|
|
"epoch": 0.46583132078753536,
|
|
"grad_norm": 0.8359375,
|
|
"learning_rate": 0.0001685680881365474,
|
|
"loss": 2.5714,
|
|
"step": 1632
|
|
},
|
|
{
|
|
"epoch": 0.46611675664586105,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 0.00016843324359970712,
|
|
"loss": 2.5721,
|
|
"step": 1633
|
|
},
|
|
{
|
|
"epoch": 0.46640219250418674,
|
|
"grad_norm": 0.7109375,
|
|
"learning_rate": 0.00016829838393619796,
|
|
"loss": 2.6092,
|
|
"step": 1634
|
|
},
|
|
{
|
|
"epoch": 0.4666876283625125,
|
|
"grad_norm": 0.6875,
|
|
"learning_rate": 0.00016816350925668837,
|
|
"loss": 2.5973,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.4669730642208382,
|
|
"grad_norm": 0.85546875,
|
|
"learning_rate": 0.000168028619671859,
|
|
"loss": 2.6003,
|
|
"step": 1636
|
|
},
|
|
{
|
|
"epoch": 0.46725850007916386,
|
|
"grad_norm": 0.6171875,
|
|
"learning_rate": 0.00016789371529240271,
|
|
"loss": 2.612,
|
|
"step": 1637
|
|
},
|
|
{
|
|
"epoch": 0.46754393593748955,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 0.0001677587962290248,
|
|
"loss": 2.5903,
|
|
"step": 1638
|
|
},
|
|
{
|
|
"epoch": 0.46782937179581524,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00016762386259244224,
|
|
"loss": 2.5791,
|
|
"step": 1639
|
|
},
|
|
{
|
|
"epoch": 0.46811480765414093,
|
|
"grad_norm": 0.65234375,
|
|
"learning_rate": 0.0001674889144933842,
|
|
"loss": 2.6103,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.4684002435124666,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 0.00016735395204259162,
|
|
"loss": 2.5757,
|
|
"step": 1641
|
|
},
|
|
{
|
|
"epoch": 0.4686856793707923,
|
|
"grad_norm": 0.60546875,
|
|
"learning_rate": 0.00016721897535081724,
|
|
"loss": 2.5925,
|
|
"step": 1642
|
|
},
|
|
{
|
|
"epoch": 0.46897111522911805,
|
|
"grad_norm": 0.66796875,
|
|
"learning_rate": 0.00016708398452882552,
|
|
"loss": 2.6213,
|
|
"step": 1643
|
|
},
|
|
{
|
|
"epoch": 0.46925655108744374,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00016694897968739245,
|
|
"loss": 2.5948,
|
|
"step": 1644
|
|
},
|
|
{
|
|
"epoch": 0.46954198694576943,
|
|
"grad_norm": 0.6015625,
|
|
"learning_rate": 0.0001668139609373056,
|
|
"loss": 2.5849,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.4698274228040951,
|
|
"grad_norm": 0.62109375,
|
|
"learning_rate": 0.00016667892838936389,
|
|
"loss": 2.6265,
|
|
"step": 1646
|
|
},
|
|
{
|
|
"epoch": 0.4701128586624208,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00016654388215437755,
|
|
"loss": 2.6059,
|
|
"step": 1647
|
|
},
|
|
{
|
|
"epoch": 0.4703982945207465,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.0001664088223431682,
|
|
"loss": 2.6298,
|
|
"step": 1648
|
|
},
|
|
{
|
|
"epoch": 0.4706837303790722,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.0001662737490665683,
|
|
"loss": 2.6045,
|
|
"step": 1649
|
|
},
|
|
{
|
|
"epoch": 0.4709691662373979,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001661386624354217,
|
|
"loss": 2.6153,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.4712546020957236,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00016600356256058296,
|
|
"loss": 2.5974,
|
|
"step": 1651
|
|
},
|
|
{
|
|
"epoch": 0.4715400379540493,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00016586844955291768,
|
|
"loss": 2.5846,
|
|
"step": 1652
|
|
},
|
|
{
|
|
"epoch": 0.471825473812375,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00016573332352330203,
|
|
"loss": 2.5888,
|
|
"step": 1653
|
|
},
|
|
{
|
|
"epoch": 0.4721109096707007,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00016559818458262304,
|
|
"loss": 2.5823,
|
|
"step": 1654
|
|
},
|
|
{
|
|
"epoch": 0.4723963455290264,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00016546303284177837,
|
|
"loss": 2.5973,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.47268178138735206,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.000165327868411676,
|
|
"loss": 2.5688,
|
|
"step": 1656
|
|
},
|
|
{
|
|
"epoch": 0.47296721724567775,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00016519269140323443,
|
|
"loss": 2.584,
|
|
"step": 1657
|
|
},
|
|
{
|
|
"epoch": 0.47325265310400344,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00016505750192738253,
|
|
"loss": 2.5829,
|
|
"step": 1658
|
|
},
|
|
{
|
|
"epoch": 0.47353808896232913,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00016492230009505928,
|
|
"loss": 2.5653,
|
|
"step": 1659
|
|
},
|
|
{
|
|
"epoch": 0.4738235248206549,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.0001647870860172139,
|
|
"loss": 2.6081,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.47410896067898056,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00016465185980480562,
|
|
"loss": 2.5732,
|
|
"step": 1661
|
|
},
|
|
{
|
|
"epoch": 0.47439439653730625,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0001645166215688036,
|
|
"loss": 2.5776,
|
|
"step": 1662
|
|
},
|
|
{
|
|
"epoch": 0.47467983239563194,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.000164381371420187,
|
|
"loss": 2.5898,
|
|
"step": 1663
|
|
},
|
|
{
|
|
"epoch": 0.47496526825395763,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00016424610946994453,
|
|
"loss": 2.6061,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"epoch": 0.4752507041122833,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00016411083582907476,
|
|
"loss": 2.5932,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.475536139970609,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.0001639755506085858,
|
|
"loss": 2.5887,
|
|
"step": 1666
|
|
},
|
|
{
|
|
"epoch": 0.4758215758289347,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.0001638402539194953,
|
|
"loss": 2.597,
|
|
"step": 1667
|
|
},
|
|
{
|
|
"epoch": 0.47610701168726044,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00016370494587283026,
|
|
"loss": 2.5624,
|
|
"step": 1668
|
|
},
|
|
{
|
|
"epoch": 0.47639244754558613,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00016356962657962693,
|
|
"loss": 2.571,
|
|
"step": 1669
|
|
},
|
|
{
|
|
"epoch": 0.4766778834039118,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00016343429615093104,
|
|
"loss": 2.5971,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.4769633192622375,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00016329895469779725,
|
|
"loss": 2.5999,
|
|
"step": 1671
|
|
},
|
|
{
|
|
"epoch": 0.4772487551205632,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00016316360233128933,
|
|
"loss": 2.5949,
|
|
"step": 1672
|
|
},
|
|
{
|
|
"epoch": 0.4775341909788889,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.0001630282391624799,
|
|
"loss": 2.599,
|
|
"step": 1673
|
|
},
|
|
{
|
|
"epoch": 0.4778196268372146,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00016289286530245064,
|
|
"loss": 2.5983,
|
|
"step": 1674
|
|
},
|
|
{
|
|
"epoch": 0.47810506269554026,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.00016275748086229193,
|
|
"loss": 2.5857,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.478390498553866,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.0001626220859531027,
|
|
"loss": 2.5945,
|
|
"step": 1676
|
|
},
|
|
{
|
|
"epoch": 0.4786759344121917,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00016248668068599066,
|
|
"loss": 2.6017,
|
|
"step": 1677
|
|
},
|
|
{
|
|
"epoch": 0.4789613702705174,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.0001623512651720719,
|
|
"loss": 2.6014,
|
|
"step": 1678
|
|
},
|
|
{
|
|
"epoch": 0.4792468061288431,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00016221583952247097,
|
|
"loss": 2.5712,
|
|
"step": 1679
|
|
},
|
|
{
|
|
"epoch": 0.47953224198716876,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00016208040384832072,
|
|
"loss": 2.5989,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.47981767784549445,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00016194495826076224,
|
|
"loss": 2.5548,
|
|
"step": 1681
|
|
},
|
|
{
|
|
"epoch": 0.48010311370382014,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.0001618095028709447,
|
|
"loss": 2.5883,
|
|
"step": 1682
|
|
},
|
|
{
|
|
"epoch": 0.48038854956214583,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0001616740377900254,
|
|
"loss": 2.6151,
|
|
"step": 1683
|
|
},
|
|
{
|
|
"epoch": 0.4806739854204716,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00016153856312916957,
|
|
"loss": 2.5432,
|
|
"step": 1684
|
|
},
|
|
{
|
|
"epoch": 0.48095942127879726,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 0.00016140307899955024,
|
|
"loss": 2.5735,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.48124485713712295,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 0.00016126758551234825,
|
|
"loss": 2.5766,
|
|
"step": 1686
|
|
},
|
|
{
|
|
"epoch": 0.48153029299544864,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.0001611320827787522,
|
|
"loss": 2.5697,
|
|
"step": 1687
|
|
},
|
|
{
|
|
"epoch": 0.4818157288537743,
|
|
"grad_norm": 0.5859375,
|
|
"learning_rate": 0.00016099657090995812,
|
|
"loss": 2.5824,
|
|
"step": 1688
|
|
},
|
|
{
|
|
"epoch": 0.4821011647121,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.0001608610500171696,
|
|
"loss": 2.5885,
|
|
"step": 1689
|
|
},
|
|
{
|
|
"epoch": 0.4823866005704257,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00016072552021159775,
|
|
"loss": 2.5984,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.4826720364287514,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.0001605899816044608,
|
|
"loss": 2.6025,
|
|
"step": 1691
|
|
},
|
|
{
|
|
"epoch": 0.48295747228707714,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00016045443430698437,
|
|
"loss": 2.6107,
|
|
"step": 1692
|
|
},
|
|
{
|
|
"epoch": 0.4832429081454028,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00016031887843040104,
|
|
"loss": 2.5978,
|
|
"step": 1693
|
|
},
|
|
{
|
|
"epoch": 0.4835283440037285,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00016018331408595063,
|
|
"loss": 2.5974,
|
|
"step": 1694
|
|
},
|
|
{
|
|
"epoch": 0.4838137798620542,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00016004774138487983,
|
|
"loss": 2.6113,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.4840992157203799,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.00015991216043844208,
|
|
"loss": 2.5766,
|
|
"step": 1696
|
|
},
|
|
{
|
|
"epoch": 0.4843846515787056,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00015977657135789764,
|
|
"loss": 2.5671,
|
|
"step": 1697
|
|
},
|
|
{
|
|
"epoch": 0.48467008743703127,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.0001596409742545136,
|
|
"loss": 2.6138,
|
|
"step": 1698
|
|
},
|
|
{
|
|
"epoch": 0.48495552329535696,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.00015950536923956346,
|
|
"loss": 2.5962,
|
|
"step": 1699
|
|
},
|
|
{
|
|
"epoch": 0.4852409591536827,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00015936975642432725,
|
|
"loss": 2.5992,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.4855263950120084,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00015923413592009144,
|
|
"loss": 2.5925,
|
|
"step": 1701
|
|
},
|
|
{
|
|
"epoch": 0.4858118308703341,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00015909850783814874,
|
|
"loss": 2.5949,
|
|
"step": 1702
|
|
},
|
|
{
|
|
"epoch": 0.48609726672865977,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00015896287228979816,
|
|
"loss": 2.5671,
|
|
"step": 1703
|
|
},
|
|
{
|
|
"epoch": 0.48638270258698546,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00015882722938634477,
|
|
"loss": 2.5684,
|
|
"step": 1704
|
|
},
|
|
{
|
|
"epoch": 0.48666813844531115,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00015869157923909978,
|
|
"loss": 2.59,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.48695357430363684,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00015855592195938018,
|
|
"loss": 2.587,
|
|
"step": 1706
|
|
},
|
|
{
|
|
"epoch": 0.4872390101619625,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00015842025765850894,
|
|
"loss": 2.5942,
|
|
"step": 1707
|
|
},
|
|
{
|
|
"epoch": 0.48752444602028827,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00015828458644781478,
|
|
"loss": 2.604,
|
|
"step": 1708
|
|
},
|
|
{
|
|
"epoch": 0.48780988187861396,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00015814890843863204,
|
|
"loss": 2.5862,
|
|
"step": 1709
|
|
},
|
|
{
|
|
"epoch": 0.48809531773693965,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00015801322374230068,
|
|
"loss": 2.5813,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.48838075359526534,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00015787753247016608,
|
|
"loss": 2.5988,
|
|
"step": 1711
|
|
},
|
|
{
|
|
"epoch": 0.488666189453591,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00015774183473357914,
|
|
"loss": 2.5786,
|
|
"step": 1712
|
|
},
|
|
{
|
|
"epoch": 0.4889516253119167,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00015760613064389595,
|
|
"loss": 2.5616,
|
|
"step": 1713
|
|
},
|
|
{
|
|
"epoch": 0.4892370611702424,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.00015747042031247785,
|
|
"loss": 2.5828,
|
|
"step": 1714
|
|
},
|
|
{
|
|
"epoch": 0.4895224970285681,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.0001573347038506914,
|
|
"loss": 2.565,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.48980793288689384,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00015719898136990794,
|
|
"loss": 2.5747,
|
|
"step": 1716
|
|
},
|
|
{
|
|
"epoch": 0.4900933687452195,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00015706325298150403,
|
|
"loss": 2.5779,
|
|
"step": 1717
|
|
},
|
|
{
|
|
"epoch": 0.4903788046035452,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.00015692751879686095,
|
|
"loss": 2.5682,
|
|
"step": 1718
|
|
},
|
|
{
|
|
"epoch": 0.4906642404618709,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00015679177892736468,
|
|
"loss": 2.5675,
|
|
"step": 1719
|
|
},
|
|
{
|
|
"epoch": 0.4909496763201966,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 0.00015665603348440595,
|
|
"loss": 2.5824,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.4912351121785223,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001565202825793801,
|
|
"loss": 2.5604,
|
|
"step": 1721
|
|
},
|
|
{
|
|
"epoch": 0.49152054803684797,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.0001563845263236868,
|
|
"loss": 2.5612,
|
|
"step": 1722
|
|
},
|
|
{
|
|
"epoch": 0.49180598389517366,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.0001562487648287303,
|
|
"loss": 2.6068,
|
|
"step": 1723
|
|
},
|
|
{
|
|
"epoch": 0.4920914197534994,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.000156112998205919,
|
|
"loss": 2.5695,
|
|
"step": 1724
|
|
},
|
|
{
|
|
"epoch": 0.4923768556118251,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00015597722656666554,
|
|
"loss": 2.5929,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.4926622914701508,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00015584145002238677,
|
|
"loss": 2.5656,
|
|
"step": 1726
|
|
},
|
|
{
|
|
"epoch": 0.49294772732847647,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00015570566868450343,
|
|
"loss": 2.5609,
|
|
"step": 1727
|
|
},
|
|
{
|
|
"epoch": 0.49323316318680216,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00015556988266444028,
|
|
"loss": 2.5954,
|
|
"step": 1728
|
|
},
|
|
{
|
|
"epoch": 0.49351859904512785,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.0001554340920736259,
|
|
"loss": 2.5662,
|
|
"step": 1729
|
|
},
|
|
{
|
|
"epoch": 0.49380403490345354,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.00015529829702349266,
|
|
"loss": 2.6074,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.4940894707617792,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.0001551624976254765,
|
|
"loss": 2.593,
|
|
"step": 1731
|
|
},
|
|
{
|
|
"epoch": 0.4943749066201049,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00015502669399101695,
|
|
"loss": 2.6089,
|
|
"step": 1732
|
|
},
|
|
{
|
|
"epoch": 0.49466034247843066,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00015489088623155716,
|
|
"loss": 2.5917,
|
|
"step": 1733
|
|
},
|
|
{
|
|
"epoch": 0.49494577833675635,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00015475507445854343,
|
|
"loss": 2.566,
|
|
"step": 1734
|
|
},
|
|
{
|
|
"epoch": 0.49523121419508204,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00015461925878342556,
|
|
"loss": 2.5928,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.4955166500534077,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00015448343931765635,
|
|
"loss": 2.5719,
|
|
"step": 1736
|
|
},
|
|
{
|
|
"epoch": 0.4958020859117334,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.000154347616172692,
|
|
"loss": 2.5568,
|
|
"step": 1737
|
|
},
|
|
{
|
|
"epoch": 0.4960875217700591,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00015421178945999143,
|
|
"loss": 2.5836,
|
|
"step": 1738
|
|
},
|
|
{
|
|
"epoch": 0.4963729576283848,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00015407595929101665,
|
|
"loss": 2.5957,
|
|
"step": 1739
|
|
},
|
|
{
|
|
"epoch": 0.4966583934867105,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.0001539401257772324,
|
|
"loss": 2.6004,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.4969438293450362,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001538042890301064,
|
|
"loss": 2.5866,
|
|
"step": 1741
|
|
},
|
|
{
|
|
"epoch": 0.4972292652033619,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00015366844916110868,
|
|
"loss": 2.5744,
|
|
"step": 1742
|
|
},
|
|
{
|
|
"epoch": 0.4975147010616876,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00015353260628171212,
|
|
"loss": 2.6165,
|
|
"step": 1743
|
|
},
|
|
{
|
|
"epoch": 0.4978001369200133,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.0001533967605033919,
|
|
"loss": 2.5778,
|
|
"step": 1744
|
|
},
|
|
{
|
|
"epoch": 0.498085572778339,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00015326091193762568,
|
|
"loss": 2.5816,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.49837100863666467,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00015312506069589335,
|
|
"loss": 2.6123,
|
|
"step": 1746
|
|
},
|
|
{
|
|
"epoch": 0.49865644449499036,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00015298920688967702,
|
|
"loss": 2.5834,
|
|
"step": 1747
|
|
},
|
|
{
|
|
"epoch": 0.49894188035331605,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00015285335063046089,
|
|
"loss": 2.5644,
|
|
"step": 1748
|
|
},
|
|
{
|
|
"epoch": 0.4992273162116418,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00015271749202973116,
|
|
"loss": 2.5766,
|
|
"step": 1749
|
|
},
|
|
{
|
|
"epoch": 0.4995127520699675,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.000152581631198976,
|
|
"loss": 2.5764,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.4995127520699675,
|
|
"eval_loss": 2.4794108867645264,
|
|
"eval_runtime": 6003.2988,
|
|
"eval_samples_per_second": 10.708,
|
|
"eval_steps_per_second": 10.708,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.49979818792829317,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00015244576824968538,
|
|
"loss": 2.5287,
|
|
"step": 1751
|
|
},
|
|
{
|
|
"epoch": 0.5000836237866189,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.000152309903293351,
|
|
"loss": 2.5808,
|
|
"step": 1752
|
|
},
|
|
{
|
|
"epoch": 0.5003690596449446,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00015217403644146626,
|
|
"loss": 2.6024,
|
|
"step": 1753
|
|
},
|
|
{
|
|
"epoch": 0.5006544955032702,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.000152038167805526,
|
|
"loss": 2.6072,
|
|
"step": 1754
|
|
},
|
|
{
|
|
"epoch": 0.500939931361596,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00015190229749702664,
|
|
"loss": 2.5662,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.5012253672199216,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00015176642562746587,
|
|
"loss": 2.5949,
|
|
"step": 1756
|
|
},
|
|
{
|
|
"epoch": 0.5015108030782474,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.0001516305523083428,
|
|
"loss": 2.5952,
|
|
"step": 1757
|
|
},
|
|
{
|
|
"epoch": 0.501796238936573,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 0.00015149467765115764,
|
|
"loss": 2.5761,
|
|
"step": 1758
|
|
},
|
|
{
|
|
"epoch": 0.5020816747948987,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.0001513588017674117,
|
|
"loss": 2.5776,
|
|
"step": 1759
|
|
},
|
|
{
|
|
"epoch": 0.5023671106532244,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.0001512229247686072,
|
|
"loss": 2.5913,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.5026525465115501,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00015108704676624756,
|
|
"loss": 2.6031,
|
|
"step": 1761
|
|
},
|
|
{
|
|
"epoch": 0.5029379823698759,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00015095116787183668,
|
|
"loss": 2.5457,
|
|
"step": 1762
|
|
},
|
|
{
|
|
"epoch": 0.5032234182282015,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001508152881968795,
|
|
"loss": 2.5609,
|
|
"step": 1763
|
|
},
|
|
{
|
|
"epoch": 0.5035088540865272,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.00015067940785288135,
|
|
"loss": 2.6055,
|
|
"step": 1764
|
|
},
|
|
{
|
|
"epoch": 0.5037942899448529,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001505435269513482,
|
|
"loss": 2.597,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.5040797258031786,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00015040764560378658,
|
|
"loss": 2.5936,
|
|
"step": 1766
|
|
},
|
|
{
|
|
"epoch": 0.5043651616615042,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.00015027176392170326,
|
|
"loss": 2.5551,
|
|
"step": 1767
|
|
},
|
|
{
|
|
"epoch": 0.50465059751983,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.00015013588201660529,
|
|
"loss": 2.5881,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 0.5049360333781557,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00015,
|
|
"loss": 2.5998,
|
|
"step": 1769
|
|
},
|
|
{
|
|
"epoch": 0.5052214692364814,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.0001498641179833947,
|
|
"loss": 2.58,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.5055069050948071,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00014972823607829674,
|
|
"loss": 2.5808,
|
|
"step": 1771
|
|
},
|
|
{
|
|
"epoch": 0.5057923409531327,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00014959235439621343,
|
|
"loss": 2.575,
|
|
"step": 1772
|
|
},
|
|
{
|
|
"epoch": 0.5060777768114585,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.00014945647304865175,
|
|
"loss": 2.5957,
|
|
"step": 1773
|
|
},
|
|
{
|
|
"epoch": 0.5063632126697841,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00014932059214711868,
|
|
"loss": 2.5831,
|
|
"step": 1774
|
|
},
|
|
{
|
|
"epoch": 0.5066486485281099,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.00014918471180312053,
|
|
"loss": 2.5812,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.5069340843864355,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.0001490488321281633,
|
|
"loss": 2.5925,
|
|
"step": 1776
|
|
},
|
|
{
|
|
"epoch": 0.5072195202447612,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00014891295323375244,
|
|
"loss": 2.5934,
|
|
"step": 1777
|
|
},
|
|
{
|
|
"epoch": 0.507504956103087,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.0001487770752313928,
|
|
"loss": 2.5923,
|
|
"step": 1778
|
|
},
|
|
{
|
|
"epoch": 0.5077903919614126,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00014864119823258836,
|
|
"loss": 2.5811,
|
|
"step": 1779
|
|
},
|
|
{
|
|
"epoch": 0.5080758278197384,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.00014850532234884236,
|
|
"loss": 2.5726,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.508361263678064,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00014836944769165716,
|
|
"loss": 2.57,
|
|
"step": 1781
|
|
},
|
|
{
|
|
"epoch": 0.5086466995363897,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.0001482335743725341,
|
|
"loss": 2.584,
|
|
"step": 1782
|
|
},
|
|
{
|
|
"epoch": 0.5089321353947154,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00014809770250297336,
|
|
"loss": 2.5903,
|
|
"step": 1783
|
|
},
|
|
{
|
|
"epoch": 0.5092175712530411,
|
|
"grad_norm": 0.51171875,
|
|
"learning_rate": 0.000147961832194474,
|
|
"loss": 2.6009,
|
|
"step": 1784
|
|
},
|
|
{
|
|
"epoch": 0.5095030071113669,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00014782596355853374,
|
|
"loss": 2.6057,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.5097884429696925,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00014769009670664897,
|
|
"loss": 2.5661,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"epoch": 0.5100738788280182,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.0001475542317503146,
|
|
"loss": 2.5986,
|
|
"step": 1787
|
|
},
|
|
{
|
|
"epoch": 0.5103593146863439,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 0.000147418368801024,
|
|
"loss": 2.5837,
|
|
"step": 1788
|
|
},
|
|
{
|
|
"epoch": 0.5106447505446696,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.0001472825079702688,
|
|
"loss": 2.5738,
|
|
"step": 1789
|
|
},
|
|
{
|
|
"epoch": 0.5109301864029953,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.0001471466493695391,
|
|
"loss": 2.5681,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.511215622261321,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00014701079311032298,
|
|
"loss": 2.5817,
|
|
"step": 1791
|
|
},
|
|
{
|
|
"epoch": 0.5115010581196466,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.00014687493930410663,
|
|
"loss": 2.5813,
|
|
"step": 1792
|
|
},
|
|
{
|
|
"epoch": 0.5117864939779724,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00014673908806237432,
|
|
"loss": 2.5893,
|
|
"step": 1793
|
|
},
|
|
{
|
|
"epoch": 0.5120719298362981,
|
|
"grad_norm": 0.498046875,
|
|
"learning_rate": 0.0001466032394966081,
|
|
"loss": 2.6104,
|
|
"step": 1794
|
|
},
|
|
{
|
|
"epoch": 0.5123573656946238,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.0001464673937182879,
|
|
"loss": 2.6105,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.5126428015529495,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00014633155083889132,
|
|
"loss": 2.6015,
|
|
"step": 1796
|
|
},
|
|
{
|
|
"epoch": 0.5129282374112751,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00014619571096989359,
|
|
"loss": 2.578,
|
|
"step": 1797
|
|
},
|
|
{
|
|
"epoch": 0.5132136732696009,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.00014605987422276756,
|
|
"loss": 2.5755,
|
|
"step": 1798
|
|
},
|
|
{
|
|
"epoch": 0.5134991091279265,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00014592404070898335,
|
|
"loss": 2.5822,
|
|
"step": 1799
|
|
},
|
|
{
|
|
"epoch": 0.5137845449862523,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00014578821054000854,
|
|
"loss": 2.5701,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.514069980844578,
|
|
"grad_norm": 0.51953125,
|
|
"learning_rate": 0.000145652383827308,
|
|
"loss": 2.5652,
|
|
"step": 1801
|
|
},
|
|
{
|
|
"epoch": 0.5143554167029036,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00014551656068234362,
|
|
"loss": 2.5589,
|
|
"step": 1802
|
|
},
|
|
{
|
|
"epoch": 0.5146408525612294,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00014538074121657447,
|
|
"loss": 2.5928,
|
|
"step": 1803
|
|
},
|
|
{
|
|
"epoch": 0.514926288419555,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00014524492554145657,
|
|
"loss": 2.5787,
|
|
"step": 1804
|
|
},
|
|
{
|
|
"epoch": 0.5152117242778808,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.0001451091137684428,
|
|
"loss": 2.6031,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.5154971601362064,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00014497330600898297,
|
|
"loss": 2.6,
|
|
"step": 1806
|
|
},
|
|
{
|
|
"epoch": 0.5157825959945321,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.0001448375023745235,
|
|
"loss": 2.5984,
|
|
"step": 1807
|
|
},
|
|
{
|
|
"epoch": 0.5160680318528578,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.00014470170297650734,
|
|
"loss": 2.5901,
|
|
"step": 1808
|
|
},
|
|
{
|
|
"epoch": 0.5163534677111835,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00014456590792637407,
|
|
"loss": 2.555,
|
|
"step": 1809
|
|
},
|
|
{
|
|
"epoch": 0.5166389035695093,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.0001444301173355597,
|
|
"loss": 2.5745,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.5169243394278349,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 0.0001442943313154966,
|
|
"loss": 2.5377,
|
|
"step": 1811
|
|
},
|
|
{
|
|
"epoch": 0.5172097752861606,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00014415854997761328,
|
|
"loss": 2.5617,
|
|
"step": 1812
|
|
},
|
|
{
|
|
"epoch": 0.5174952111444863,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.0001440227734333344,
|
|
"loss": 2.5987,
|
|
"step": 1813
|
|
},
|
|
{
|
|
"epoch": 0.517780647002812,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.000143887001794081,
|
|
"loss": 2.5686,
|
|
"step": 1814
|
|
},
|
|
{
|
|
"epoch": 0.5180660828611376,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00014375123517126968,
|
|
"loss": 2.5911,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.5183515187194634,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00014361547367631317,
|
|
"loss": 2.5687,
|
|
"step": 1816
|
|
},
|
|
{
|
|
"epoch": 0.518636954577789,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00014347971742061989,
|
|
"loss": 2.6098,
|
|
"step": 1817
|
|
},
|
|
{
|
|
"epoch": 0.5189223904361148,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00014334396651559405,
|
|
"loss": 2.5648,
|
|
"step": 1818
|
|
},
|
|
{
|
|
"epoch": 0.5192078262944405,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 0.00014320822107263532,
|
|
"loss": 2.583,
|
|
"step": 1819
|
|
},
|
|
{
|
|
"epoch": 0.5194932621527661,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00014307248120313908,
|
|
"loss": 2.5763,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.5197786980110919,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00014293674701849595,
|
|
"loss": 2.5835,
|
|
"step": 1821
|
|
},
|
|
{
|
|
"epoch": 0.5200641338694175,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00014280101863009203,
|
|
"loss": 2.5738,
|
|
"step": 1822
|
|
},
|
|
{
|
|
"epoch": 0.5203495697277433,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.0001426652961493086,
|
|
"loss": 2.5956,
|
|
"step": 1823
|
|
},
|
|
{
|
|
"epoch": 0.5206350055860689,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00014252957968752212,
|
|
"loss": 2.5553,
|
|
"step": 1824
|
|
},
|
|
{
|
|
"epoch": 0.5209204414443946,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.00014239386935610405,
|
|
"loss": 2.5876,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.5212058773027204,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00014225816526642086,
|
|
"loss": 2.592,
|
|
"step": 1826
|
|
},
|
|
{
|
|
"epoch": 0.521491313161046,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00014212246752983392,
|
|
"loss": 2.5715,
|
|
"step": 1827
|
|
},
|
|
{
|
|
"epoch": 0.5217767490193718,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 0.00014198677625769937,
|
|
"loss": 2.5873,
|
|
"step": 1828
|
|
},
|
|
{
|
|
"epoch": 0.5220621848776974,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.0001418510915613679,
|
|
"loss": 2.5964,
|
|
"step": 1829
|
|
},
|
|
{
|
|
"epoch": 0.5223476207360231,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.0001417154135521852,
|
|
"loss": 2.5588,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.5226330565943488,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00014157974234149103,
|
|
"loss": 2.5652,
|
|
"step": 1831
|
|
},
|
|
{
|
|
"epoch": 0.5229184924526745,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.00014144407804061982,
|
|
"loss": 2.6088,
|
|
"step": 1832
|
|
},
|
|
{
|
|
"epoch": 0.5232039283110002,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00014130842076090023,
|
|
"loss": 2.5847,
|
|
"step": 1833
|
|
},
|
|
{
|
|
"epoch": 0.5234893641693259,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.0001411727706136552,
|
|
"loss": 2.5664,
|
|
"step": 1834
|
|
},
|
|
{
|
|
"epoch": 0.5237748000276516,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00014103712771020187,
|
|
"loss": 2.5667,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.5240602358859773,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00014090149216185123,
|
|
"loss": 2.5789,
|
|
"step": 1836
|
|
},
|
|
{
|
|
"epoch": 0.524345671744303,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00014076586407990856,
|
|
"loss": 2.5775,
|
|
"step": 1837
|
|
},
|
|
{
|
|
"epoch": 0.5246311076026287,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00014063024357567275,
|
|
"loss": 2.5817,
|
|
"step": 1838
|
|
},
|
|
{
|
|
"epoch": 0.5249165434609544,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00014049463076043652,
|
|
"loss": 2.6099,
|
|
"step": 1839
|
|
},
|
|
{
|
|
"epoch": 0.52520197931928,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00014035902574548637,
|
|
"loss": 2.5589,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.5254874151776058,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00014022342864210234,
|
|
"loss": 2.5884,
|
|
"step": 1841
|
|
},
|
|
{
|
|
"epoch": 0.5257728510359315,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00014008783956155797,
|
|
"loss": 2.606,
|
|
"step": 1842
|
|
},
|
|
{
|
|
"epoch": 0.5260582868942572,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.0001399522586151202,
|
|
"loss": 2.5597,
|
|
"step": 1843
|
|
},
|
|
{
|
|
"epoch": 0.5263437227525829,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00013981668591404932,
|
|
"loss": 2.5987,
|
|
"step": 1844
|
|
},
|
|
{
|
|
"epoch": 0.5266291586109085,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00013968112156959893,
|
|
"loss": 2.5708,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.5269145944692343,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00013954556569301563,
|
|
"loss": 2.5932,
|
|
"step": 1846
|
|
},
|
|
{
|
|
"epoch": 0.5272000303275599,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.0001394100183955392,
|
|
"loss": 2.6022,
|
|
"step": 1847
|
|
},
|
|
{
|
|
"epoch": 0.5274854661858857,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00013927447978840225,
|
|
"loss": 2.5497,
|
|
"step": 1848
|
|
},
|
|
{
|
|
"epoch": 0.5277709020442113,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00013913894998283038,
|
|
"loss": 2.5742,
|
|
"step": 1849
|
|
},
|
|
{
|
|
"epoch": 0.528056337902537,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00013900342909004188,
|
|
"loss": 2.624,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.5283417737608628,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00013886791722124783,
|
|
"loss": 2.5814,
|
|
"step": 1851
|
|
},
|
|
{
|
|
"epoch": 0.5286272096191884,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00013873241448765167,
|
|
"loss": 2.5622,
|
|
"step": 1852
|
|
},
|
|
{
|
|
"epoch": 0.5289126454775142,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00013859692100044973,
|
|
"loss": 2.5673,
|
|
"step": 1853
|
|
},
|
|
{
|
|
"epoch": 0.5291980813358398,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 0.00013846143687083043,
|
|
"loss": 2.5758,
|
|
"step": 1854
|
|
},
|
|
{
|
|
"epoch": 0.5294835171941655,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 0.00013832596220997458,
|
|
"loss": 2.5934,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.5297689530524912,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.0001381904971290553,
|
|
"loss": 2.5529,
|
|
"step": 1856
|
|
},
|
|
{
|
|
"epoch": 0.5300543889108169,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00013805504173923776,
|
|
"loss": 2.5794,
|
|
"step": 1857
|
|
},
|
|
{
|
|
"epoch": 0.5303398247691427,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.0001379195961516793,
|
|
"loss": 2.5519,
|
|
"step": 1858
|
|
},
|
|
{
|
|
"epoch": 0.5306252606274683,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00013778416047752903,
|
|
"loss": 2.5965,
|
|
"step": 1859
|
|
},
|
|
{
|
|
"epoch": 0.530910696485794,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.0001376487348279281,
|
|
"loss": 2.5725,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.5311961323441197,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.0001375133193140093,
|
|
"loss": 2.5638,
|
|
"step": 1861
|
|
},
|
|
{
|
|
"epoch": 0.5314815682024454,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00013737791404689728,
|
|
"loss": 2.5935,
|
|
"step": 1862
|
|
},
|
|
{
|
|
"epoch": 0.531767004060771,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00013724251913770807,
|
|
"loss": 2.6033,
|
|
"step": 1863
|
|
},
|
|
{
|
|
"epoch": 0.5320524399190968,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00013710713469754934,
|
|
"loss": 2.5982,
|
|
"step": 1864
|
|
},
|
|
{
|
|
"epoch": 0.5323378757774224,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00013697176083752008,
|
|
"loss": 2.5374,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.5326233116357482,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 0.0001368363976687107,
|
|
"loss": 2.5623,
|
|
"step": 1866
|
|
},
|
|
{
|
|
"epoch": 0.5329087474940739,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00013670104530220275,
|
|
"loss": 2.574,
|
|
"step": 1867
|
|
},
|
|
{
|
|
"epoch": 0.5331941833523995,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001365657038490689,
|
|
"loss": 2.5917,
|
|
"step": 1868
|
|
},
|
|
{
|
|
"epoch": 0.5334796192107253,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.000136430373420373,
|
|
"loss": 2.5844,
|
|
"step": 1869
|
|
},
|
|
{
|
|
"epoch": 0.5337650550690509,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.00013629505412716974,
|
|
"loss": 2.6019,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.5340504909273767,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.0001361597460805047,
|
|
"loss": 2.5718,
|
|
"step": 1871
|
|
},
|
|
{
|
|
"epoch": 0.5343359267857023,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.0001360244493914142,
|
|
"loss": 2.5665,
|
|
"step": 1872
|
|
},
|
|
{
|
|
"epoch": 0.534621362644028,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001358891641709252,
|
|
"loss": 2.5814,
|
|
"step": 1873
|
|
},
|
|
{
|
|
"epoch": 0.5349067985023538,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00013575389053005547,
|
|
"loss": 2.5467,
|
|
"step": 1874
|
|
},
|
|
{
|
|
"epoch": 0.5351922343606794,
|
|
"grad_norm": 0.66015625,
|
|
"learning_rate": 0.00013561862857981304,
|
|
"loss": 2.5697,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.5354776702190052,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00013548337843119634,
|
|
"loss": 2.5856,
|
|
"step": 1876
|
|
},
|
|
{
|
|
"epoch": 0.5357631060773308,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00013534814019519438,
|
|
"loss": 2.5662,
|
|
"step": 1877
|
|
},
|
|
{
|
|
"epoch": 0.5360485419356565,
|
|
"grad_norm": 0.5625,
|
|
"learning_rate": 0.00013521291398278608,
|
|
"loss": 2.5983,
|
|
"step": 1878
|
|
},
|
|
{
|
|
"epoch": 0.5363339777939822,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00013507769990494072,
|
|
"loss": 2.5893,
|
|
"step": 1879
|
|
},
|
|
{
|
|
"epoch": 0.5366194136523079,
|
|
"grad_norm": 0.671875,
|
|
"learning_rate": 0.00013494249807261748,
|
|
"loss": 2.5852,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.5369048495106336,
|
|
"grad_norm": 0.55078125,
|
|
"learning_rate": 0.00013480730859676557,
|
|
"loss": 2.5667,
|
|
"step": 1881
|
|
},
|
|
{
|
|
"epoch": 0.5371902853689593,
|
|
"grad_norm": 0.82421875,
|
|
"learning_rate": 0.00013467213158832402,
|
|
"loss": 2.5674,
|
|
"step": 1882
|
|
},
|
|
{
|
|
"epoch": 0.537475721227285,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.00013453696715822163,
|
|
"loss": 2.5955,
|
|
"step": 1883
|
|
},
|
|
{
|
|
"epoch": 0.5377611570856107,
|
|
"grad_norm": 0.67578125,
|
|
"learning_rate": 0.0001344018154173769,
|
|
"loss": 2.5681,
|
|
"step": 1884
|
|
},
|
|
{
|
|
"epoch": 0.5380465929439364,
|
|
"grad_norm": 0.55859375,
|
|
"learning_rate": 0.00013426667647669795,
|
|
"loss": 2.6069,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.538332028802262,
|
|
"grad_norm": 0.609375,
|
|
"learning_rate": 0.00013413155044708232,
|
|
"loss": 2.5682,
|
|
"step": 1886
|
|
},
|
|
{
|
|
"epoch": 0.5386174646605878,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00013399643743941701,
|
|
"loss": 2.5783,
|
|
"step": 1887
|
|
},
|
|
{
|
|
"epoch": 0.5389029005189134,
|
|
"grad_norm": 0.59375,
|
|
"learning_rate": 0.0001338613375645783,
|
|
"loss": 2.5545,
|
|
"step": 1888
|
|
},
|
|
{
|
|
"epoch": 0.5391883363772392,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00013372625093343167,
|
|
"loss": 2.5683,
|
|
"step": 1889
|
|
},
|
|
{
|
|
"epoch": 0.5394737722355648,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00013359117765683183,
|
|
"loss": 2.5635,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.5397592080938906,
|
|
"grad_norm": 0.546875,
|
|
"learning_rate": 0.00013345611784562245,
|
|
"loss": 2.5851,
|
|
"step": 1891
|
|
},
|
|
{
|
|
"epoch": 0.5400446439522163,
|
|
"grad_norm": 0.578125,
|
|
"learning_rate": 0.0001333210716106361,
|
|
"loss": 2.5822,
|
|
"step": 1892
|
|
},
|
|
{
|
|
"epoch": 0.5403300798105419,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.00013318603906269436,
|
|
"loss": 2.587,
|
|
"step": 1893
|
|
},
|
|
{
|
|
"epoch": 0.5406155156688677,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 0.00013305102031260755,
|
|
"loss": 2.5887,
|
|
"step": 1894
|
|
},
|
|
{
|
|
"epoch": 0.5409009515271933,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 0.00013291601547117448,
|
|
"loss": 2.5895,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.541186387385519,
|
|
"grad_norm": 0.56640625,
|
|
"learning_rate": 0.00013278102464918276,
|
|
"loss": 2.5535,
|
|
"step": 1896
|
|
},
|
|
{
|
|
"epoch": 0.5414718232438447,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00013264604795740838,
|
|
"loss": 2.5836,
|
|
"step": 1897
|
|
},
|
|
{
|
|
"epoch": 0.5417572591021704,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00013251108550661585,
|
|
"loss": 2.5933,
|
|
"step": 1898
|
|
},
|
|
{
|
|
"epoch": 0.5420426949604962,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001323761374075578,
|
|
"loss": 2.5745,
|
|
"step": 1899
|
|
},
|
|
{
|
|
"epoch": 0.5423281308188218,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.0001322412037709752,
|
|
"loss": 2.5632,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.5426135666771476,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.00013210628470759726,
|
|
"loss": 2.5525,
|
|
"step": 1901
|
|
},
|
|
{
|
|
"epoch": 0.5428990025354732,
|
|
"grad_norm": 0.5078125,
|
|
"learning_rate": 0.000131971380328141,
|
|
"loss": 2.6075,
|
|
"step": 1902
|
|
},
|
|
{
|
|
"epoch": 0.5431844383937989,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.0001318364907433116,
|
|
"loss": 2.5948,
|
|
"step": 1903
|
|
},
|
|
{
|
|
"epoch": 0.5434698742521246,
|
|
"grad_norm": 0.53125,
|
|
"learning_rate": 0.00013170161606380204,
|
|
"loss": 2.6039,
|
|
"step": 1904
|
|
},
|
|
{
|
|
"epoch": 0.5437553101104503,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00013156675640029289,
|
|
"loss": 2.5849,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.5440407459687759,
|
|
"grad_norm": 0.546875,
|
|
"learning_rate": 0.00013143191186345266,
|
|
"loss": 2.5805,
|
|
"step": 1906
|
|
},
|
|
{
|
|
"epoch": 0.5443261818271017,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00013129708256393724,
|
|
"loss": 2.5466,
|
|
"step": 1907
|
|
},
|
|
{
|
|
"epoch": 0.5446116176854274,
|
|
"grad_norm": 0.515625,
|
|
"learning_rate": 0.00013116226861239019,
|
|
"loss": 2.5889,
|
|
"step": 1908
|
|
},
|
|
{
|
|
"epoch": 0.5448970535437531,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.00013102747011944238,
|
|
"loss": 2.5744,
|
|
"step": 1909
|
|
},
|
|
{
|
|
"epoch": 0.5451824894020788,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.000130892687195712,
|
|
"loss": 2.5408,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.5454679252604044,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00013075791995180447,
|
|
"loss": 2.5915,
|
|
"step": 1911
|
|
},
|
|
{
|
|
"epoch": 0.5457533611187302,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00013062316849831232,
|
|
"loss": 2.5739,
|
|
"step": 1912
|
|
},
|
|
{
|
|
"epoch": 0.5460387969770558,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00013048843294581516,
|
|
"loss": 2.5662,
|
|
"step": 1913
|
|
},
|
|
{
|
|
"epoch": 0.5463242328353816,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00013035371340487954,
|
|
"loss": 2.5486,
|
|
"step": 1914
|
|
},
|
|
{
|
|
"epoch": 0.5466096686937073,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.00013021900998605885,
|
|
"loss": 2.5508,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.5468951045520329,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001300843227998933,
|
|
"loss": 2.5886,
|
|
"step": 1916
|
|
},
|
|
{
|
|
"epoch": 0.5471805404103587,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00012994965195690976,
|
|
"loss": 2.5568,
|
|
"step": 1917
|
|
},
|
|
{
|
|
"epoch": 0.5474659762686843,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 0.0001298149975676216,
|
|
"loss": 2.5776,
|
|
"step": 1918
|
|
},
|
|
{
|
|
"epoch": 0.5477514121270101,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.0001296803597425288,
|
|
"loss": 2.5829,
|
|
"step": 1919
|
|
},
|
|
{
|
|
"epoch": 0.5480368479853357,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00012954573859211773,
|
|
"loss": 2.5828,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.5483222838436614,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 0.00012941113422686108,
|
|
"loss": 2.5825,
|
|
"step": 1921
|
|
},
|
|
{
|
|
"epoch": 0.5486077197019871,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.0001292765467572177,
|
|
"loss": 2.5706,
|
|
"step": 1922
|
|
},
|
|
{
|
|
"epoch": 0.5488931555603128,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.00012914197629363257,
|
|
"loss": 2.546,
|
|
"step": 1923
|
|
},
|
|
{
|
|
"epoch": 0.5491785914186386,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00012900742294653684,
|
|
"loss": 2.6005,
|
|
"step": 1924
|
|
},
|
|
{
|
|
"epoch": 0.5494640272769642,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.0001288728868263475,
|
|
"loss": 2.5664,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.5497494631352899,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00012873836804346746,
|
|
"loss": 2.5662,
|
|
"step": 1926
|
|
},
|
|
{
|
|
"epoch": 0.5500348989936156,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00012860386670828538,
|
|
"loss": 2.5691,
|
|
"step": 1927
|
|
},
|
|
{
|
|
"epoch": 0.5503203348519413,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.0001284693829311756,
|
|
"loss": 2.556,
|
|
"step": 1928
|
|
},
|
|
{
|
|
"epoch": 0.550605770710267,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00012833491682249802,
|
|
"loss": 2.5723,
|
|
"step": 1929
|
|
},
|
|
{
|
|
"epoch": 0.5508912065685927,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.0001282004684925981,
|
|
"loss": 2.5932,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.5511766424269184,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00012806603805180666,
|
|
"loss": 2.5586,
|
|
"step": 1931
|
|
},
|
|
{
|
|
"epoch": 0.5514620782852441,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00012793162561043994,
|
|
"loss": 2.6137,
|
|
"step": 1932
|
|
},
|
|
{
|
|
"epoch": 0.5517475141435698,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.0001277972312787993,
|
|
"loss": 2.5864,
|
|
"step": 1933
|
|
},
|
|
{
|
|
"epoch": 0.5520329500018955,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.0001276628551671713,
|
|
"loss": 2.5684,
|
|
"step": 1934
|
|
},
|
|
{
|
|
"epoch": 0.5523183858602212,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00012752849738582745,
|
|
"loss": 2.5812,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.5526038217185468,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.0001273941580450243,
|
|
"loss": 2.5645,
|
|
"step": 1936
|
|
},
|
|
{
|
|
"epoch": 0.5528892575768726,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00012725983725500332,
|
|
"loss": 2.5597,
|
|
"step": 1937
|
|
},
|
|
{
|
|
"epoch": 0.5531746934351982,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.0001271255351259907,
|
|
"loss": 2.5787,
|
|
"step": 1938
|
|
},
|
|
{
|
|
"epoch": 0.553460129293524,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00012699125176819716,
|
|
"loss": 2.5669,
|
|
"step": 1939
|
|
},
|
|
{
|
|
"epoch": 0.5537455651518497,
|
|
"grad_norm": 0.78125,
|
|
"learning_rate": 0.00012685698729181837,
|
|
"loss": 2.5653,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.5540310010101753,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 0.0001267227418070342,
|
|
"loss": 2.5713,
|
|
"step": 1941
|
|
},
|
|
{
|
|
"epoch": 0.5543164368685011,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.00012658851542400907,
|
|
"loss": 2.5643,
|
|
"step": 1942
|
|
},
|
|
{
|
|
"epoch": 0.5546018727268267,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00012645430825289163,
|
|
"loss": 2.5536,
|
|
"step": 1943
|
|
},
|
|
{
|
|
"epoch": 0.5548873085851525,
|
|
"grad_norm": 0.53515625,
|
|
"learning_rate": 0.00012632012040381493,
|
|
"loss": 2.5869,
|
|
"step": 1944
|
|
},
|
|
{
|
|
"epoch": 0.5551727444434781,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00012618595198689596,
|
|
"loss": 2.5626,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.5554581803018038,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.0001260518031122359,
|
|
"loss": 2.5907,
|
|
"step": 1946
|
|
},
|
|
{
|
|
"epoch": 0.5557436161601295,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00012591767388991985,
|
|
"loss": 2.5852,
|
|
"step": 1947
|
|
},
|
|
{
|
|
"epoch": 0.5560290520184552,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00012578356443001683,
|
|
"loss": 2.557,
|
|
"step": 1948
|
|
},
|
|
{
|
|
"epoch": 0.556314487876781,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.0001256494748425796,
|
|
"loss": 2.581,
|
|
"step": 1949
|
|
},
|
|
{
|
|
"epoch": 0.5565999237351066,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00012551540523764458,
|
|
"loss": 2.5861,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.5568853595934323,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00012538135572523183,
|
|
"loss": 2.5701,
|
|
"step": 1951
|
|
},
|
|
{
|
|
"epoch": 0.557170795451758,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00012524732641534496,
|
|
"loss": 2.5348,
|
|
"step": 1952
|
|
},
|
|
{
|
|
"epoch": 0.5574562313100837,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00012511331741797092,
|
|
"loss": 2.5597,
|
|
"step": 1953
|
|
},
|
|
{
|
|
"epoch": 0.5577416671684093,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.00012497932884308002,
|
|
"loss": 2.5808,
|
|
"step": 1954
|
|
},
|
|
{
|
|
"epoch": 0.5580271030267351,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00012484536080062581,
|
|
"loss": 2.5469,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.5583125388850608,
|
|
"grad_norm": 0.54296875,
|
|
"learning_rate": 0.00012471141340054508,
|
|
"loss": 2.5758,
|
|
"step": 1956
|
|
},
|
|
{
|
|
"epoch": 0.5585979747433865,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00012457748675275763,
|
|
"loss": 2.5819,
|
|
"step": 1957
|
|
},
|
|
{
|
|
"epoch": 0.5588834106017122,
|
|
"grad_norm": 0.494140625,
|
|
"learning_rate": 0.00012444358096716607,
|
|
"loss": 2.5616,
|
|
"step": 1958
|
|
},
|
|
{
|
|
"epoch": 0.5591688464600378,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.0001243096961536561,
|
|
"loss": 2.5502,
|
|
"step": 1959
|
|
},
|
|
{
|
|
"epoch": 0.5594542823183636,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 0.00012417583242209612,
|
|
"loss": 2.5667,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.5597397181766892,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00012404198988233729,
|
|
"loss": 2.5661,
|
|
"step": 1961
|
|
},
|
|
{
|
|
"epoch": 0.560025154035015,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00012390816864421325,
|
|
"loss": 2.5755,
|
|
"step": 1962
|
|
},
|
|
{
|
|
"epoch": 0.5603105898933406,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00012377436881754025,
|
|
"loss": 2.5679,
|
|
"step": 1963
|
|
},
|
|
{
|
|
"epoch": 0.5605960257516663,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00012364059051211707,
|
|
"loss": 2.5471,
|
|
"step": 1964
|
|
},
|
|
{
|
|
"epoch": 0.5608814616099921,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00012350683383772462,
|
|
"loss": 2.5443,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 0.5611668974683177,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00012337309890412618,
|
|
"loss": 2.5963,
|
|
"step": 1966
|
|
},
|
|
{
|
|
"epoch": 0.5614523333266435,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 0.00012323938582106724,
|
|
"loss": 2.5735,
|
|
"step": 1967
|
|
},
|
|
{
|
|
"epoch": 0.5617377691849691,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.00012310569469827518,
|
|
"loss": 2.5885,
|
|
"step": 1968
|
|
},
|
|
{
|
|
"epoch": 0.5620232050432948,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00012297202564545953,
|
|
"loss": 2.5558,
|
|
"step": 1969
|
|
},
|
|
{
|
|
"epoch": 0.5623086409016205,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.0001228383787723116,
|
|
"loss": 2.5914,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.5625940767599462,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.0001227047541885046,
|
|
"loss": 2.5518,
|
|
"step": 1971
|
|
},
|
|
{
|
|
"epoch": 0.562879512618272,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00012257115200369338,
|
|
"loss": 2.541,
|
|
"step": 1972
|
|
},
|
|
{
|
|
"epoch": 0.5631649484765976,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.0001224375723275144,
|
|
"loss": 2.5672,
|
|
"step": 1973
|
|
},
|
|
{
|
|
"epoch": 0.5634503843349233,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00012230401526958578,
|
|
"loss": 2.579,
|
|
"step": 1974
|
|
},
|
|
{
|
|
"epoch": 0.563735820193249,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.0001221704809395068,
|
|
"loss": 2.5442,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 0.5640212560515747,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00012203696944685838,
|
|
"loss": 2.582,
|
|
"step": 1976
|
|
},
|
|
{
|
|
"epoch": 0.5643066919099003,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 0.00012190348090120253,
|
|
"loss": 2.5607,
|
|
"step": 1977
|
|
},
|
|
{
|
|
"epoch": 0.5645921277682261,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 0.00012177001541208247,
|
|
"loss": 2.5668,
|
|
"step": 1978
|
|
},
|
|
{
|
|
"epoch": 0.5648775636265517,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00012163657308902254,
|
|
"loss": 2.5663,
|
|
"step": 1979
|
|
},
|
|
{
|
|
"epoch": 0.5651629994848775,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 0.00012150315404152809,
|
|
"loss": 2.575,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.5654484353432032,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00012136975837908521,
|
|
"loss": 2.5806,
|
|
"step": 1981
|
|
},
|
|
{
|
|
"epoch": 0.5657338712015288,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00012123638621116096,
|
|
"loss": 2.5632,
|
|
"step": 1982
|
|
},
|
|
{
|
|
"epoch": 0.5660193070598546,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00012110303764720305,
|
|
"loss": 2.5993,
|
|
"step": 1983
|
|
},
|
|
{
|
|
"epoch": 0.5663047429181802,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00012096971279663991,
|
|
"loss": 2.5778,
|
|
"step": 1984
|
|
},
|
|
{
|
|
"epoch": 0.566590178776506,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00012083641176888034,
|
|
"loss": 2.5656,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 0.5668756146348316,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.00012070313467331368,
|
|
"loss": 2.5657,
|
|
"step": 1986
|
|
},
|
|
{
|
|
"epoch": 0.5671610504931573,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00012056988161930973,
|
|
"loss": 2.5606,
|
|
"step": 1987
|
|
},
|
|
{
|
|
"epoch": 0.5674464863514831,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00012043665271621843,
|
|
"loss": 2.5621,
|
|
"step": 1988
|
|
},
|
|
{
|
|
"epoch": 0.5677319222098087,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00012030344807336993,
|
|
"loss": 2.5528,
|
|
"step": 1989
|
|
},
|
|
{
|
|
"epoch": 0.5680173580681345,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 0.00012017026780007452,
|
|
"loss": 2.5568,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.5683027939264601,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.00012003711200562242,
|
|
"loss": 2.5495,
|
|
"step": 1991
|
|
},
|
|
{
|
|
"epoch": 0.5685882297847858,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00011990398079928378,
|
|
"loss": 2.5533,
|
|
"step": 1992
|
|
},
|
|
{
|
|
"epoch": 0.5688736656431115,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00011977087429030862,
|
|
"loss": 2.55,
|
|
"step": 1993
|
|
},
|
|
{
|
|
"epoch": 0.5691591015014372,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00011963779258792664,
|
|
"loss": 2.5533,
|
|
"step": 1994
|
|
},
|
|
{
|
|
"epoch": 0.5694445373597629,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00011950473580134723,
|
|
"loss": 2.567,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 0.5697299732180886,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.00011937170403975933,
|
|
"loss": 2.5419,
|
|
"step": 1996
|
|
},
|
|
{
|
|
"epoch": 0.5700154090764143,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00011923869741233131,
|
|
"loss": 2.56,
|
|
"step": 1997
|
|
},
|
|
{
|
|
"epoch": 0.57030084493474,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00011910571602821089,
|
|
"loss": 2.571,
|
|
"step": 1998
|
|
},
|
|
{
|
|
"epoch": 0.5705862807930657,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 0.00011897275999652513,
|
|
"loss": 2.5794,
|
|
"step": 1999
|
|
},
|
|
{
|
|
"epoch": 0.5708717166513914,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00011883982942638028,
|
|
"loss": 2.5708,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.5708717166513914,
|
|
"eval_loss": 2.470252513885498,
|
|
"eval_runtime": 5925.0122,
|
|
"eval_samples_per_second": 10.85,
|
|
"eval_steps_per_second": 10.85,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.5711571525097171,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.00011870692442686172,
|
|
"loss": 2.5898,
|
|
"step": 2001
|
|
},
|
|
{
|
|
"epoch": 0.5714425883680427,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00011857404510703366,
|
|
"loss": 2.5845,
|
|
"step": 2002
|
|
},
|
|
{
|
|
"epoch": 0.5717280242263685,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 0.0001184411915759396,
|
|
"loss": 2.5365,
|
|
"step": 2003
|
|
},
|
|
{
|
|
"epoch": 0.5720134600846942,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00011830836394260153,
|
|
"loss": 2.562,
|
|
"step": 2004
|
|
},
|
|
{
|
|
"epoch": 0.5722988959430199,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00011817556231602037,
|
|
"loss": 2.5718,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 0.5725843318013456,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 0.00011804278680517561,
|
|
"loss": 2.5428,
|
|
"step": 2006
|
|
},
|
|
{
|
|
"epoch": 0.5728697676596712,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00011791003751902542,
|
|
"loss": 2.5839,
|
|
"step": 2007
|
|
},
|
|
{
|
|
"epoch": 0.573155203517997,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00011777731456650629,
|
|
"loss": 2.5791,
|
|
"step": 2008
|
|
},
|
|
{
|
|
"epoch": 0.5734406393763226,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.00011764461805653324,
|
|
"loss": 2.5559,
|
|
"step": 2009
|
|
},
|
|
{
|
|
"epoch": 0.5737260752346484,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.00011751194809799949,
|
|
"loss": 2.5588,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.574011511092974,
|
|
"grad_norm": 0.47265625,
|
|
"learning_rate": 0.00011737930479977658,
|
|
"loss": 2.597,
|
|
"step": 2011
|
|
},
|
|
{
|
|
"epoch": 0.5742969469512997,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00011724668827071413,
|
|
"loss": 2.5619,
|
|
"step": 2012
|
|
},
|
|
{
|
|
"epoch": 0.5745823828096255,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00011711409861963971,
|
|
"loss": 2.5595,
|
|
"step": 2013
|
|
},
|
|
{
|
|
"epoch": 0.5748678186679511,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 0.00011698153595535897,
|
|
"loss": 2.5641,
|
|
"step": 2014
|
|
},
|
|
{
|
|
"epoch": 0.5751532545262769,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.0001168490003866553,
|
|
"loss": 2.5707,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 0.5754386903846025,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.00011671649202228988,
|
|
"loss": 2.5486,
|
|
"step": 2016
|
|
},
|
|
{
|
|
"epoch": 0.5757241262429282,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00011658401097100161,
|
|
"loss": 2.5753,
|
|
"step": 2017
|
|
},
|
|
{
|
|
"epoch": 0.5760095621012539,
|
|
"grad_norm": 0.50390625,
|
|
"learning_rate": 0.0001164515573415069,
|
|
"loss": 2.5995,
|
|
"step": 2018
|
|
},
|
|
{
|
|
"epoch": 0.5762949979595796,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00011631913124249981,
|
|
"loss": 2.587,
|
|
"step": 2019
|
|
},
|
|
{
|
|
"epoch": 0.5765804338179052,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00011618673278265168,
|
|
"loss": 2.5885,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.576865869676231,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.00011605436207061112,
|
|
"loss": 2.5741,
|
|
"step": 2021
|
|
},
|
|
{
|
|
"epoch": 0.5771513055345567,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00011592201921500408,
|
|
"loss": 2.5782,
|
|
"step": 2022
|
|
},
|
|
{
|
|
"epoch": 0.5774367413928824,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00011578970432443364,
|
|
"loss": 2.5819,
|
|
"step": 2023
|
|
},
|
|
{
|
|
"epoch": 0.5777221772512081,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00011565741750747992,
|
|
"loss": 2.5745,
|
|
"step": 2024
|
|
},
|
|
{
|
|
"epoch": 0.5780076131095337,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00011552515887269992,
|
|
"loss": 2.5694,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 0.5782930489678595,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 0.00011539292852862757,
|
|
"loss": 2.5542,
|
|
"step": 2026
|
|
},
|
|
{
|
|
"epoch": 0.5785784848261851,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 0.0001152607265837737,
|
|
"loss": 2.5776,
|
|
"step": 2027
|
|
},
|
|
{
|
|
"epoch": 0.5788639206845109,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 0.00011512855314662566,
|
|
"loss": 2.555,
|
|
"step": 2028
|
|
},
|
|
{
|
|
"epoch": 0.5791493565428366,
|
|
"grad_norm": 0.71484375,
|
|
"learning_rate": 0.00011499640832564749,
|
|
"loss": 2.5699,
|
|
"step": 2029
|
|
},
|
|
{
|
|
"epoch": 0.5794347924011622,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00011486429222927976,
|
|
"loss": 2.5698,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.579720228259488,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00011473220496593937,
|
|
"loss": 2.546,
|
|
"step": 2031
|
|
},
|
|
{
|
|
"epoch": 0.5800056641178136,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.0001146001466440197,
|
|
"loss": 2.563,
|
|
"step": 2032
|
|
},
|
|
{
|
|
"epoch": 0.5802910999761394,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00011446811737189029,
|
|
"loss": 2.5682,
|
|
"step": 2033
|
|
},
|
|
{
|
|
"epoch": 0.580576535834465,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.0001143361172578968,
|
|
"loss": 2.5643,
|
|
"step": 2034
|
|
},
|
|
{
|
|
"epoch": 0.5808619716927907,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 0.00011420414641036111,
|
|
"loss": 2.5385,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 0.5811474075511164,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00011407220493758099,
|
|
"loss": 2.5788,
|
|
"step": 2036
|
|
},
|
|
{
|
|
"epoch": 0.5814328434094421,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 0.00011394029294783011,
|
|
"loss": 2.5717,
|
|
"step": 2037
|
|
},
|
|
{
|
|
"epoch": 0.5817182792677679,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 0.00011380841054935789,
|
|
"loss": 2.595,
|
|
"step": 2038
|
|
},
|
|
{
|
|
"epoch": 0.5820037151260935,
|
|
"grad_norm": 0.484375,
|
|
"learning_rate": 0.00011367655785038957,
|
|
"loss": 2.5678,
|
|
"step": 2039
|
|
},
|
|
{
|
|
"epoch": 0.5822891509844192,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00011354473495912596,
|
|
"loss": 2.5785,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.5825745868427449,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00011341294198374341,
|
|
"loss": 2.5803,
|
|
"step": 2041
|
|
},
|
|
{
|
|
"epoch": 0.5828600227010706,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00011328117903239376,
|
|
"loss": 2.5802,
|
|
"step": 2042
|
|
},
|
|
{
|
|
"epoch": 0.5831454585593963,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00011314944621320421,
|
|
"loss": 2.5512,
|
|
"step": 2043
|
|
},
|
|
{
|
|
"epoch": 0.583430894417722,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00011301774363427714,
|
|
"loss": 2.5891,
|
|
"step": 2044
|
|
},
|
|
{
|
|
"epoch": 0.5837163302760477,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00011288607140369021,
|
|
"loss": 2.5855,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 0.5840017661343734,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00011275442962949613,
|
|
"loss": 2.5551,
|
|
"step": 2046
|
|
},
|
|
{
|
|
"epoch": 0.5842872019926991,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00011262281841972272,
|
|
"loss": 2.5605,
|
|
"step": 2047
|
|
},
|
|
{
|
|
"epoch": 0.5845726378510248,
|
|
"grad_norm": 0.48046875,
|
|
"learning_rate": 0.0001124912378823725,
|
|
"loss": 2.5974,
|
|
"step": 2048
|
|
},
|
|
{
|
|
"epoch": 0.5848580737093505,
|
|
"grad_norm": 0.482421875,
|
|
"learning_rate": 0.00011235968812542298,
|
|
"loss": 2.5483,
|
|
"step": 2049
|
|
},
|
|
{
|
|
"epoch": 0.5851435095676761,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 0.00011222816925682647,
|
|
"loss": 2.5846,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.5854289454260019,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.00011209668138450979,
|
|
"loss": 2.572,
|
|
"step": 2051
|
|
},
|
|
{
|
|
"epoch": 0.5857143812843275,
|
|
"grad_norm": 0.466796875,
|
|
"learning_rate": 0.00011196522461637439,
|
|
"loss": 2.5609,
|
|
"step": 2052
|
|
},
|
|
{
|
|
"epoch": 0.5859998171426533,
|
|
"grad_norm": 0.52734375,
|
|
"learning_rate": 0.00011183379906029615,
|
|
"loss": 2.5499,
|
|
"step": 2053
|
|
},
|
|
{
|
|
"epoch": 0.586285253000979,
|
|
"grad_norm": 0.490234375,
|
|
"learning_rate": 0.00011170240482412542,
|
|
"loss": 2.5417,
|
|
"step": 2054
|
|
},
|
|
{
|
|
"epoch": 0.5865706888593046,
|
|
"grad_norm": 0.5390625,
|
|
"learning_rate": 0.00011157104201568677,
|
|
"loss": 2.5613,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 0.5868561247176304,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.000111439710742779,
|
|
"loss": 2.5377,
|
|
"step": 2056
|
|
},
|
|
{
|
|
"epoch": 0.587141560575956,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 0.00011130841111317501,
|
|
"loss": 2.5511,
|
|
"step": 2057
|
|
},
|
|
{
|
|
"epoch": 0.5874269964342818,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00011117714323462186,
|
|
"loss": 2.581,
|
|
"step": 2058
|
|
},
|
|
{
|
|
"epoch": 0.5877124322926074,
|
|
"grad_norm": 0.4921875,
|
|
"learning_rate": 0.0001110459072148404,
|
|
"loss": 2.556,
|
|
"step": 2059
|
|
},
|
|
{
|
|
"epoch": 0.5879978681509331,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00011091470316152543,
|
|
"loss": 2.5631,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.5882833040092589,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00011078353118234542,
|
|
"loss": 2.5587,
|
|
"step": 2061
|
|
},
|
|
{
|
|
"epoch": 0.5885687398675845,
|
|
"grad_norm": 0.486328125,
|
|
"learning_rate": 0.00011065239138494263,
|
|
"loss": 2.5622,
|
|
"step": 2062
|
|
},
|
|
{
|
|
"epoch": 0.5888541757259103,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 0.0001105212838769328,
|
|
"loss": 2.5687,
|
|
"step": 2063
|
|
},
|
|
{
|
|
"epoch": 0.5891396115842359,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 0.00011039020876590535,
|
|
"loss": 2.5541,
|
|
"step": 2064
|
|
},
|
|
{
|
|
"epoch": 0.5894250474425616,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00011025916615942281,
|
|
"loss": 2.5607,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 0.5897104833008873,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00011012815616502145,
|
|
"loss": 2.5617,
|
|
"step": 2066
|
|
},
|
|
{
|
|
"epoch": 0.589995919159213,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00010999717889021042,
|
|
"loss": 2.5915,
|
|
"step": 2067
|
|
},
|
|
{
|
|
"epoch": 0.5902813550175386,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 0.00010986623444247216,
|
|
"loss": 2.5686,
|
|
"step": 2068
|
|
},
|
|
{
|
|
"epoch": 0.5905667908758644,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001097353229292622,
|
|
"loss": 2.5715,
|
|
"step": 2069
|
|
},
|
|
{
|
|
"epoch": 0.5908522267341901,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 0.00010960444445800901,
|
|
"loss": 2.5551,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.5911376625925158,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.0001094735991361139,
|
|
"loss": 2.5485,
|
|
"step": 2071
|
|
},
|
|
{
|
|
"epoch": 0.5914230984508415,
|
|
"grad_norm": 0.453125,
|
|
"learning_rate": 0.00010934278707095103,
|
|
"loss": 2.5534,
|
|
"step": 2072
|
|
},
|
|
{
|
|
"epoch": 0.5917085343091671,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00010921200836986727,
|
|
"loss": 2.56,
|
|
"step": 2073
|
|
},
|
|
{
|
|
"epoch": 0.5919939701674929,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.00010908126314018212,
|
|
"loss": 2.5518,
|
|
"step": 2074
|
|
},
|
|
{
|
|
"epoch": 0.5922794060258185,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00010895055148918756,
|
|
"loss": 2.587,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 0.5925648418841443,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.00010881987352414806,
|
|
"loss": 2.5573,
|
|
"step": 2076
|
|
},
|
|
{
|
|
"epoch": 0.59285027774247,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.00010868922935230049,
|
|
"loss": 2.5569,
|
|
"step": 2077
|
|
},
|
|
{
|
|
"epoch": 0.5931357136007956,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00010855861908085383,
|
|
"loss": 2.5437,
|
|
"step": 2078
|
|
},
|
|
{
|
|
"epoch": 0.5934211494591214,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00010842804281698937,
|
|
"loss": 2.554,
|
|
"step": 2079
|
|
},
|
|
{
|
|
"epoch": 0.593706585317447,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00010829750066786052,
|
|
"loss": 2.5834,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.5939920211757728,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00010816699274059255,
|
|
"loss": 2.5947,
|
|
"step": 2081
|
|
},
|
|
{
|
|
"epoch": 0.5942774570340984,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 0.00010803651914228285,
|
|
"loss": 2.557,
|
|
"step": 2082
|
|
},
|
|
{
|
|
"epoch": 0.5945628928924241,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 0.00010790607998000048,
|
|
"loss": 2.5781,
|
|
"step": 2083
|
|
},
|
|
{
|
|
"epoch": 0.5948483287507498,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00010777567536078623,
|
|
"loss": 2.57,
|
|
"step": 2084
|
|
},
|
|
{
|
|
"epoch": 0.5951337646090755,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.0001076453053916527,
|
|
"loss": 2.5555,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 0.5954192004674013,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00010751497017958385,
|
|
"loss": 2.6032,
|
|
"step": 2086
|
|
},
|
|
{
|
|
"epoch": 0.5957046363257269,
|
|
"grad_norm": 0.5546875,
|
|
"learning_rate": 0.00010738466983153533,
|
|
"loss": 2.5711,
|
|
"step": 2087
|
|
},
|
|
{
|
|
"epoch": 0.5959900721840526,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.000107254404454434,
|
|
"loss": 2.5851,
|
|
"step": 2088
|
|
},
|
|
{
|
|
"epoch": 0.5962755080423783,
|
|
"grad_norm": 0.49609375,
|
|
"learning_rate": 0.00010712417415517808,
|
|
"loss": 2.5805,
|
|
"step": 2089
|
|
},
|
|
{
|
|
"epoch": 0.596560943900704,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00010699397904063708,
|
|
"loss": 2.5809,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.5968463797590297,
|
|
"grad_norm": 0.57421875,
|
|
"learning_rate": 0.00010686381921765158,
|
|
"loss": 2.5796,
|
|
"step": 2091
|
|
},
|
|
{
|
|
"epoch": 0.5971318156173554,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 0.00010673369479303315,
|
|
"loss": 2.5641,
|
|
"step": 2092
|
|
},
|
|
{
|
|
"epoch": 0.597417251475681,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00010660360587356438,
|
|
"loss": 2.5651,
|
|
"step": 2093
|
|
},
|
|
{
|
|
"epoch": 0.5977026873340068,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 0.00010647355256599877,
|
|
"loss": 2.5639,
|
|
"step": 2094
|
|
},
|
|
{
|
|
"epoch": 0.5979881231923325,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00010634353497706037,
|
|
"loss": 2.5482,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 0.5982735590506582,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 0.0001062135532134442,
|
|
"loss": 2.5762,
|
|
"step": 2096
|
|
},
|
|
{
|
|
"epoch": 0.5985589949089839,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.0001060836073818157,
|
|
"loss": 2.573,
|
|
"step": 2097
|
|
},
|
|
{
|
|
"epoch": 0.5988444307673095,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 0.00010595369758881091,
|
|
"loss": 2.5582,
|
|
"step": 2098
|
|
},
|
|
{
|
|
"epoch": 0.5991298666256353,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 0.00010582382394103628,
|
|
"loss": 2.6,
|
|
"step": 2099
|
|
},
|
|
{
|
|
"epoch": 0.5994153024839609,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 0.0001056939865450686,
|
|
"loss": 2.573,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.5997007383422867,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.00010556418550745482,
|
|
"loss": 2.5422,
|
|
"step": 2101
|
|
},
|
|
{
|
|
"epoch": 0.5999861742006124,
|
|
"grad_norm": 0.427734375,
|
|
"learning_rate": 0.00010543442093471218,
|
|
"loss": 2.5682,
|
|
"step": 2102
|
|
},
|
|
{
|
|
"epoch": 0.600271610058938,
|
|
"grad_norm": 0.451171875,
|
|
"learning_rate": 0.00010530469293332797,
|
|
"loss": 2.563,
|
|
"step": 2103
|
|
},
|
|
{
|
|
"epoch": 0.6005570459172638,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 0.00010517500160975935,
|
|
"loss": 2.5584,
|
|
"step": 2104
|
|
},
|
|
{
|
|
"epoch": 0.6008424817755894,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00010504534707043357,
|
|
"loss": 2.5646,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 0.6011279176339152,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 0.00010491572942174763,
|
|
"loss": 2.5812,
|
|
"step": 2106
|
|
},
|
|
{
|
|
"epoch": 0.6014133534922408,
|
|
"grad_norm": 0.46875,
|
|
"learning_rate": 0.00010478614877006813,
|
|
"loss": 2.5652,
|
|
"step": 2107
|
|
},
|
|
{
|
|
"epoch": 0.6016987893505665,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 0.00010465660522173144,
|
|
"loss": 2.5468,
|
|
"step": 2108
|
|
},
|
|
{
|
|
"epoch": 0.6019842252088922,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00010452709888304347,
|
|
"loss": 2.5424,
|
|
"step": 2109
|
|
},
|
|
{
|
|
"epoch": 0.6022696610672179,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 0.0001043976298602796,
|
|
"loss": 2.579,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.6025550969255437,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.00010426819825968449,
|
|
"loss": 2.5618,
|
|
"step": 2111
|
|
},
|
|
{
|
|
"epoch": 0.6028405327838693,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 0.00010413880418747215,
|
|
"loss": 2.5656,
|
|
"step": 2112
|
|
},
|
|
{
|
|
"epoch": 0.603125968642195,
|
|
"grad_norm": 0.4609375,
|
|
"learning_rate": 0.00010400944774982593,
|
|
"loss": 2.5724,
|
|
"step": 2113
|
|
},
|
|
{
|
|
"epoch": 0.6034114045005207,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 0.00010388012905289808,
|
|
"loss": 2.5452,
|
|
"step": 2114
|
|
},
|
|
{
|
|
"epoch": 0.6036968403588464,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 0.00010375084820280998,
|
|
"loss": 2.5538,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 0.603982276217172,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 0.00010362160530565197,
|
|
"loss": 2.5399,
|
|
"step": 2116
|
|
},
|
|
{
|
|
"epoch": 0.6042677120754978,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00010349240046748324,
|
|
"loss": 2.5613,
|
|
"step": 2117
|
|
},
|
|
{
|
|
"epoch": 0.6045531479338235,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00010336323379433165,
|
|
"loss": 2.5742,
|
|
"step": 2118
|
|
},
|
|
{
|
|
"epoch": 0.6048385837921492,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 0.00010323410539219388,
|
|
"loss": 2.5627,
|
|
"step": 2119
|
|
},
|
|
{
|
|
"epoch": 0.6051240196504749,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00010310501536703507,
|
|
"loss": 2.5675,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.6054094555088005,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00010297596382478906,
|
|
"loss": 2.5845,
|
|
"step": 2121
|
|
},
|
|
{
|
|
"epoch": 0.6056948913671263,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 0.00010284695087135791,
|
|
"loss": 2.5579,
|
|
"step": 2122
|
|
},
|
|
{
|
|
"epoch": 0.6059803272254519,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 0.00010271797661261215,
|
|
"loss": 2.5864,
|
|
"step": 2123
|
|
},
|
|
{
|
|
"epoch": 0.6062657630837777,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 0.0001025890411543904,
|
|
"loss": 2.5851,
|
|
"step": 2124
|
|
},
|
|
{
|
|
"epoch": 0.6065511989421033,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00010246014460249964,
|
|
"loss": 2.5753,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 0.606836634800429,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 0.00010233128706271475,
|
|
"loss": 2.5756,
|
|
"step": 2126
|
|
},
|
|
{
|
|
"epoch": 0.6071220706587548,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 0.00010220246864077875,
|
|
"loss": 2.5755,
|
|
"step": 2127
|
|
},
|
|
{
|
|
"epoch": 0.6074075065170804,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 0.00010207368944240234,
|
|
"loss": 2.5598,
|
|
"step": 2128
|
|
},
|
|
{
|
|
"epoch": 0.6076929423754062,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00010194494957326434,
|
|
"loss": 2.564,
|
|
"step": 2129
|
|
},
|
|
{
|
|
"epoch": 0.6079783782337318,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 0.00010181624913901099,
|
|
"loss": 2.5546,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.6082638140920575,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 0.0001016875882452564,
|
|
"loss": 2.5709,
|
|
"step": 2131
|
|
},
|
|
{
|
|
"epoch": 0.6085492499503832,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 0.00010155896699758206,
|
|
"loss": 2.5293,
|
|
"step": 2132
|
|
},
|
|
{
|
|
"epoch": 0.6088346858087089,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 0.00010143038550153703,
|
|
"loss": 2.5746,
|
|
"step": 2133
|
|
},
|
|
{
|
|
"epoch": 0.6091201216670347,
|
|
"grad_norm": 0.45703125,
|
|
"learning_rate": 0.0001013018438626378,
|
|
"loss": 2.5632,
|
|
"step": 2134
|
|
},
|
|
{
|
|
"epoch": 0.6094055575253603,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 0.00010117334218636793,
|
|
"loss": 2.5465,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 0.609690993383686,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 0.00010104488057817839,
|
|
"loss": 2.5461,
|
|
"step": 2136
|
|
},
|
|
{
|
|
"epoch": 0.6099764292420117,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 0.00010091645914348724,
|
|
"loss": 2.5891,
|
|
"step": 2137
|
|
},
|
|
{
|
|
"epoch": 0.6102618651003374,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 0.00010078807798767953,
|
|
"loss": 2.5954,
|
|
"step": 2138
|
|
},
|
|
{
|
|
"epoch": 0.610547300958663,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00010065973721610727,
|
|
"loss": 2.5611,
|
|
"step": 2139
|
|
},
|
|
{
|
|
"epoch": 0.6108327368169888,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 0.00010053143693408932,
|
|
"loss": 2.5958,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.6111181726753144,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 0.00010040317724691133,
|
|
"loss": 2.5734,
|
|
"step": 2141
|
|
},
|
|
{
|
|
"epoch": 0.6114036085336402,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 0.00010027495825982558,
|
|
"loss": 2.5665,
|
|
"step": 2142
|
|
},
|
|
{
|
|
"epoch": 0.6116890443919659,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 0.00010014678007805106,
|
|
"loss": 2.5597,
|
|
"step": 2143
|
|
},
|
|
{
|
|
"epoch": 0.6119744802502916,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 0.00010001864280677316,
|
|
"loss": 2.5883,
|
|
"step": 2144
|
|
},
|
|
{
|
|
"epoch": 0.6122599161086173,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.989054655114383e-05,
|
|
"loss": 2.5357,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 0.6125453519669429,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 9.976249141628124e-05,
|
|
"loss": 2.5692,
|
|
"step": 2146
|
|
},
|
|
{
|
|
"epoch": 0.6128307878252687,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 9.963447750726984e-05,
|
|
"loss": 2.5544,
|
|
"step": 2147
|
|
},
|
|
{
|
|
"epoch": 0.6131162236835943,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.95065049291603e-05,
|
|
"loss": 2.5472,
|
|
"step": 2148
|
|
},
|
|
{
|
|
"epoch": 0.61340165954192,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 9.937857378696932e-05,
|
|
"loss": 2.6036,
|
|
"step": 2149
|
|
},
|
|
{
|
|
"epoch": 0.6136870954002458,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.925068418567967e-05,
|
|
"loss": 2.5645,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.6139725312585714,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.912283623023988e-05,
|
|
"loss": 2.5646,
|
|
"step": 2151
|
|
},
|
|
{
|
|
"epoch": 0.6142579671168972,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 9.899503002556442e-05,
|
|
"loss": 2.5792,
|
|
"step": 2152
|
|
},
|
|
{
|
|
"epoch": 0.6145434029752228,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.886726567653362e-05,
|
|
"loss": 2.5629,
|
|
"step": 2153
|
|
},
|
|
{
|
|
"epoch": 0.6148288388335486,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 9.87395432879932e-05,
|
|
"loss": 2.5558,
|
|
"step": 2154
|
|
},
|
|
{
|
|
"epoch": 0.6151142746918742,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 9.861186296475458e-05,
|
|
"loss": 2.5663,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 0.6153997105501999,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.84842248115947e-05,
|
|
"loss": 2.5347,
|
|
"step": 2156
|
|
},
|
|
{
|
|
"epoch": 0.6156851464085256,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 9.835662893325584e-05,
|
|
"loss": 2.5608,
|
|
"step": 2157
|
|
},
|
|
{
|
|
"epoch": 0.6159705822668513,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 9.822907543444553e-05,
|
|
"loss": 2.5695,
|
|
"step": 2158
|
|
},
|
|
{
|
|
"epoch": 0.616256018125177,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 9.810156441983665e-05,
|
|
"loss": 2.5549,
|
|
"step": 2159
|
|
},
|
|
{
|
|
"epoch": 0.6165414539835027,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.797409599406709e-05,
|
|
"loss": 2.5916,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.6168268898418284,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 9.784667026173993e-05,
|
|
"loss": 2.546,
|
|
"step": 2161
|
|
},
|
|
{
|
|
"epoch": 0.6171123257001541,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 9.771928732742313e-05,
|
|
"loss": 2.5728,
|
|
"step": 2162
|
|
},
|
|
{
|
|
"epoch": 0.6173977615584798,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 9.759194729564954e-05,
|
|
"loss": 2.5711,
|
|
"step": 2163
|
|
},
|
|
{
|
|
"epoch": 0.6176831974168054,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 9.746465027091676e-05,
|
|
"loss": 2.5335,
|
|
"step": 2164
|
|
},
|
|
{
|
|
"epoch": 0.6179686332751312,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 9.733739635768714e-05,
|
|
"loss": 2.5583,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 0.6182540691334568,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 9.721018566038767e-05,
|
|
"loss": 2.537,
|
|
"step": 2166
|
|
},
|
|
{
|
|
"epoch": 0.6185395049917826,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 9.708301828340993e-05,
|
|
"loss": 2.5576,
|
|
"step": 2167
|
|
},
|
|
{
|
|
"epoch": 0.6188249408501083,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 9.695589433110968e-05,
|
|
"loss": 2.5786,
|
|
"step": 2168
|
|
},
|
|
{
|
|
"epoch": 0.6191103767084339,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 9.682881390780749e-05,
|
|
"loss": 2.584,
|
|
"step": 2169
|
|
},
|
|
{
|
|
"epoch": 0.6193958125667597,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 9.67017771177878e-05,
|
|
"loss": 2.5681,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.6196812484250853,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 9.657478406529946e-05,
|
|
"loss": 2.553,
|
|
"step": 2171
|
|
},
|
|
{
|
|
"epoch": 0.6199666842834111,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.644783485455537e-05,
|
|
"loss": 2.5665,
|
|
"step": 2172
|
|
},
|
|
{
|
|
"epoch": 0.6202521201417367,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.632092958973246e-05,
|
|
"loss": 2.5572,
|
|
"step": 2173
|
|
},
|
|
{
|
|
"epoch": 0.6205375560000624,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.61940683749716e-05,
|
|
"loss": 2.5576,
|
|
"step": 2174
|
|
},
|
|
{
|
|
"epoch": 0.6208229918583882,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 9.606725131437739e-05,
|
|
"loss": 2.5667,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 0.6211084277167138,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 9.594047851201855e-05,
|
|
"loss": 2.5688,
|
|
"step": 2176
|
|
},
|
|
{
|
|
"epoch": 0.6213938635750396,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 9.581375007192705e-05,
|
|
"loss": 2.5627,
|
|
"step": 2177
|
|
},
|
|
{
|
|
"epoch": 0.6216792994333652,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 9.568706609809872e-05,
|
|
"loss": 2.5918,
|
|
"step": 2178
|
|
},
|
|
{
|
|
"epoch": 0.6219647352916909,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.556042669449281e-05,
|
|
"loss": 2.5662,
|
|
"step": 2179
|
|
},
|
|
{
|
|
"epoch": 0.6222501711500166,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.543383196503206e-05,
|
|
"loss": 2.5345,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.6225356070083423,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.530728201360244e-05,
|
|
"loss": 2.5612,
|
|
"step": 2181
|
|
},
|
|
{
|
|
"epoch": 0.622821042866668,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.518077694405322e-05,
|
|
"loss": 2.5691,
|
|
"step": 2182
|
|
},
|
|
{
|
|
"epoch": 0.6231064787249937,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.505431686019692e-05,
|
|
"loss": 2.5599,
|
|
"step": 2183
|
|
},
|
|
{
|
|
"epoch": 0.6233919145833194,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.492790186580906e-05,
|
|
"loss": 2.5384,
|
|
"step": 2184
|
|
},
|
|
{
|
|
"epoch": 0.6236773504416451,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 9.480153206462817e-05,
|
|
"loss": 2.5833,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 0.6239627862999708,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 9.467520756035575e-05,
|
|
"loss": 2.5582,
|
|
"step": 2186
|
|
},
|
|
{
|
|
"epoch": 0.6242482221582965,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.454892845665603e-05,
|
|
"loss": 2.5327,
|
|
"step": 2187
|
|
},
|
|
{
|
|
"epoch": 0.6245336580166222,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.442269485715602e-05,
|
|
"loss": 2.5675,
|
|
"step": 2188
|
|
},
|
|
{
|
|
"epoch": 0.6248190938749478,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 9.429650686544546e-05,
|
|
"loss": 2.5706,
|
|
"step": 2189
|
|
},
|
|
{
|
|
"epoch": 0.6251045297332736,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.417036458507658e-05,
|
|
"loss": 2.5732,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.6253899655915993,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.404426811956404e-05,
|
|
"loss": 2.57,
|
|
"step": 2191
|
|
},
|
|
{
|
|
"epoch": 0.625675401449925,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.391821757238511e-05,
|
|
"loss": 2.5336,
|
|
"step": 2192
|
|
},
|
|
{
|
|
"epoch": 0.6259608373082507,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 9.379221304697925e-05,
|
|
"loss": 2.5533,
|
|
"step": 2193
|
|
},
|
|
{
|
|
"epoch": 0.6262462731665763,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.366625464674811e-05,
|
|
"loss": 2.5648,
|
|
"step": 2194
|
|
},
|
|
{
|
|
"epoch": 0.6265317090249021,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 9.354034247505556e-05,
|
|
"loss": 2.5672,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 0.6268171448832277,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.341447663522749e-05,
|
|
"loss": 2.5789,
|
|
"step": 2196
|
|
},
|
|
{
|
|
"epoch": 0.6271025807415535,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 9.328865723055185e-05,
|
|
"loss": 2.5557,
|
|
"step": 2197
|
|
},
|
|
{
|
|
"epoch": 0.6273880165998791,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 9.316288436427834e-05,
|
|
"loss": 2.5479,
|
|
"step": 2198
|
|
},
|
|
{
|
|
"epoch": 0.6276734524582048,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.30371581396186e-05,
|
|
"loss": 2.5853,
|
|
"step": 2199
|
|
},
|
|
{
|
|
"epoch": 0.6279588883165306,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 9.291147865974599e-05,
|
|
"loss": 2.588,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.6282443241748562,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 9.278584602779541e-05,
|
|
"loss": 2.5675,
|
|
"step": 2201
|
|
},
|
|
{
|
|
"epoch": 0.628529760033182,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.266026034686341e-05,
|
|
"loss": 2.59,
|
|
"step": 2202
|
|
},
|
|
{
|
|
"epoch": 0.6288151958915076,
|
|
"grad_norm": 0.44140625,
|
|
"learning_rate": 9.253472172000802e-05,
|
|
"loss": 2.5578,
|
|
"step": 2203
|
|
},
|
|
{
|
|
"epoch": 0.6291006317498333,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 9.240923025024853e-05,
|
|
"loss": 2.5348,
|
|
"step": 2204
|
|
},
|
|
{
|
|
"epoch": 0.629386067608159,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 9.228378604056568e-05,
|
|
"loss": 2.5759,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 0.6296715034664847,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 9.215838919390132e-05,
|
|
"loss": 2.5559,
|
|
"step": 2206
|
|
},
|
|
{
|
|
"epoch": 0.6299569393248104,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.203303981315847e-05,
|
|
"loss": 2.5611,
|
|
"step": 2207
|
|
},
|
|
{
|
|
"epoch": 0.6302423751831361,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.190773800120126e-05,
|
|
"loss": 2.5746,
|
|
"step": 2208
|
|
},
|
|
{
|
|
"epoch": 0.6305278110414618,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 9.178248386085474e-05,
|
|
"loss": 2.5519,
|
|
"step": 2209
|
|
},
|
|
{
|
|
"epoch": 0.6308132468997875,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 9.165727749490477e-05,
|
|
"loss": 2.5576,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.6310986827581132,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 9.15321190060981e-05,
|
|
"loss": 2.5854,
|
|
"step": 2211
|
|
},
|
|
{
|
|
"epoch": 0.6313841186164388,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 9.140700849714216e-05,
|
|
"loss": 2.5661,
|
|
"step": 2212
|
|
},
|
|
{
|
|
"epoch": 0.6316695544747646,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 9.128194607070498e-05,
|
|
"loss": 2.5572,
|
|
"step": 2213
|
|
},
|
|
{
|
|
"epoch": 0.6319549903330902,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 9.115693182941518e-05,
|
|
"loss": 2.5889,
|
|
"step": 2214
|
|
},
|
|
{
|
|
"epoch": 0.632240426191416,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 9.103196587586172e-05,
|
|
"loss": 2.5474,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 0.6325258620497417,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 9.090704831259422e-05,
|
|
"loss": 2.5664,
|
|
"step": 2216
|
|
},
|
|
{
|
|
"epoch": 0.6328112979080673,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 9.078217924212224e-05,
|
|
"loss": 2.5648,
|
|
"step": 2217
|
|
},
|
|
{
|
|
"epoch": 0.6330967337663931,
|
|
"grad_norm": 0.412109375,
|
|
"learning_rate": 9.065735876691578e-05,
|
|
"loss": 2.5675,
|
|
"step": 2218
|
|
},
|
|
{
|
|
"epoch": 0.6333821696247187,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 9.053258698940484e-05,
|
|
"loss": 2.5783,
|
|
"step": 2219
|
|
},
|
|
{
|
|
"epoch": 0.6336676054830445,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 9.040786401197957e-05,
|
|
"loss": 2.561,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.6339530413413701,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 9.028318993698993e-05,
|
|
"loss": 2.5814,
|
|
"step": 2221
|
|
},
|
|
{
|
|
"epoch": 0.6342384771996958,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 9.015856486674587e-05,
|
|
"loss": 2.6124,
|
|
"step": 2222
|
|
},
|
|
{
|
|
"epoch": 0.6345239130580216,
|
|
"grad_norm": 0.458984375,
|
|
"learning_rate": 9.003398890351704e-05,
|
|
"loss": 2.5395,
|
|
"step": 2223
|
|
},
|
|
{
|
|
"epoch": 0.6348093489163472,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 8.99094621495329e-05,
|
|
"loss": 2.5417,
|
|
"step": 2224
|
|
},
|
|
{
|
|
"epoch": 0.635094784774673,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 8.978498470698244e-05,
|
|
"loss": 2.5751,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 0.6353802206329986,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 8.966055667801422e-05,
|
|
"loss": 2.5614,
|
|
"step": 2226
|
|
},
|
|
{
|
|
"epoch": 0.6356656564913243,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 8.95361781647362e-05,
|
|
"loss": 2.5633,
|
|
"step": 2227
|
|
},
|
|
{
|
|
"epoch": 0.63595109234965,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 8.941184926921576e-05,
|
|
"loss": 2.5668,
|
|
"step": 2228
|
|
},
|
|
{
|
|
"epoch": 0.6362365282079757,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 8.928757009347956e-05,
|
|
"loss": 2.5793,
|
|
"step": 2229
|
|
},
|
|
{
|
|
"epoch": 0.6365219640663013,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 8.916334073951345e-05,
|
|
"loss": 2.5548,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.6368073999246271,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 8.90391613092623e-05,
|
|
"loss": 2.5783,
|
|
"step": 2231
|
|
},
|
|
{
|
|
"epoch": 0.6370928357829528,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 8.891503190463024e-05,
|
|
"loss": 2.5809,
|
|
"step": 2232
|
|
},
|
|
{
|
|
"epoch": 0.6373782716412785,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.879095262748018e-05,
|
|
"loss": 2.5614,
|
|
"step": 2233
|
|
},
|
|
{
|
|
"epoch": 0.6376637074996042,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 8.866692357963387e-05,
|
|
"loss": 2.5739,
|
|
"step": 2234
|
|
},
|
|
{
|
|
"epoch": 0.6379491433579298,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 8.854294486287188e-05,
|
|
"loss": 2.5764,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 0.6382345792162556,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 8.84190165789336e-05,
|
|
"loss": 2.5702,
|
|
"step": 2236
|
|
},
|
|
{
|
|
"epoch": 0.6385200150745812,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 8.829513882951686e-05,
|
|
"loss": 2.5682,
|
|
"step": 2237
|
|
},
|
|
{
|
|
"epoch": 0.638805450932907,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 8.8171311716278e-05,
|
|
"loss": 2.5557,
|
|
"step": 2238
|
|
},
|
|
{
|
|
"epoch": 0.6390908867912326,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 8.804753534083208e-05,
|
|
"loss": 2.5917,
|
|
"step": 2239
|
|
},
|
|
{
|
|
"epoch": 0.6393763226495583,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.79238098047522e-05,
|
|
"loss": 2.5776,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.6396617585078841,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 8.780013520956996e-05,
|
|
"loss": 2.5412,
|
|
"step": 2241
|
|
},
|
|
{
|
|
"epoch": 0.6399471943662097,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 8.767651165677502e-05,
|
|
"loss": 2.572,
|
|
"step": 2242
|
|
},
|
|
{
|
|
"epoch": 0.6402326302245355,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 8.755293924781523e-05,
|
|
"loss": 2.5363,
|
|
"step": 2243
|
|
},
|
|
{
|
|
"epoch": 0.6405180660828611,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.742941808409647e-05,
|
|
"loss": 2.5623,
|
|
"step": 2244
|
|
},
|
|
{
|
|
"epoch": 0.6408035019411868,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 8.730594826698253e-05,
|
|
"loss": 2.551,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 0.6410889377995125,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 8.718252989779496e-05,
|
|
"loss": 2.5181,
|
|
"step": 2246
|
|
},
|
|
{
|
|
"epoch": 0.6413743736578382,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 8.705916307781344e-05,
|
|
"loss": 2.5543,
|
|
"step": 2247
|
|
},
|
|
{
|
|
"epoch": 0.641659809516164,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 8.6935847908275e-05,
|
|
"loss": 2.5636,
|
|
"step": 2248
|
|
},
|
|
{
|
|
"epoch": 0.6419452453744896,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 8.681258449037438e-05,
|
|
"loss": 2.5439,
|
|
"step": 2249
|
|
},
|
|
{
|
|
"epoch": 0.6422306812328153,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 8.668937292526394e-05,
|
|
"loss": 2.5287,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.6422306812328153,
|
|
"eval_loss": 2.4652860164642334,
|
|
"eval_runtime": 6001.1587,
|
|
"eval_samples_per_second": 10.712,
|
|
"eval_steps_per_second": 10.712,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.642516117091141,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 8.656621331405339e-05,
|
|
"loss": 2.5401,
|
|
"step": 2251
|
|
},
|
|
{
|
|
"epoch": 0.6428015529494667,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 8.644310575780979e-05,
|
|
"loss": 2.5709,
|
|
"step": 2252
|
|
},
|
|
{
|
|
"epoch": 0.6430869888077924,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 8.632005035755766e-05,
|
|
"loss": 2.6213,
|
|
"step": 2253
|
|
},
|
|
{
|
|
"epoch": 0.6433724246661181,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 8.619704721427843e-05,
|
|
"loss": 2.5512,
|
|
"step": 2254
|
|
},
|
|
{
|
|
"epoch": 0.6436578605244437,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 8.607409642891091e-05,
|
|
"loss": 2.563,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 0.6439432963827695,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 8.595119810235088e-05,
|
|
"loss": 2.5438,
|
|
"step": 2256
|
|
},
|
|
{
|
|
"epoch": 0.6442287322410952,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 8.582835233545093e-05,
|
|
"loss": 2.5563,
|
|
"step": 2257
|
|
},
|
|
{
|
|
"epoch": 0.6445141680994209,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 8.570555922902074e-05,
|
|
"loss": 2.5278,
|
|
"step": 2258
|
|
},
|
|
{
|
|
"epoch": 0.6447996039577466,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 8.558281888382659e-05,
|
|
"loss": 2.5753,
|
|
"step": 2259
|
|
},
|
|
{
|
|
"epoch": 0.6450850398160722,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 8.546013140059148e-05,
|
|
"loss": 2.5751,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.645370475674398,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 8.53374968799952e-05,
|
|
"loss": 2.5553,
|
|
"step": 2261
|
|
},
|
|
{
|
|
"epoch": 0.6456559115327236,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 8.521491542267386e-05,
|
|
"loss": 2.5534,
|
|
"step": 2262
|
|
},
|
|
{
|
|
"epoch": 0.6459413473910494,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 8.509238712922014e-05,
|
|
"loss": 2.5781,
|
|
"step": 2263
|
|
},
|
|
{
|
|
"epoch": 0.6462267832493751,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 8.496991210018319e-05,
|
|
"loss": 2.5595,
|
|
"step": 2264
|
|
},
|
|
{
|
|
"epoch": 0.6465122191077007,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.484749043606824e-05,
|
|
"loss": 2.5502,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 0.6467976549660265,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 8.472512223733679e-05,
|
|
"loss": 2.5458,
|
|
"step": 2266
|
|
},
|
|
{
|
|
"epoch": 0.6470830908243521,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.460280760440664e-05,
|
|
"loss": 2.5653,
|
|
"step": 2267
|
|
},
|
|
{
|
|
"epoch": 0.6473685266826779,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 8.448054663765135e-05,
|
|
"loss": 2.5727,
|
|
"step": 2268
|
|
},
|
|
{
|
|
"epoch": 0.6476539625410035,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.435833943740064e-05,
|
|
"loss": 2.5665,
|
|
"step": 2269
|
|
},
|
|
{
|
|
"epoch": 0.6479393983993292,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 8.423618610394004e-05,
|
|
"loss": 2.5411,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.6482248342576549,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.411408673751096e-05,
|
|
"loss": 2.5636,
|
|
"step": 2271
|
|
},
|
|
{
|
|
"epoch": 0.6485102701159806,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 8.399204143831036e-05,
|
|
"loss": 2.5729,
|
|
"step": 2272
|
|
},
|
|
{
|
|
"epoch": 0.6487957059743064,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 8.387005030649102e-05,
|
|
"loss": 2.5837,
|
|
"step": 2273
|
|
},
|
|
{
|
|
"epoch": 0.649081141832632,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.374811344216105e-05,
|
|
"loss": 2.5646,
|
|
"step": 2274
|
|
},
|
|
{
|
|
"epoch": 0.6493665776909577,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 8.362623094538428e-05,
|
|
"loss": 2.5886,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 0.6496520135492834,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 8.350440291617974e-05,
|
|
"loss": 2.5494,
|
|
"step": 2276
|
|
},
|
|
{
|
|
"epoch": 0.6499374494076091,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 8.338262945452176e-05,
|
|
"loss": 2.5577,
|
|
"step": 2277
|
|
},
|
|
{
|
|
"epoch": 0.6502228852659347,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 8.326091066033998e-05,
|
|
"loss": 2.5796,
|
|
"step": 2278
|
|
},
|
|
{
|
|
"epoch": 0.6505083211242605,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 8.313924663351926e-05,
|
|
"loss": 2.574,
|
|
"step": 2279
|
|
},
|
|
{
|
|
"epoch": 0.6507937569825862,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 8.301763747389925e-05,
|
|
"loss": 2.5544,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.6510791928409119,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 8.289608328127483e-05,
|
|
"loss": 2.5358,
|
|
"step": 2281
|
|
},
|
|
{
|
|
"epoch": 0.6513646286992376,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 8.277458415539569e-05,
|
|
"loss": 2.5567,
|
|
"step": 2282
|
|
},
|
|
{
|
|
"epoch": 0.6516500645575632,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.265314019596617e-05,
|
|
"loss": 2.5566,
|
|
"step": 2283
|
|
},
|
|
{
|
|
"epoch": 0.651935500415889,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 8.253175150264565e-05,
|
|
"loss": 2.5591,
|
|
"step": 2284
|
|
},
|
|
{
|
|
"epoch": 0.6522209362742146,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.241041817504791e-05,
|
|
"loss": 2.5519,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 0.6525063721325404,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 8.228914031274128e-05,
|
|
"loss": 2.5378,
|
|
"step": 2286
|
|
},
|
|
{
|
|
"epoch": 0.652791807990866,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 8.21679180152489e-05,
|
|
"loss": 2.5576,
|
|
"step": 2287
|
|
},
|
|
{
|
|
"epoch": 0.6530772438491917,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 8.204675138204794e-05,
|
|
"loss": 2.5636,
|
|
"step": 2288
|
|
},
|
|
{
|
|
"epoch": 0.6533626797075175,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 8.192564051257001e-05,
|
|
"loss": 2.5682,
|
|
"step": 2289
|
|
},
|
|
{
|
|
"epoch": 0.6536481155658431,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 8.180458550620109e-05,
|
|
"loss": 2.5616,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.6539335514241689,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 8.168358646228115e-05,
|
|
"loss": 2.5503,
|
|
"step": 2291
|
|
},
|
|
{
|
|
"epoch": 0.6542189872824945,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 8.156264348010425e-05,
|
|
"loss": 2.548,
|
|
"step": 2292
|
|
},
|
|
{
|
|
"epoch": 0.6545044231408202,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 8.144175665891858e-05,
|
|
"loss": 2.5327,
|
|
"step": 2293
|
|
},
|
|
{
|
|
"epoch": 0.6547898589991459,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 8.132092609792608e-05,
|
|
"loss": 2.5491,
|
|
"step": 2294
|
|
},
|
|
{
|
|
"epoch": 0.6550752948574716,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 8.120015189628259e-05,
|
|
"loss": 2.5576,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 0.6553607307157974,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 8.107943415309786e-05,
|
|
"loss": 2.5687,
|
|
"step": 2296
|
|
},
|
|
{
|
|
"epoch": 0.655646166574123,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 8.095877296743497e-05,
|
|
"loss": 2.5506,
|
|
"step": 2297
|
|
},
|
|
{
|
|
"epoch": 0.6559316024324487,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 8.083816843831091e-05,
|
|
"loss": 2.5609,
|
|
"step": 2298
|
|
},
|
|
{
|
|
"epoch": 0.6562170382907744,
|
|
"grad_norm": 0.35546875,
|
|
"learning_rate": 8.071762066469598e-05,
|
|
"loss": 2.5515,
|
|
"step": 2299
|
|
},
|
|
{
|
|
"epoch": 0.6565024741491001,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 8.059712974551392e-05,
|
|
"loss": 2.5587,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.6567879100074258,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 8.047669577964197e-05,
|
|
"loss": 2.5523,
|
|
"step": 2301
|
|
},
|
|
{
|
|
"epoch": 0.6570733458657515,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 8.03563188659104e-05,
|
|
"loss": 2.5321,
|
|
"step": 2302
|
|
},
|
|
{
|
|
"epoch": 0.6573587817240771,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 8.023599910310287e-05,
|
|
"loss": 2.5848,
|
|
"step": 2303
|
|
},
|
|
{
|
|
"epoch": 0.6576442175824029,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 8.011573658995606e-05,
|
|
"loss": 2.539,
|
|
"step": 2304
|
|
},
|
|
{
|
|
"epoch": 0.6579296534407286,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.999553142515969e-05,
|
|
"loss": 2.5545,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 0.6582150892990543,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.987538370735624e-05,
|
|
"loss": 2.5481,
|
|
"step": 2306
|
|
},
|
|
{
|
|
"epoch": 0.65850052515738,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.975529353514141e-05,
|
|
"loss": 2.5889,
|
|
"step": 2307
|
|
},
|
|
{
|
|
"epoch": 0.6587859610157056,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 7.963526100706337e-05,
|
|
"loss": 2.5113,
|
|
"step": 2308
|
|
},
|
|
{
|
|
"epoch": 0.6590713968740314,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 7.951528622162297e-05,
|
|
"loss": 2.5789,
|
|
"step": 2309
|
|
},
|
|
{
|
|
"epoch": 0.659356832732357,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 7.9395369277274e-05,
|
|
"loss": 2.546,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.6596422685906828,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.927551027242252e-05,
|
|
"loss": 2.5322,
|
|
"step": 2311
|
|
},
|
|
{
|
|
"epoch": 0.6599277044490084,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.9155709305427e-05,
|
|
"loss": 2.5277,
|
|
"step": 2312
|
|
},
|
|
{
|
|
"epoch": 0.6602131403073341,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.90359664745985e-05,
|
|
"loss": 2.5684,
|
|
"step": 2313
|
|
},
|
|
{
|
|
"epoch": 0.6604985761656599,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 7.891628187820021e-05,
|
|
"loss": 2.5712,
|
|
"step": 2314
|
|
},
|
|
{
|
|
"epoch": 0.6607840120239855,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.87966556144475e-05,
|
|
"loss": 2.5458,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 0.6610694478823113,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 7.867708778150812e-05,
|
|
"loss": 2.572,
|
|
"step": 2316
|
|
},
|
|
{
|
|
"epoch": 0.6613548837406369,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 7.855757847750151e-05,
|
|
"loss": 2.553,
|
|
"step": 2317
|
|
},
|
|
{
|
|
"epoch": 0.6616403195989626,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 7.843812780049935e-05,
|
|
"loss": 2.5738,
|
|
"step": 2318
|
|
},
|
|
{
|
|
"epoch": 0.6619257554572883,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 7.831873584852522e-05,
|
|
"loss": 2.5652,
|
|
"step": 2319
|
|
},
|
|
{
|
|
"epoch": 0.662211191315614,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 7.819940271955425e-05,
|
|
"loss": 2.5447,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.6624966271739398,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 7.808012851151362e-05,
|
|
"loss": 2.5698,
|
|
"step": 2321
|
|
},
|
|
{
|
|
"epoch": 0.6627820630322654,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 7.796091332228193e-05,
|
|
"loss": 2.54,
|
|
"step": 2322
|
|
},
|
|
{
|
|
"epoch": 0.6630674988905911,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 7.784175724968939e-05,
|
|
"loss": 2.5497,
|
|
"step": 2323
|
|
},
|
|
{
|
|
"epoch": 0.6633529347489168,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 7.772266039151781e-05,
|
|
"loss": 2.5507,
|
|
"step": 2324
|
|
},
|
|
{
|
|
"epoch": 0.6636383706072425,
|
|
"grad_norm": 3.140625,
|
|
"learning_rate": 7.760362284550024e-05,
|
|
"loss": 2.5712,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 0.6639238064655681,
|
|
"grad_norm": 0.67578125,
|
|
"learning_rate": 7.748464470932117e-05,
|
|
"loss": 2.5554,
|
|
"step": 2326
|
|
},
|
|
{
|
|
"epoch": 0.6642092423238939,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 7.73657260806164e-05,
|
|
"loss": 2.5577,
|
|
"step": 2327
|
|
},
|
|
{
|
|
"epoch": 0.6644946781822195,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 7.724686705697274e-05,
|
|
"loss": 2.5744,
|
|
"step": 2328
|
|
},
|
|
{
|
|
"epoch": 0.6647801140405453,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 7.712806773592811e-05,
|
|
"loss": 2.547,
|
|
"step": 2329
|
|
},
|
|
{
|
|
"epoch": 0.665065549898871,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 7.700932821497157e-05,
|
|
"loss": 2.558,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.6653509857571966,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 7.689064859154299e-05,
|
|
"loss": 2.5383,
|
|
"step": 2331
|
|
},
|
|
{
|
|
"epoch": 0.6656364216155224,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.677202896303307e-05,
|
|
"loss": 2.6,
|
|
"step": 2332
|
|
},
|
|
{
|
|
"epoch": 0.665921857473848,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 7.665346942678335e-05,
|
|
"loss": 2.5926,
|
|
"step": 2333
|
|
},
|
|
{
|
|
"epoch": 0.6662072933321738,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.653497008008611e-05,
|
|
"loss": 2.5573,
|
|
"step": 2334
|
|
},
|
|
{
|
|
"epoch": 0.6664927291904994,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 7.641653102018402e-05,
|
|
"loss": 2.5838,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 0.6667781650488251,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 7.629815234427057e-05,
|
|
"loss": 2.5812,
|
|
"step": 2336
|
|
},
|
|
{
|
|
"epoch": 0.6670636009071509,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 7.617983414948937e-05,
|
|
"loss": 2.5533,
|
|
"step": 2337
|
|
},
|
|
{
|
|
"epoch": 0.6673490367654765,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 7.606157653293476e-05,
|
|
"loss": 2.5459,
|
|
"step": 2338
|
|
},
|
|
{
|
|
"epoch": 0.6676344726238023,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 7.594337959165107e-05,
|
|
"loss": 2.5619,
|
|
"step": 2339
|
|
},
|
|
{
|
|
"epoch": 0.6679199084821279,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 7.582524342263292e-05,
|
|
"loss": 2.5708,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.6682053443404536,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 7.570716812282512e-05,
|
|
"loss": 2.5465,
|
|
"step": 2341
|
|
},
|
|
{
|
|
"epoch": 0.6684907801987793,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 7.558915378912257e-05,
|
|
"loss": 2.5456,
|
|
"step": 2342
|
|
},
|
|
{
|
|
"epoch": 0.668776216057105,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 7.547120051836996e-05,
|
|
"loss": 2.5814,
|
|
"step": 2343
|
|
},
|
|
{
|
|
"epoch": 0.6690616519154307,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 7.535330840736209e-05,
|
|
"loss": 2.5684,
|
|
"step": 2344
|
|
},
|
|
{
|
|
"epoch": 0.6693470877737564,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 7.523547755284337e-05,
|
|
"loss": 2.5622,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 0.6696325236320821,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 7.511770805150802e-05,
|
|
"loss": 2.5668,
|
|
"step": 2346
|
|
},
|
|
{
|
|
"epoch": 0.6699179594904078,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 7.500000000000002e-05,
|
|
"loss": 2.5299,
|
|
"step": 2347
|
|
},
|
|
{
|
|
"epoch": 0.6702033953487335,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.488235349491278e-05,
|
|
"loss": 2.546,
|
|
"step": 2348
|
|
},
|
|
{
|
|
"epoch": 0.6704888312070592,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 7.47647686327891e-05,
|
|
"loss": 2.5488,
|
|
"step": 2349
|
|
},
|
|
{
|
|
"epoch": 0.6707742670653849,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 7.464724551012161e-05,
|
|
"loss": 2.5425,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.6710597029237105,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 7.45297842233519e-05,
|
|
"loss": 2.5346,
|
|
"step": 2351
|
|
},
|
|
{
|
|
"epoch": 0.6713451387820363,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.441238486887083e-05,
|
|
"loss": 2.5254,
|
|
"step": 2352
|
|
},
|
|
{
|
|
"epoch": 0.671630574640362,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 7.42950475430187e-05,
|
|
"loss": 2.5561,
|
|
"step": 2353
|
|
},
|
|
{
|
|
"epoch": 0.6719160104986877,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 7.417777234208463e-05,
|
|
"loss": 2.5601,
|
|
"step": 2354
|
|
},
|
|
{
|
|
"epoch": 0.6722014463570134,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.406055936230687e-05,
|
|
"loss": 2.5617,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 0.672486882215339,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 7.394340869987267e-05,
|
|
"loss": 2.5633,
|
|
"step": 2356
|
|
},
|
|
{
|
|
"epoch": 0.6727723180736648,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 7.382632045091803e-05,
|
|
"loss": 2.5703,
|
|
"step": 2357
|
|
},
|
|
{
|
|
"epoch": 0.6730577539319904,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 7.37092947115278e-05,
|
|
"loss": 2.5611,
|
|
"step": 2358
|
|
},
|
|
{
|
|
"epoch": 0.6733431897903162,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 7.359233157773557e-05,
|
|
"loss": 2.5762,
|
|
"step": 2359
|
|
},
|
|
{
|
|
"epoch": 0.6736286256486418,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.347543114552343e-05,
|
|
"loss": 2.5665,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.6739140615069675,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 7.335859351082217e-05,
|
|
"loss": 2.548,
|
|
"step": 2361
|
|
},
|
|
{
|
|
"epoch": 0.6741994973652933,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 7.324181876951092e-05,
|
|
"loss": 2.5389,
|
|
"step": 2362
|
|
},
|
|
{
|
|
"epoch": 0.6744849332236189,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 7.312510701741717e-05,
|
|
"loss": 2.5481,
|
|
"step": 2363
|
|
},
|
|
{
|
|
"epoch": 0.6747703690819447,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.300845835031693e-05,
|
|
"loss": 2.5571,
|
|
"step": 2364
|
|
},
|
|
{
|
|
"epoch": 0.6750558049402703,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 7.28918728639342e-05,
|
|
"loss": 2.5809,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 0.675341240798596,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 7.277535065394127e-05,
|
|
"loss": 2.5644,
|
|
"step": 2366
|
|
},
|
|
{
|
|
"epoch": 0.6756266766569217,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 7.265889181595853e-05,
|
|
"loss": 2.5799,
|
|
"step": 2367
|
|
},
|
|
{
|
|
"epoch": 0.6759121125152474,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.254249644555429e-05,
|
|
"loss": 2.5631,
|
|
"step": 2368
|
|
},
|
|
{
|
|
"epoch": 0.6761975483735732,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 7.242616463824469e-05,
|
|
"loss": 2.5673,
|
|
"step": 2369
|
|
},
|
|
{
|
|
"epoch": 0.6764829842318988,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 7.230989648949396e-05,
|
|
"loss": 2.5697,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.6767684200902245,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 7.219369209471387e-05,
|
|
"loss": 2.569,
|
|
"step": 2371
|
|
},
|
|
{
|
|
"epoch": 0.6770538559485502,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 7.207755154926386e-05,
|
|
"loss": 2.5493,
|
|
"step": 2372
|
|
},
|
|
{
|
|
"epoch": 0.6773392918068759,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 7.196147494845127e-05,
|
|
"loss": 2.5515,
|
|
"step": 2373
|
|
},
|
|
{
|
|
"epoch": 0.6776247276652015,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 7.184546238753064e-05,
|
|
"loss": 2.5449,
|
|
"step": 2374
|
|
},
|
|
{
|
|
"epoch": 0.6779101635235273,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 7.172951396170402e-05,
|
|
"loss": 2.5657,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 0.6781955993818529,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 7.1613629766121e-05,
|
|
"loss": 2.5615,
|
|
"step": 2376
|
|
},
|
|
{
|
|
"epoch": 0.6784810352401787,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 7.149780989587825e-05,
|
|
"loss": 2.5787,
|
|
"step": 2377
|
|
},
|
|
{
|
|
"epoch": 0.6787664710985044,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 7.138205444601985e-05,
|
|
"loss": 2.5632,
|
|
"step": 2378
|
|
},
|
|
{
|
|
"epoch": 0.67905190695683,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 7.126636351153684e-05,
|
|
"loss": 2.5594,
|
|
"step": 2379
|
|
},
|
|
{
|
|
"epoch": 0.6793373428151558,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.115073718736735e-05,
|
|
"loss": 2.55,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.6796227786734814,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 7.10351755683966e-05,
|
|
"loss": 2.5493,
|
|
"step": 2381
|
|
},
|
|
{
|
|
"epoch": 0.6799082145318072,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.09196787494567e-05,
|
|
"loss": 2.54,
|
|
"step": 2382
|
|
},
|
|
{
|
|
"epoch": 0.6801936503901328,
|
|
"grad_norm": 0.35546875,
|
|
"learning_rate": 7.08042468253264e-05,
|
|
"loss": 2.5681,
|
|
"step": 2383
|
|
},
|
|
{
|
|
"epoch": 0.6804790862484585,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 7.068887989073143e-05,
|
|
"loss": 2.5505,
|
|
"step": 2384
|
|
},
|
|
{
|
|
"epoch": 0.6807645221067842,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 7.057357804034404e-05,
|
|
"loss": 2.5489,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 0.6810499579651099,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.045834136878308e-05,
|
|
"loss": 2.5669,
|
|
"step": 2386
|
|
},
|
|
{
|
|
"epoch": 0.6813353938234357,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 7.0343169970614e-05,
|
|
"loss": 2.5354,
|
|
"step": 2387
|
|
},
|
|
{
|
|
"epoch": 0.6816208296817613,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 7.022806394034856e-05,
|
|
"loss": 2.5571,
|
|
"step": 2388
|
|
},
|
|
{
|
|
"epoch": 0.681906265540087,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 7.0113023372445e-05,
|
|
"loss": 2.5556,
|
|
"step": 2389
|
|
},
|
|
{
|
|
"epoch": 0.6821917013984127,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.999804836130784e-05,
|
|
"loss": 2.5822,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.6824771372567384,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.988313900128769e-05,
|
|
"loss": 2.5923,
|
|
"step": 2391
|
|
},
|
|
{
|
|
"epoch": 0.682762573115064,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 6.97682953866813e-05,
|
|
"loss": 2.5303,
|
|
"step": 2392
|
|
},
|
|
{
|
|
"epoch": 0.6830480089733898,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.965351761173165e-05,
|
|
"loss": 2.5794,
|
|
"step": 2393
|
|
},
|
|
{
|
|
"epoch": 0.6833334448317155,
|
|
"grad_norm": 0.35546875,
|
|
"learning_rate": 6.953880577062745e-05,
|
|
"loss": 2.582,
|
|
"step": 2394
|
|
},
|
|
{
|
|
"epoch": 0.6836188806900412,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.94241599575034e-05,
|
|
"loss": 2.5485,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 0.6839043165483669,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 6.930958026644005e-05,
|
|
"loss": 2.5524,
|
|
"step": 2396
|
|
},
|
|
{
|
|
"epoch": 0.6841897524066926,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.919506679146372e-05,
|
|
"loss": 2.5754,
|
|
"step": 2397
|
|
},
|
|
{
|
|
"epoch": 0.6844751882650183,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.908061962654626e-05,
|
|
"loss": 2.5647,
|
|
"step": 2398
|
|
},
|
|
{
|
|
"epoch": 0.6847606241233439,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.896623886560528e-05,
|
|
"loss": 2.567,
|
|
"step": 2399
|
|
},
|
|
{
|
|
"epoch": 0.6850460599816697,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.885192460250366e-05,
|
|
"loss": 2.5596,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.6853314958399953,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 6.873767693105e-05,
|
|
"loss": 2.5652,
|
|
"step": 2401
|
|
},
|
|
{
|
|
"epoch": 0.685616931698321,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.8623495944998e-05,
|
|
"loss": 2.5612,
|
|
"step": 2402
|
|
},
|
|
{
|
|
"epoch": 0.6859023675566468,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.850938173804672e-05,
|
|
"loss": 2.5595,
|
|
"step": 2403
|
|
},
|
|
{
|
|
"epoch": 0.6861878034149724,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 6.839533440384051e-05,
|
|
"loss": 2.5805,
|
|
"step": 2404
|
|
},
|
|
{
|
|
"epoch": 0.6864732392732982,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.82813540359688e-05,
|
|
"loss": 2.5742,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 0.6867586751316238,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.816744072796592e-05,
|
|
"loss": 2.5801,
|
|
"step": 2406
|
|
},
|
|
{
|
|
"epoch": 0.6870441109899496,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.805359457331144e-05,
|
|
"loss": 2.5545,
|
|
"step": 2407
|
|
},
|
|
{
|
|
"epoch": 0.6873295468482752,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.793981566542957e-05,
|
|
"loss": 2.553,
|
|
"step": 2408
|
|
},
|
|
{
|
|
"epoch": 0.6876149827066009,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.78261040976894e-05,
|
|
"loss": 2.5477,
|
|
"step": 2409
|
|
},
|
|
{
|
|
"epoch": 0.6879004185649267,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.771245996340491e-05,
|
|
"loss": 2.5584,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.6881858544232523,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 6.759888335583458e-05,
|
|
"loss": 2.5786,
|
|
"step": 2411
|
|
},
|
|
{
|
|
"epoch": 0.688471290281578,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 6.748537436818142e-05,
|
|
"loss": 2.5663,
|
|
"step": 2412
|
|
},
|
|
{
|
|
"epoch": 0.6887567261399037,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 6.737193309359324e-05,
|
|
"loss": 2.5402,
|
|
"step": 2413
|
|
},
|
|
{
|
|
"epoch": 0.6890421619982294,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.7258559625162e-05,
|
|
"loss": 2.5748,
|
|
"step": 2414
|
|
},
|
|
{
|
|
"epoch": 0.6893275978565551,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.714525405592412e-05,
|
|
"loss": 2.5759,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 0.6896130337148808,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 6.703201647886034e-05,
|
|
"loss": 2.5636,
|
|
"step": 2416
|
|
},
|
|
{
|
|
"epoch": 0.6898984695732064,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 6.691884698689548e-05,
|
|
"loss": 2.5573,
|
|
"step": 2417
|
|
},
|
|
{
|
|
"epoch": 0.6901839054315322,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.680574567289864e-05,
|
|
"loss": 2.5802,
|
|
"step": 2418
|
|
},
|
|
{
|
|
"epoch": 0.6904693412898579,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.66927126296829e-05,
|
|
"loss": 2.5497,
|
|
"step": 2419
|
|
},
|
|
{
|
|
"epoch": 0.6907547771481836,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.657974795000525e-05,
|
|
"loss": 2.5806,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.6910402130065093,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.646685172656667e-05,
|
|
"loss": 2.5485,
|
|
"step": 2421
|
|
},
|
|
{
|
|
"epoch": 0.6913256488648349,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.6354024052012e-05,
|
|
"loss": 2.5518,
|
|
"step": 2422
|
|
},
|
|
{
|
|
"epoch": 0.6916110847231607,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.62412650189297e-05,
|
|
"loss": 2.5628,
|
|
"step": 2423
|
|
},
|
|
{
|
|
"epoch": 0.6918965205814863,
|
|
"grad_norm": 0.349609375,
|
|
"learning_rate": 6.612857471985203e-05,
|
|
"loss": 2.5364,
|
|
"step": 2424
|
|
},
|
|
{
|
|
"epoch": 0.6921819564398121,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.601595324725474e-05,
|
|
"loss": 2.5879,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 0.6924673922981378,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.590340069355713e-05,
|
|
"loss": 2.5652,
|
|
"step": 2426
|
|
},
|
|
{
|
|
"epoch": 0.6927528281564634,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.579091715112201e-05,
|
|
"loss": 2.544,
|
|
"step": 2427
|
|
},
|
|
{
|
|
"epoch": 0.6930382640147892,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 6.567850271225543e-05,
|
|
"loss": 2.5717,
|
|
"step": 2428
|
|
},
|
|
{
|
|
"epoch": 0.6933236998731148,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.556615746920685e-05,
|
|
"loss": 2.5632,
|
|
"step": 2429
|
|
},
|
|
{
|
|
"epoch": 0.6936091357314406,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 6.545388151416896e-05,
|
|
"loss": 2.544,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.6938945715897662,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.534167493927748e-05,
|
|
"loss": 2.5697,
|
|
"step": 2431
|
|
},
|
|
{
|
|
"epoch": 0.6941800074480919,
|
|
"grad_norm": 0.35546875,
|
|
"learning_rate": 6.522953783661121e-05,
|
|
"loss": 2.5455,
|
|
"step": 2432
|
|
},
|
|
{
|
|
"epoch": 0.6944654433064176,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 6.511747029819207e-05,
|
|
"loss": 2.5844,
|
|
"step": 2433
|
|
},
|
|
{
|
|
"epoch": 0.6947508791647433,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.500547241598478e-05,
|
|
"loss": 2.5579,
|
|
"step": 2434
|
|
},
|
|
{
|
|
"epoch": 0.6950363150230691,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 6.489354428189683e-05,
|
|
"loss": 2.5542,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 0.6953217508813947,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 6.478168598777864e-05,
|
|
"loss": 2.5787,
|
|
"step": 2436
|
|
},
|
|
{
|
|
"epoch": 0.6956071867397204,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 6.466989762542332e-05,
|
|
"loss": 2.5676,
|
|
"step": 2437
|
|
},
|
|
{
|
|
"epoch": 0.6958926225980461,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 6.455817928656636e-05,
|
|
"loss": 2.5601,
|
|
"step": 2438
|
|
},
|
|
{
|
|
"epoch": 0.6961780584563718,
|
|
"grad_norm": 0.33984375,
|
|
"learning_rate": 6.444653106288612e-05,
|
|
"loss": 2.5721,
|
|
"step": 2439
|
|
},
|
|
{
|
|
"epoch": 0.6964634943146975,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 6.433495304600306e-05,
|
|
"loss": 2.5427,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.6967489301730232,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 6.422344532748039e-05,
|
|
"loss": 2.5505,
|
|
"step": 2441
|
|
},
|
|
{
|
|
"epoch": 0.6970343660313489,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 6.411200799882338e-05,
|
|
"loss": 2.5491,
|
|
"step": 2442
|
|
},
|
|
{
|
|
"epoch": 0.6973198018896746,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.400064115147955e-05,
|
|
"loss": 2.5645,
|
|
"step": 2443
|
|
},
|
|
{
|
|
"epoch": 0.6976052377480003,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 6.38893448768387e-05,
|
|
"loss": 2.5374,
|
|
"step": 2444
|
|
},
|
|
{
|
|
"epoch": 0.697890673606326,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 6.377811926623273e-05,
|
|
"loss": 2.5343,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 0.6981761094646517,
|
|
"grad_norm": 0.345703125,
|
|
"learning_rate": 6.366696441093536e-05,
|
|
"loss": 2.6022,
|
|
"step": 2446
|
|
},
|
|
{
|
|
"epoch": 0.6984615453229773,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.355588040216248e-05,
|
|
"loss": 2.5745,
|
|
"step": 2447
|
|
},
|
|
{
|
|
"epoch": 0.6987469811813031,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 6.344486733107168e-05,
|
|
"loss": 2.5623,
|
|
"step": 2448
|
|
},
|
|
{
|
|
"epoch": 0.6990324170396287,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.333392528876233e-05,
|
|
"loss": 2.567,
|
|
"step": 2449
|
|
},
|
|
{
|
|
"epoch": 0.6993178528979545,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 6.32230543662757e-05,
|
|
"loss": 2.5734,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.6996032887562802,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 6.311225465459442e-05,
|
|
"loss": 2.5358,
|
|
"step": 2451
|
|
},
|
|
{
|
|
"epoch": 0.6998887246146058,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.300152624464296e-05,
|
|
"loss": 2.5494,
|
|
"step": 2452
|
|
},
|
|
{
|
|
"epoch": 0.7001741604729316,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 6.289086922728712e-05,
|
|
"loss": 2.5602,
|
|
"step": 2453
|
|
},
|
|
{
|
|
"epoch": 0.7004595963312572,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 6.278028369333413e-05,
|
|
"loss": 2.5788,
|
|
"step": 2454
|
|
},
|
|
{
|
|
"epoch": 0.700745032189583,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 6.266976973353252e-05,
|
|
"loss": 2.5591,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 0.7010304680479086,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 6.255932743857226e-05,
|
|
"loss": 2.5517,
|
|
"step": 2456
|
|
},
|
|
{
|
|
"epoch": 0.7013159039062343,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.244895689908426e-05,
|
|
"loss": 2.5502,
|
|
"step": 2457
|
|
},
|
|
{
|
|
"epoch": 0.70160133976456,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.233865820564079e-05,
|
|
"loss": 2.5815,
|
|
"step": 2458
|
|
},
|
|
{
|
|
"epoch": 0.7018867756228857,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 6.222843144875492e-05,
|
|
"loss": 2.5633,
|
|
"step": 2459
|
|
},
|
|
{
|
|
"epoch": 0.7021722114812115,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.211827671888098e-05,
|
|
"loss": 2.5513,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.7024576473395371,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 6.200819410641385e-05,
|
|
"loss": 2.569,
|
|
"step": 2461
|
|
},
|
|
{
|
|
"epoch": 0.7027430831978628,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.189818370168956e-05,
|
|
"loss": 2.559,
|
|
"step": 2462
|
|
},
|
|
{
|
|
"epoch": 0.7030285190561885,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.17882455949846e-05,
|
|
"loss": 2.5625,
|
|
"step": 2463
|
|
},
|
|
{
|
|
"epoch": 0.7033139549145142,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 6.16783798765164e-05,
|
|
"loss": 2.552,
|
|
"step": 2464
|
|
},
|
|
{
|
|
"epoch": 0.7035993907728398,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 6.156858663644277e-05,
|
|
"loss": 2.5329,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 0.7038848266311656,
|
|
"grad_norm": 0.33984375,
|
|
"learning_rate": 6.145886596486208e-05,
|
|
"loss": 2.5371,
|
|
"step": 2466
|
|
},
|
|
{
|
|
"epoch": 0.7041702624894913,
|
|
"grad_norm": 0.337890625,
|
|
"learning_rate": 6.134921795181324e-05,
|
|
"loss": 2.561,
|
|
"step": 2467
|
|
},
|
|
{
|
|
"epoch": 0.704455698347817,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 6.123964268727554e-05,
|
|
"loss": 2.5607,
|
|
"step": 2468
|
|
},
|
|
{
|
|
"epoch": 0.7047411342061427,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 6.113014026116841e-05,
|
|
"loss": 2.5781,
|
|
"step": 2469
|
|
},
|
|
{
|
|
"epoch": 0.7050265700644683,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 6.102071076335173e-05,
|
|
"loss": 2.5742,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.7053120059227941,
|
|
"grad_norm": 0.341796875,
|
|
"learning_rate": 6.091135428362536e-05,
|
|
"loss": 2.5736,
|
|
"step": 2471
|
|
},
|
|
{
|
|
"epoch": 0.7055974417811197,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 6.0802070911729246e-05,
|
|
"loss": 2.5795,
|
|
"step": 2472
|
|
},
|
|
{
|
|
"epoch": 0.7058828776394455,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.06928607373435e-05,
|
|
"loss": 2.5563,
|
|
"step": 2473
|
|
},
|
|
{
|
|
"epoch": 0.7061683134977711,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.058372385008801e-05,
|
|
"loss": 2.5287,
|
|
"step": 2474
|
|
},
|
|
{
|
|
"epoch": 0.7064537493560968,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 6.047466033952245e-05,
|
|
"loss": 2.5752,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 0.7067391852144226,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 6.036567029514665e-05,
|
|
"loss": 2.5511,
|
|
"step": 2476
|
|
},
|
|
{
|
|
"epoch": 0.7070246210727482,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.025675380639976e-05,
|
|
"loss": 2.5685,
|
|
"step": 2477
|
|
},
|
|
{
|
|
"epoch": 0.707310056931074,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 6.0147910962660684e-05,
|
|
"loss": 2.577,
|
|
"step": 2478
|
|
},
|
|
{
|
|
"epoch": 0.7075954927893996,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 6.003914185324802e-05,
|
|
"loss": 2.5451,
|
|
"step": 2479
|
|
},
|
|
{
|
|
"epoch": 0.7078809286477253,
|
|
"grad_norm": 0.349609375,
|
|
"learning_rate": 5.993044656741965e-05,
|
|
"loss": 2.5405,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.708166364506051,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 5.982182519437311e-05,
|
|
"loss": 2.5569,
|
|
"step": 2481
|
|
},
|
|
{
|
|
"epoch": 0.7084518003643767,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 5.971327782324508e-05,
|
|
"loss": 2.5454,
|
|
"step": 2482
|
|
},
|
|
{
|
|
"epoch": 0.7087372362227025,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 5.960480454311155e-05,
|
|
"loss": 2.5725,
|
|
"step": 2483
|
|
},
|
|
{
|
|
"epoch": 0.7090226720810281,
|
|
"grad_norm": 0.34375,
|
|
"learning_rate": 5.949640544298779e-05,
|
|
"loss": 2.5612,
|
|
"step": 2484
|
|
},
|
|
{
|
|
"epoch": 0.7093081079393538,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 5.938808061182823e-05,
|
|
"loss": 2.5581,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 0.7095935437976795,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 5.927983013852614e-05,
|
|
"loss": 2.5476,
|
|
"step": 2486
|
|
},
|
|
{
|
|
"epoch": 0.7098789796560052,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 5.917165411191405e-05,
|
|
"loss": 2.5592,
|
|
"step": 2487
|
|
},
|
|
{
|
|
"epoch": 0.7101644155143308,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 5.906355262076317e-05,
|
|
"loss": 2.5649,
|
|
"step": 2488
|
|
},
|
|
{
|
|
"epoch": 0.7104498513726566,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 5.895552575378361e-05,
|
|
"loss": 2.5849,
|
|
"step": 2489
|
|
},
|
|
{
|
|
"epoch": 0.7107352872309822,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 5.8847573599624335e-05,
|
|
"loss": 2.5812,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.711020723089308,
|
|
"grad_norm": 0.365234375,
|
|
"learning_rate": 5.8739696246872853e-05,
|
|
"loss": 2.5425,
|
|
"step": 2491
|
|
},
|
|
{
|
|
"epoch": 0.7113061589476337,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 5.863189378405541e-05,
|
|
"loss": 2.554,
|
|
"step": 2492
|
|
},
|
|
{
|
|
"epoch": 0.7115915948059593,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 5.8524166299636785e-05,
|
|
"loss": 2.5374,
|
|
"step": 2493
|
|
},
|
|
{
|
|
"epoch": 0.7118770306642851,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 5.841651388202015e-05,
|
|
"loss": 2.5079,
|
|
"step": 2494
|
|
},
|
|
{
|
|
"epoch": 0.7121624665226107,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 5.8308936619547076e-05,
|
|
"loss": 2.5421,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 0.7124479023809365,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 5.820143460049759e-05,
|
|
"loss": 2.5617,
|
|
"step": 2496
|
|
},
|
|
{
|
|
"epoch": 0.7127333382392621,
|
|
"grad_norm": 0.3515625,
|
|
"learning_rate": 5.809400791308978e-05,
|
|
"loss": 2.5253,
|
|
"step": 2497
|
|
},
|
|
{
|
|
"epoch": 0.7130187740975878,
|
|
"grad_norm": 0.34765625,
|
|
"learning_rate": 5.798665664548015e-05,
|
|
"loss": 2.5518,
|
|
"step": 2498
|
|
},
|
|
{
|
|
"epoch": 0.7133042099559136,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 5.787938088576305e-05,
|
|
"loss": 2.5575,
|
|
"step": 2499
|
|
},
|
|
{
|
|
"epoch": 0.7135896458142392,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 5.777218072197113e-05,
|
|
"loss": 2.5604,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.7135896458142392,
|
|
"eval_loss": 2.4628705978393555,
|
|
"eval_runtime": 5982.5105,
|
|
"eval_samples_per_second": 10.746,
|
|
"eval_steps_per_second": 10.746,
|
|
"step": 2500
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 3503,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 250,
|
|
"total_flos": 9.70632734441472e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|