1022 lines
26 KiB
JSON
1022 lines
26 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"eval_steps": 50,
|
||
|
|
"global_step": 485,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.008247422680412371,
|
||
|
|
"grad_norm": 16.75,
|
||
|
|
"learning_rate": 6.666666666666667e-06,
|
||
|
|
"loss": 0.8829,
|
||
|
|
"num_input_tokens_seen": 1413808,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016494845360824743,
|
||
|
|
"grad_norm": 3.90625,
|
||
|
|
"learning_rate": 1.3333333333333333e-05,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"num_input_tokens_seen": 2866496,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024742268041237112,
|
||
|
|
"grad_norm": 5.0,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 0.2755,
|
||
|
|
"num_input_tokens_seen": 4305104,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.032989690721649485,
|
||
|
|
"grad_norm": 2.453125,
|
||
|
|
"learning_rate": 2.6666666666666667e-05,
|
||
|
|
"loss": 0.2531,
|
||
|
|
"num_input_tokens_seen": 5594128,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.041237113402061855,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 3.3333333333333335e-05,
|
||
|
|
"loss": 0.2585,
|
||
|
|
"num_input_tokens_seen": 6683376,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049484536082474224,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 4e-05,
|
||
|
|
"loss": 0.2308,
|
||
|
|
"num_input_tokens_seen": 8030336,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0577319587628866,
|
||
|
|
"grad_norm": 1.3671875,
|
||
|
|
"learning_rate": 3.9992569962849926e-05,
|
||
|
|
"loss": 0.2221,
|
||
|
|
"num_input_tokens_seen": 9395728,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06597938144329897,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 3.99702853719449e-05,
|
||
|
|
"loss": 0.2259,
|
||
|
|
"num_input_tokens_seen": 10689344,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07422680412371134,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 3.9933162784818745e-05,
|
||
|
|
"loss": 0.2201,
|
||
|
|
"num_input_tokens_seen": 11936704,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08247422680412371,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 3.988122978369162e-05,
|
||
|
|
"loss": 0.2242,
|
||
|
|
"num_input_tokens_seen": 13217248,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09072164948453608,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 3.981452495497628e-05,
|
||
|
|
"loss": 0.2213,
|
||
|
|
"num_input_tokens_seen": 14587328,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09896907216494845,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 3.973309786060829e-05,
|
||
|
|
"loss": 0.1958,
|
||
|
|
"num_input_tokens_seen": 15976464,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10721649484536082,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 3.963700900122124e-05,
|
||
|
|
"loss": 0.2136,
|
||
|
|
"num_input_tokens_seen": 17262576,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1154639175257732,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 3.952632977119465e-05,
|
||
|
|
"loss": 0.2059,
|
||
|
|
"num_input_tokens_seen": 18801264,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12371134020618557,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 3.9401142405607594e-05,
|
||
|
|
"loss": 0.197,
|
||
|
|
"num_input_tokens_seen": 20158000,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13195876288659794,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 3.9261539919137776e-05,
|
||
|
|
"loss": 0.2273,
|
||
|
|
"num_input_tokens_seen": 21322240,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1402061855670103,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 3.9107626036951266e-05,
|
||
|
|
"loss": 0.1971,
|
||
|
|
"num_input_tokens_seen": 22631360,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14845360824742268,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 3.8939515117634326e-05,
|
||
|
|
"loss": 0.2194,
|
||
|
|
"num_input_tokens_seen": 23848496,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15670103092783505,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 3.875733206822452e-05,
|
||
|
|
"loss": 0.2215,
|
||
|
|
"num_input_tokens_seen": 25148336,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16494845360824742,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 3.8561212251404406e-05,
|
||
|
|
"loss": 0.1989,
|
||
|
|
"num_input_tokens_seen": 26427264,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1731958762886598,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 3.835130138492644e-05,
|
||
|
|
"loss": 0.2072,
|
||
|
|
"num_input_tokens_seen": 27833024,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18144329896907216,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 3.812775543334425e-05,
|
||
|
|
"loss": 0.2013,
|
||
|
|
"num_input_tokens_seen": 29273008,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18969072164948453,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 3.789074049213033e-05,
|
||
|
|
"loss": 0.2119,
|
||
|
|
"num_input_tokens_seen": 30628416,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1979381443298969,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 3.7640432664266514e-05,
|
||
|
|
"loss": 0.2213,
|
||
|
|
"num_input_tokens_seen": 31861856,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20618556701030927,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 3.737701792939881e-05,
|
||
|
|
"loss": 0.2102,
|
||
|
|
"num_input_tokens_seen": 33121072,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21443298969072164,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 3.7100692005653796e-05,
|
||
|
|
"loss": 0.2052,
|
||
|
|
"num_input_tokens_seen": 34464560,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22268041237113403,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 3.681166020421938e-05,
|
||
|
|
"loss": 0.1942,
|
||
|
|
"num_input_tokens_seen": 35918800,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2309278350515464,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 3.6510137276797786e-05,
|
||
|
|
"loss": 0.1946,
|
||
|
|
"num_input_tokens_seen": 37267616,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23917525773195877,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 3.6196347256044236e-05,
|
||
|
|
"loss": 0.2263,
|
||
|
|
"num_input_tokens_seen": 38542608,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24742268041237114,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 3.5870523289109886e-05,
|
||
|
|
"loss": 0.2102,
|
||
|
|
"num_input_tokens_seen": 39934016,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2556701030927835,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 3.553290746441261e-05,
|
||
|
|
"loss": 0.2084,
|
||
|
|
"num_input_tokens_seen": 41070080,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2639175257731959,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 3.5183750631764406e-05,
|
||
|
|
"loss": 0.1939,
|
||
|
|
"num_input_tokens_seen": 42375696,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2721649484536082,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 3.4823312215989046e-05,
|
||
|
|
"loss": 0.2027,
|
||
|
|
"num_input_tokens_seen": 43648368,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2804123711340206,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 3.445186002416849e-05,
|
||
|
|
"loss": 0.2093,
|
||
|
|
"num_input_tokens_seen": 44952352,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28865979381443296,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 3.4069670046661197e-05,
|
||
|
|
"loss": 0.1887,
|
||
|
|
"num_input_tokens_seen": 46407584,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29690721649484536,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 3.3677026252040306e-05,
|
||
|
|
"loss": 0.2109,
|
||
|
|
"num_input_tokens_seen": 47649744,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30515463917525776,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 3.327422037610389e-05,
|
||
|
|
"loss": 0.2072,
|
||
|
|
"num_input_tokens_seen": 49014464,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3134020618556701,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 3.286155170511419e-05,
|
||
|
|
"loss": 0.2046,
|
||
|
|
"num_input_tokens_seen": 50443616,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3216494845360825,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 3.2439326853426824e-05,
|
||
|
|
"loss": 0.211,
|
||
|
|
"num_input_tokens_seen": 51801328,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32989690721649484,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 3.200785953567517e-05,
|
||
|
|
"loss": 0.1977,
|
||
|
|
"num_input_tokens_seen": 53112944,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33814432989690724,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 3.156747033367922e-05,
|
||
|
|
"loss": 0.2001,
|
||
|
|
"num_input_tokens_seen": 54444256,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3463917525773196,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 3.1118486458252094e-05,
|
||
|
|
"loss": 0.2,
|
||
|
|
"num_input_tokens_seen": 55882912,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.354639175257732,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 3.0661241506081236e-05,
|
||
|
|
"loss": 0.1997,
|
||
|
|
"num_input_tokens_seen": 57157872,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3628865979381443,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 3.019607521186475e-05,
|
||
|
|
"loss": 0.2085,
|
||
|
|
"num_input_tokens_seen": 58474160,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3711340206185567,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 2.972333319588736e-05,
|
||
|
|
"loss": 0.2093,
|
||
|
|
"num_input_tokens_seen": 59687904,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37938144329896906,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 2.9243366707223165e-05,
|
||
|
|
"loss": 0.1963,
|
||
|
|
"num_input_tokens_seen": 61006320,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38762886597938145,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 2.875653236275632e-05,
|
||
|
|
"loss": 0.2001,
|
||
|
|
"num_input_tokens_seen": 62265552,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3958762886597938,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 2.8263191882213362e-05,
|
||
|
|
"loss": 0.1948,
|
||
|
|
"num_input_tokens_seen": 63682384,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4041237113402062,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 2.7763711819404098e-05,
|
||
|
|
"loss": 0.2048,
|
||
|
|
"num_input_tokens_seen": 64848160,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41237113402061853,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 2.7258463289870764e-05,
|
||
|
|
"loss": 0.192,
|
||
|
|
"num_input_tokens_seen": 66278032,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42061855670103093,
|
||
|
|
"grad_norm": 0.80859375,
|
||
|
|
"learning_rate": 2.6747821695147806e-05,
|
||
|
|
"loss": 0.1933,
|
||
|
|
"num_input_tokens_seen": 67686560,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4288659793814433,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 2.623216644383715e-05,
|
||
|
|
"loss": 0.2094,
|
||
|
|
"num_input_tokens_seen": 68863776,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43711340206185567,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 2.5711880669706172e-05,
|
||
|
|
"loss": 0.1964,
|
||
|
|
"num_input_tokens_seen": 70186224,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44536082474226807,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 2.5187350947017918e-05,
|
||
|
|
"loss": 0.2042,
|
||
|
|
"num_input_tokens_seen": 71498112,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4536082474226804,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 2.4658967003304986e-05,
|
||
|
|
"loss": 0.1908,
|
||
|
|
"num_input_tokens_seen": 72880736,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4618556701030928,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 2.4127121429800498e-05,
|
||
|
|
"loss": 0.187,
|
||
|
|
"num_input_tokens_seen": 74122048,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47010309278350515,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 2.3592209389741372e-05,
|
||
|
|
"loss": 0.1778,
|
||
|
|
"num_input_tokens_seen": 75602400,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47835051546391755,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 2.30546283247606e-05,
|
||
|
|
"loss": 0.207,
|
||
|
|
"num_input_tokens_seen": 76746240,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4865979381443299,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 2.251477765958655e-05,
|
||
|
|
"loss": 0.1911,
|
||
|
|
"num_input_tokens_seen": 78209744,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4948453608247423,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 2.1973058505269007e-05,
|
||
|
|
"loss": 0.1935,
|
||
|
|
"num_input_tokens_seen": 79494896,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5030927835051546,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 2.1429873361152124e-05,
|
||
|
|
"loss": 0.1977,
|
||
|
|
"num_input_tokens_seen": 80721808,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.511340206185567,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 2.088562581581592e-05,
|
||
|
|
"loss": 0.1956,
|
||
|
|
"num_input_tokens_seen": 81918944,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5195876288659794,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 2.0340720247208447e-05,
|
||
|
|
"loss": 0.1912,
|
||
|
|
"num_input_tokens_seen": 83184064,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5278350515463918,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 1.9795561522191523e-05,
|
||
|
|
"loss": 0.1843,
|
||
|
|
"num_input_tokens_seen": 84574976,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5360824742268041,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 1.9250554695723107e-05,
|
||
|
|
"loss": 0.1942,
|
||
|
|
"num_input_tokens_seen": 85844768,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5443298969072164,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 1.8706104709899964e-05,
|
||
|
|
"loss": 0.1922,
|
||
|
|
"num_input_tokens_seen": 87245056,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5525773195876289,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 1.816261609308419e-05,
|
||
|
|
"loss": 0.182,
|
||
|
|
"num_input_tokens_seen": 88603792,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5608247422680412,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 1.7620492659337155e-05,
|
||
|
|
"loss": 0.1879,
|
||
|
|
"num_input_tokens_seen": 90054816,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5690721649484536,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 1.7080137208384122e-05,
|
||
|
|
"loss": 0.1809,
|
||
|
|
"num_input_tokens_seen": 91432912,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5773195876288659,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 1.6541951226332565e-05,
|
||
|
|
"loss": 0.1735,
|
||
|
|
"num_input_tokens_seen": 92795296,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5855670103092784,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 1.600633458736653e-05,
|
||
|
|
"loss": 0.1915,
|
||
|
|
"num_input_tokens_seen": 94071744,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5938144329896907,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 1.5473685256638572e-05,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"num_input_tokens_seen": 95342096,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6020618556701031,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 1.4944398994580232e-05,
|
||
|
|
"loss": 0.1869,
|
||
|
|
"num_input_tokens_seen": 96569312,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6103092783505155,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 1.4418869062850514e-05,
|
||
|
|
"loss": 0.2004,
|
||
|
|
"num_input_tokens_seen": 97849216,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6185567010309279,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 1.3897485932141042e-05,
|
||
|
|
"loss": 0.1865,
|
||
|
|
"num_input_tokens_seen": 99083488,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6268041237113402,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 1.3380636992054878e-05,
|
||
|
|
"loss": 0.1769,
|
||
|
|
"num_input_tokens_seen": 100566624,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6350515463917525,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 1.2868706263274602e-05,
|
||
|
|
"loss": 0.1969,
|
||
|
|
"num_input_tokens_seen": 101823872,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.643298969072165,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 1.236207411223353e-05,
|
||
|
|
"loss": 0.1767,
|
||
|
|
"num_input_tokens_seen": 103284176,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6515463917525773,
|
||
|
|
"grad_norm": 0.83984375,
|
||
|
|
"learning_rate": 1.1861116968502015e-05,
|
||
|
|
"loss": 0.1799,
|
||
|
|
"num_input_tokens_seen": 104567360,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6597938144329897,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 1.136620704509892e-05,
|
||
|
|
"loss": 0.1856,
|
||
|
|
"num_input_tokens_seen": 105872848,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.668041237113402,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 1.087771206193593e-05,
|
||
|
|
"loss": 0.1791,
|
||
|
|
"num_input_tokens_seen": 107217232,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6762886597938145,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 1.0395994972600285e-05,
|
||
|
|
"loss": 0.1806,
|
||
|
|
"num_input_tokens_seen": 108626976,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6845360824742268,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 9.921413694678959e-06,
|
||
|
|
"loss": 0.2018,
|
||
|
|
"num_input_tokens_seen": 109754000,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6927835051546392,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 9.454320843824512e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"num_input_tokens_seen": 111026592,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7010309278350515,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 8.995063471760377e-06,
|
||
|
|
"loss": 0.1885,
|
||
|
|
"num_input_tokens_seen": 112287760,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.709278350515464,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 8.543982808420156e-06,
|
||
|
|
"loss": 0.1838,
|
||
|
|
"num_input_tokens_seen": 113634128,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7175257731958763,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 8.101414008412469e-06,
|
||
|
|
"loss": 0.1842,
|
||
|
|
"num_input_tokens_seen": 114949760,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7257731958762886,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 7.667685901999875e-06,
|
||
|
|
"loss": 0.1935,
|
||
|
|
"num_input_tokens_seen": 116223648,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.734020618556701,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 7.24312075077674e-06,
|
||
|
|
"loss": 0.1866,
|
||
|
|
"num_input_tokens_seen": 117618112,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7422680412371134,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 6.828034008227678e-06,
|
||
|
|
"loss": 0.1751,
|
||
|
|
"num_input_tokens_seen": 119000256,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7505154639175258,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 6.422734085344464e-06,
|
||
|
|
"loss": 0.1796,
|
||
|
|
"num_input_tokens_seen": 120232672,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7587628865979381,
|
||
|
|
"grad_norm": 0.83984375,
|
||
|
|
"learning_rate": 6.027522121475482e-06,
|
||
|
|
"loss": 0.1783,
|
||
|
|
"num_input_tokens_seen": 121499376,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7670103092783506,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 5.642691760578116e-06,
|
||
|
|
"loss": 0.1856,
|
||
|
|
"num_input_tokens_seen": 122791312,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7752577319587629,
|
||
|
|
"grad_norm": 0.85546875,
|
||
|
|
"learning_rate": 5.268528933040147e-06,
|
||
|
|
"loss": 0.1674,
|
||
|
|
"num_input_tokens_seen": 124261040,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7835051546391752,
|
||
|
|
"grad_norm": 0.82421875,
|
||
|
|
"learning_rate": 4.905311643232464e-06,
|
||
|
|
"loss": 0.1773,
|
||
|
|
"num_input_tokens_seen": 125708848,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7917525773195876,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 4.553309762950739e-06,
|
||
|
|
"loss": 0.1905,
|
||
|
|
"num_input_tokens_seen": 126865712,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 4.212784830899725e-06,
|
||
|
|
"loss": 0.1793,
|
||
|
|
"num_input_tokens_seen": 128157040,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8082474226804124,
|
||
|
|
"grad_norm": 0.84375,
|
||
|
|
"learning_rate": 3.8839898583689725e-06,
|
||
|
|
"loss": 0.1812,
|
||
|
|
"num_input_tokens_seen": 129465312,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8164948453608247,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 3.567169141244562e-06,
|
||
|
|
"loss": 0.1813,
|
||
|
|
"num_input_tokens_seen": 130665504,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8247422680412371,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 3.262558078496301e-06,
|
||
|
|
"loss": 0.1727,
|
||
|
|
"num_input_tokens_seen": 132002896,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8329896907216495,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 2.9703829972754407e-06,
|
||
|
|
"loss": 0.1858,
|
||
|
|
"num_input_tokens_seen": 133415088,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8412371134020619,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 2.69086098475277e-06,
|
||
|
|
"loss": 0.1707,
|
||
|
|
"num_input_tokens_seen": 134815184,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8494845360824742,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 2.4241997268220096e-06,
|
||
|
|
"loss": 0.1822,
|
||
|
|
"num_input_tokens_seen": 136261472,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8577319587628865,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 2.1705973537884615e-06,
|
||
|
|
"loss": 0.1809,
|
||
|
|
"num_input_tokens_seen": 137429504,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.865979381443299,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 1.9302422931574183e-06,
|
||
|
|
"loss": 0.1885,
|
||
|
|
"num_input_tokens_seen": 138708544,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8742268041237113,
|
||
|
|
"grad_norm": 0.83984375,
|
||
|
|
"learning_rate": 1.7033131296318473e-06,
|
||
|
|
"loss": 0.1687,
|
||
|
|
"num_input_tokens_seen": 140033024,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8824742268041237,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 1.4899784724232968e-06,
|
||
|
|
"loss": 0.1748,
|
||
|
|
"num_input_tokens_seen": 141348192,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8907216494845361,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 1.2903968299746094e-06,
|
||
|
|
"loss": 0.1716,
|
||
|
|
"num_input_tokens_seen": 142800048,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8989690721649485,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 1.104716492187574e-06,
|
||
|
|
"loss": 0.1841,
|
||
|
|
"num_input_tokens_seen": 144156592,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9072164948453608,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 9.330754202429726e-07,
|
||
|
|
"loss": 0.1855,
|
||
|
|
"num_input_tokens_seen": 145334944,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9154639175257732,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 7.756011440948996e-07,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"num_input_tokens_seen": 146529728,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9237113402061856,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 6.324106677155573e-07,
|
||
|
|
"loss": 0.1841,
|
||
|
|
"num_input_tokens_seen": 147823952,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.931958762886598,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 5.036103821608485e-07,
|
||
|
|
"loss": 0.1838,
|
||
|
|
"num_input_tokens_seen": 149194048,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9402061855670103,
|
||
|
|
"grad_norm": 0.83984375,
|
||
|
|
"learning_rate": 3.892959865214363e-07,
|
||
|
|
"loss": 0.1773,
|
||
|
|
"num_input_tokens_seen": 150529248,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9484536082474226,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 2.8955241681795534e-07,
|
||
|
|
"loss": 0.1863,
|
||
|
|
"num_input_tokens_seen": 151864336,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9567010309278351,
|
||
|
|
"grad_norm": 0.765625,
|
||
|
|
"learning_rate": 2.044537828932458e-07,
|
||
|
|
"loss": 0.1803,
|
||
|
|
"num_input_tokens_seen": 153203872,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9649484536082474,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 1.3406331334845813e-07,
|
||
|
|
"loss": 0.1869,
|
||
|
|
"num_input_tokens_seen": 154513568,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9731958762886598,
|
||
|
|
"grad_norm": 0.8046875,
|
||
|
|
"learning_rate": 7.843330856396103e-08,
|
||
|
|
"loss": 0.1818,
|
||
|
|
"num_input_tokens_seen": 155739584,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9814432989690721,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 3.760510183997701e-08,
|
||
|
|
"loss": 0.1826,
|
||
|
|
"num_input_tokens_seen": 157087344,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9896907216494846,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 1.160902868577951e-08,
|
||
|
|
"loss": 0.1905,
|
||
|
|
"num_input_tokens_seen": 158352288,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9979381443298969,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 4.64404280295927e-10,
|
||
|
|
"loss": 0.1794,
|
||
|
|
"num_input_tokens_seen": 159671536,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"eval_loss": 0.09546061605215073,
|
||
|
|
"eval_runtime": 83.6615,
|
||
|
|
"eval_samples_per_second": 12.419,
|
||
|
|
"eval_steps_per_second": 0.394,
|
||
|
|
"num_input_tokens_seen": 160044208,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"num_input_tokens_seen": 160044208,
|
||
|
|
"step": 485,
|
||
|
|
"total_flos": 9.013493089079132e+17,
|
||
|
|
"train_loss": 0.20329311268845784,
|
||
|
|
"train_runtime": 14400.7959,
|
||
|
|
"train_samples_per_second": 4.305,
|
||
|
|
"train_steps_per_second": 0.034,
|
||
|
|
"train_tokens_per_second": 1385.946
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 4,
|
||
|
|
"max_steps": 485,
|
||
|
|
"num_input_tokens_seen": 160044208,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 0,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": false,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 9.013493089079132e+17,
|
||
|
|
"train_batch_size": 16,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|