7912 lines
206 KiB
JSON
7912 lines
206 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": 964,
|
||
|
|
"best_metric": 0.17627178132534027,
|
||
|
|
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_cola_42_1776331560/checkpoint-964",
|
||
|
|
"epoch": 5.0,
|
||
|
|
"eval_steps": 241,
|
||
|
|
"global_step": 4810,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.005197505197505198,
|
||
|
|
"grad_norm": 440.98797607421875,
|
||
|
|
"learning_rate": 4.158004158004159e-08,
|
||
|
|
"loss": 1.2917,
|
||
|
|
"num_input_tokens_seen": 2048,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010395010395010396,
|
||
|
|
"grad_norm": 396.0676574707031,
|
||
|
|
"learning_rate": 9.355509355509357e-08,
|
||
|
|
"loss": 1.2491,
|
||
|
|
"num_input_tokens_seen": 4224,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.015592515592515593,
|
||
|
|
"grad_norm": 480.88055419921875,
|
||
|
|
"learning_rate": 1.4553014553014554e-07,
|
||
|
|
"loss": 1.3117,
|
||
|
|
"num_input_tokens_seen": 6272,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02079002079002079,
|
||
|
|
"grad_norm": 408.1519775390625,
|
||
|
|
"learning_rate": 1.9750519750519752e-07,
|
||
|
|
"loss": 1.1366,
|
||
|
|
"num_input_tokens_seen": 8384,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02598752598752599,
|
||
|
|
"grad_norm": 239.34207153320312,
|
||
|
|
"learning_rate": 2.494802494802495e-07,
|
||
|
|
"loss": 0.7792,
|
||
|
|
"num_input_tokens_seen": 10496,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.031185031185031187,
|
||
|
|
"grad_norm": 36.67585754394531,
|
||
|
|
"learning_rate": 3.014553014553015e-07,
|
||
|
|
"loss": 0.523,
|
||
|
|
"num_input_tokens_seen": 12544,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.036382536382536385,
|
||
|
|
"grad_norm": 53.71092987060547,
|
||
|
|
"learning_rate": 3.534303534303535e-07,
|
||
|
|
"loss": 0.3194,
|
||
|
|
"num_input_tokens_seen": 14528,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04158004158004158,
|
||
|
|
"grad_norm": 72.29695129394531,
|
||
|
|
"learning_rate": 4.0540540540540546e-07,
|
||
|
|
"loss": 0.3482,
|
||
|
|
"num_input_tokens_seen": 16576,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04677754677754678,
|
||
|
|
"grad_norm": 158.2495880126953,
|
||
|
|
"learning_rate": 4.5738045738045745e-07,
|
||
|
|
"loss": 0.377,
|
||
|
|
"num_input_tokens_seen": 18560,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05197505197505198,
|
||
|
|
"grad_norm": 47.052833557128906,
|
||
|
|
"learning_rate": 5.093555093555094e-07,
|
||
|
|
"loss": 0.2695,
|
||
|
|
"num_input_tokens_seen": 20608,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.057172557172557176,
|
||
|
|
"grad_norm": 29.388397216796875,
|
||
|
|
"learning_rate": 5.613305613305614e-07,
|
||
|
|
"loss": 0.2431,
|
||
|
|
"num_input_tokens_seen": 22656,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.062370062370062374,
|
||
|
|
"grad_norm": 94.9759292602539,
|
||
|
|
"learning_rate": 6.133056133056134e-07,
|
||
|
|
"loss": 0.3162,
|
||
|
|
"num_input_tokens_seen": 24640,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06756756756756757,
|
||
|
|
"grad_norm": 108.12734985351562,
|
||
|
|
"learning_rate": 6.652806652806654e-07,
|
||
|
|
"loss": 0.3171,
|
||
|
|
"num_input_tokens_seen": 26752,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07276507276507277,
|
||
|
|
"grad_norm": 64.33253479003906,
|
||
|
|
"learning_rate": 7.172557172557173e-07,
|
||
|
|
"loss": 0.4943,
|
||
|
|
"num_input_tokens_seen": 28608,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07796257796257797,
|
||
|
|
"grad_norm": 85.49552154541016,
|
||
|
|
"learning_rate": 7.692307692307694e-07,
|
||
|
|
"loss": 0.3067,
|
||
|
|
"num_input_tokens_seen": 30912,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08316008316008316,
|
||
|
|
"grad_norm": 71.20551300048828,
|
||
|
|
"learning_rate": 8.212058212058213e-07,
|
||
|
|
"loss": 0.5316,
|
||
|
|
"num_input_tokens_seen": 32896,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08835758835758836,
|
||
|
|
"grad_norm": 8.865546226501465,
|
||
|
|
"learning_rate": 8.731808731808733e-07,
|
||
|
|
"loss": 0.3228,
|
||
|
|
"num_input_tokens_seen": 34816,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09355509355509356,
|
||
|
|
"grad_norm": 39.09188461303711,
|
||
|
|
"learning_rate": 9.251559251559253e-07,
|
||
|
|
"loss": 0.3339,
|
||
|
|
"num_input_tokens_seen": 36736,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09875259875259876,
|
||
|
|
"grad_norm": 8.73692512512207,
|
||
|
|
"learning_rate": 9.771309771309773e-07,
|
||
|
|
"loss": 0.2951,
|
||
|
|
"num_input_tokens_seen": 38720,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10395010395010396,
|
||
|
|
"grad_norm": 16.94306182861328,
|
||
|
|
"learning_rate": 1.0291060291060292e-06,
|
||
|
|
"loss": 0.2279,
|
||
|
|
"num_input_tokens_seen": 40640,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10914760914760915,
|
||
|
|
"grad_norm": 23.53331756591797,
|
||
|
|
"learning_rate": 1.0810810810810812e-06,
|
||
|
|
"loss": 0.2562,
|
||
|
|
"num_input_tokens_seen": 42688,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11434511434511435,
|
||
|
|
"grad_norm": 104.1171646118164,
|
||
|
|
"learning_rate": 1.1330561330561333e-06,
|
||
|
|
"loss": 0.2794,
|
||
|
|
"num_input_tokens_seen": 44544,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11954261954261955,
|
||
|
|
"grad_norm": 18.66437530517578,
|
||
|
|
"learning_rate": 1.1850311850311852e-06,
|
||
|
|
"loss": 0.2511,
|
||
|
|
"num_input_tokens_seen": 46400,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12474012474012475,
|
||
|
|
"grad_norm": 52.965763092041016,
|
||
|
|
"learning_rate": 1.2370062370062372e-06,
|
||
|
|
"loss": 0.2503,
|
||
|
|
"num_input_tokens_seen": 48448,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12993762993762994,
|
||
|
|
"grad_norm": 38.35150146484375,
|
||
|
|
"learning_rate": 1.288981288981289e-06,
|
||
|
|
"loss": 0.3088,
|
||
|
|
"num_input_tokens_seen": 50496,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13513513513513514,
|
||
|
|
"grad_norm": 67.04692840576172,
|
||
|
|
"learning_rate": 1.340956340956341e-06,
|
||
|
|
"loss": 0.2409,
|
||
|
|
"num_input_tokens_seen": 52416,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14033264033264034,
|
||
|
|
"grad_norm": 44.75727844238281,
|
||
|
|
"learning_rate": 1.3929313929313932e-06,
|
||
|
|
"loss": 0.2575,
|
||
|
|
"num_input_tokens_seen": 54464,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14553014553014554,
|
||
|
|
"grad_norm": 21.614540100097656,
|
||
|
|
"learning_rate": 1.4449064449064451e-06,
|
||
|
|
"loss": 0.2283,
|
||
|
|
"num_input_tokens_seen": 56448,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15072765072765074,
|
||
|
|
"grad_norm": 66.04798126220703,
|
||
|
|
"learning_rate": 1.496881496881497e-06,
|
||
|
|
"loss": 0.2004,
|
||
|
|
"num_input_tokens_seen": 58368,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15592515592515593,
|
||
|
|
"grad_norm": 141.60061645507812,
|
||
|
|
"learning_rate": 1.548856548856549e-06,
|
||
|
|
"loss": 0.3166,
|
||
|
|
"num_input_tokens_seen": 60544,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16112266112266113,
|
||
|
|
"grad_norm": 102.77252960205078,
|
||
|
|
"learning_rate": 1.6008316008316011e-06,
|
||
|
|
"loss": 0.3419,
|
||
|
|
"num_input_tokens_seen": 62592,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16632016632016633,
|
||
|
|
"grad_norm": 21.740901947021484,
|
||
|
|
"learning_rate": 1.652806652806653e-06,
|
||
|
|
"loss": 0.2605,
|
||
|
|
"num_input_tokens_seen": 64576,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17151767151767153,
|
||
|
|
"grad_norm": 58.935909271240234,
|
||
|
|
"learning_rate": 1.704781704781705e-06,
|
||
|
|
"loss": 0.2617,
|
||
|
|
"num_input_tokens_seen": 66688,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17671517671517672,
|
||
|
|
"grad_norm": 14.498135566711426,
|
||
|
|
"learning_rate": 1.756756756756757e-06,
|
||
|
|
"loss": 0.2505,
|
||
|
|
"num_input_tokens_seen": 68544,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18191268191268192,
|
||
|
|
"grad_norm": 55.772621154785156,
|
||
|
|
"learning_rate": 1.808731808731809e-06,
|
||
|
|
"loss": 0.2672,
|
||
|
|
"num_input_tokens_seen": 70592,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18711018711018712,
|
||
|
|
"grad_norm": 37.71332931518555,
|
||
|
|
"learning_rate": 1.860706860706861e-06,
|
||
|
|
"loss": 0.2488,
|
||
|
|
"num_input_tokens_seen": 72576,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19230769230769232,
|
||
|
|
"grad_norm": 14.720597267150879,
|
||
|
|
"learning_rate": 1.912681912681913e-06,
|
||
|
|
"loss": 0.1863,
|
||
|
|
"num_input_tokens_seen": 74624,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19750519750519752,
|
||
|
|
"grad_norm": 25.971519470214844,
|
||
|
|
"learning_rate": 1.964656964656965e-06,
|
||
|
|
"loss": 0.1639,
|
||
|
|
"num_input_tokens_seen": 76608,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20270270270270271,
|
||
|
|
"grad_norm": 135.57969665527344,
|
||
|
|
"learning_rate": 2.016632016632017e-06,
|
||
|
|
"loss": 0.3799,
|
||
|
|
"num_input_tokens_seen": 78720,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2079002079002079,
|
||
|
|
"grad_norm": 85.43441009521484,
|
||
|
|
"learning_rate": 2.0686070686070687e-06,
|
||
|
|
"loss": 0.455,
|
||
|
|
"num_input_tokens_seen": 81152,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2130977130977131,
|
||
|
|
"grad_norm": 21.497081756591797,
|
||
|
|
"learning_rate": 2.120582120582121e-06,
|
||
|
|
"loss": 0.2303,
|
||
|
|
"num_input_tokens_seen": 83200,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2182952182952183,
|
||
|
|
"grad_norm": 43.83966064453125,
|
||
|
|
"learning_rate": 2.172557172557173e-06,
|
||
|
|
"loss": 0.2985,
|
||
|
|
"num_input_tokens_seen": 85184,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2234927234927235,
|
||
|
|
"grad_norm": 21.950963973999023,
|
||
|
|
"learning_rate": 2.2245322245322247e-06,
|
||
|
|
"loss": 0.1462,
|
||
|
|
"num_input_tokens_seen": 87232,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2286902286902287,
|
||
|
|
"grad_norm": 16.27555274963379,
|
||
|
|
"learning_rate": 2.276507276507277e-06,
|
||
|
|
"loss": 0.2323,
|
||
|
|
"num_input_tokens_seen": 89152,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2338877338877339,
|
||
|
|
"grad_norm": 20.64867401123047,
|
||
|
|
"learning_rate": 2.3284823284823286e-06,
|
||
|
|
"loss": 0.3453,
|
||
|
|
"num_input_tokens_seen": 91328,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2390852390852391,
|
||
|
|
"grad_norm": 17.483060836791992,
|
||
|
|
"learning_rate": 2.3804573804573807e-06,
|
||
|
|
"loss": 0.1965,
|
||
|
|
"num_input_tokens_seen": 93312,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2442827442827443,
|
||
|
|
"grad_norm": 18.55425262451172,
|
||
|
|
"learning_rate": 2.432432432432433e-06,
|
||
|
|
"loss": 0.186,
|
||
|
|
"num_input_tokens_seen": 95296,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2494802494802495,
|
||
|
|
"grad_norm": 59.2343635559082,
|
||
|
|
"learning_rate": 2.4844074844074846e-06,
|
||
|
|
"loss": 0.2021,
|
||
|
|
"num_input_tokens_seen": 97216,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2505197505197505,
|
||
|
|
"eval_loss": 0.2780425250530243,
|
||
|
|
"eval_runtime": 1.0326,
|
||
|
|
"eval_samples_per_second": 829.001,
|
||
|
|
"eval_steps_per_second": 103.625,
|
||
|
|
"num_input_tokens_seen": 97664,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25467775467775466,
|
||
|
|
"grad_norm": 32.85165023803711,
|
||
|
|
"learning_rate": 2.5363825363825367e-06,
|
||
|
|
"loss": 0.2316,
|
||
|
|
"num_input_tokens_seen": 99264,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2598752598752599,
|
||
|
|
"grad_norm": 52.35283660888672,
|
||
|
|
"learning_rate": 2.5883575883575885e-06,
|
||
|
|
"loss": 0.2509,
|
||
|
|
"num_input_tokens_seen": 101184,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26507276507276506,
|
||
|
|
"grad_norm": 18.137977600097656,
|
||
|
|
"learning_rate": 2.6403326403326406e-06,
|
||
|
|
"loss": 0.1696,
|
||
|
|
"num_input_tokens_seen": 103296,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2702702702702703,
|
||
|
|
"grad_norm": 35.994441986083984,
|
||
|
|
"learning_rate": 2.6923076923076923e-06,
|
||
|
|
"loss": 0.3451,
|
||
|
|
"num_input_tokens_seen": 105344,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27546777546777546,
|
||
|
|
"grad_norm": 18.656810760498047,
|
||
|
|
"learning_rate": 2.7442827442827445e-06,
|
||
|
|
"loss": 0.1851,
|
||
|
|
"num_input_tokens_seen": 107392,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2806652806652807,
|
||
|
|
"grad_norm": 21.467487335205078,
|
||
|
|
"learning_rate": 2.796257796257796e-06,
|
||
|
|
"loss": 0.2177,
|
||
|
|
"num_input_tokens_seen": 109440,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28586278586278585,
|
||
|
|
"grad_norm": 61.47985076904297,
|
||
|
|
"learning_rate": 2.8482328482328488e-06,
|
||
|
|
"loss": 0.2767,
|
||
|
|
"num_input_tokens_seen": 111424,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2910602910602911,
|
||
|
|
"grad_norm": 38.212928771972656,
|
||
|
|
"learning_rate": 2.9002079002079005e-06,
|
||
|
|
"loss": 0.3802,
|
||
|
|
"num_input_tokens_seen": 113408,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29625779625779625,
|
||
|
|
"grad_norm": 29.204410552978516,
|
||
|
|
"learning_rate": 2.9521829521829526e-06,
|
||
|
|
"loss": 0.2173,
|
||
|
|
"num_input_tokens_seen": 115392,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30145530145530147,
|
||
|
|
"grad_norm": 38.67171859741211,
|
||
|
|
"learning_rate": 3.0041580041580043e-06,
|
||
|
|
"loss": 0.2155,
|
||
|
|
"num_input_tokens_seen": 117440,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30665280665280664,
|
||
|
|
"grad_norm": 17.80721664428711,
|
||
|
|
"learning_rate": 3.0561330561330565e-06,
|
||
|
|
"loss": 0.2116,
|
||
|
|
"num_input_tokens_seen": 119424,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31185031185031187,
|
||
|
|
"grad_norm": 30.9665470123291,
|
||
|
|
"learning_rate": 3.1081081081081082e-06,
|
||
|
|
"loss": 0.2158,
|
||
|
|
"num_input_tokens_seen": 121344,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31704781704781704,
|
||
|
|
"grad_norm": 24.32500648498535,
|
||
|
|
"learning_rate": 3.1600831600831604e-06,
|
||
|
|
"loss": 0.2048,
|
||
|
|
"num_input_tokens_seen": 123264,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32224532224532226,
|
||
|
|
"grad_norm": 24.350160598754883,
|
||
|
|
"learning_rate": 3.212058212058212e-06,
|
||
|
|
"loss": 0.2194,
|
||
|
|
"num_input_tokens_seen": 125184,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32744282744282743,
|
||
|
|
"grad_norm": 17.5281982421875,
|
||
|
|
"learning_rate": 3.2640332640332646e-06,
|
||
|
|
"loss": 0.1868,
|
||
|
|
"num_input_tokens_seen": 127296,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33264033264033266,
|
||
|
|
"grad_norm": 30.43378257751465,
|
||
|
|
"learning_rate": 3.3160083160083164e-06,
|
||
|
|
"loss": 0.217,
|
||
|
|
"num_input_tokens_seen": 129408,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33783783783783783,
|
||
|
|
"grad_norm": 7.501856327056885,
|
||
|
|
"learning_rate": 3.3679833679833685e-06,
|
||
|
|
"loss": 0.179,
|
||
|
|
"num_input_tokens_seen": 131520,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34303534303534305,
|
||
|
|
"grad_norm": 37.26128005981445,
|
||
|
|
"learning_rate": 3.4199584199584202e-06,
|
||
|
|
"loss": 0.2437,
|
||
|
|
"num_input_tokens_seen": 133568,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3482328482328482,
|
||
|
|
"grad_norm": 50.59195327758789,
|
||
|
|
"learning_rate": 3.4719334719334724e-06,
|
||
|
|
"loss": 0.1981,
|
||
|
|
"num_input_tokens_seen": 135616,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35343035343035345,
|
||
|
|
"grad_norm": 27.73749542236328,
|
||
|
|
"learning_rate": 3.523908523908524e-06,
|
||
|
|
"loss": 0.3892,
|
||
|
|
"num_input_tokens_seen": 137664,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3586278586278586,
|
||
|
|
"grad_norm": 25.262042999267578,
|
||
|
|
"learning_rate": 3.5758835758835762e-06,
|
||
|
|
"loss": 0.1327,
|
||
|
|
"num_input_tokens_seen": 139584,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36382536382536385,
|
||
|
|
"grad_norm": 33.850399017333984,
|
||
|
|
"learning_rate": 3.627858627858628e-06,
|
||
|
|
"loss": 0.2165,
|
||
|
|
"num_input_tokens_seen": 141504,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.369022869022869,
|
||
|
|
"grad_norm": 40.24353790283203,
|
||
|
|
"learning_rate": 3.6798336798336805e-06,
|
||
|
|
"loss": 0.2907,
|
||
|
|
"num_input_tokens_seen": 143552,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37422037422037424,
|
||
|
|
"grad_norm": 26.471458435058594,
|
||
|
|
"learning_rate": 3.7318087318087322e-06,
|
||
|
|
"loss": 0.3012,
|
||
|
|
"num_input_tokens_seen": 145536,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3794178794178794,
|
||
|
|
"grad_norm": 24.082931518554688,
|
||
|
|
"learning_rate": 3.7837837837837844e-06,
|
||
|
|
"loss": 0.2293,
|
||
|
|
"num_input_tokens_seen": 147456,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38461538461538464,
|
||
|
|
"grad_norm": 17.22445297241211,
|
||
|
|
"learning_rate": 3.835758835758836e-06,
|
||
|
|
"loss": 0.1782,
|
||
|
|
"num_input_tokens_seen": 149440,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3898128898128898,
|
||
|
|
"grad_norm": 28.956668853759766,
|
||
|
|
"learning_rate": 3.887733887733889e-06,
|
||
|
|
"loss": 0.3696,
|
||
|
|
"num_input_tokens_seen": 151360,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39501039501039503,
|
||
|
|
"grad_norm": 33.45877456665039,
|
||
|
|
"learning_rate": 3.9397089397089396e-06,
|
||
|
|
"loss": 0.309,
|
||
|
|
"num_input_tokens_seen": 153344,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4002079002079002,
|
||
|
|
"grad_norm": 5.932255744934082,
|
||
|
|
"learning_rate": 3.991683991683992e-06,
|
||
|
|
"loss": 0.2413,
|
||
|
|
"num_input_tokens_seen": 155264,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40540540540540543,
|
||
|
|
"grad_norm": 21.102771759033203,
|
||
|
|
"learning_rate": 4.043659043659044e-06,
|
||
|
|
"loss": 0.3064,
|
||
|
|
"num_input_tokens_seen": 157248,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4106029106029106,
|
||
|
|
"grad_norm": 4.685667991638184,
|
||
|
|
"learning_rate": 4.095634095634096e-06,
|
||
|
|
"loss": 0.2798,
|
||
|
|
"num_input_tokens_seen": 159296,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4158004158004158,
|
||
|
|
"grad_norm": 17.0113525390625,
|
||
|
|
"learning_rate": 4.147609147609148e-06,
|
||
|
|
"loss": 0.3488,
|
||
|
|
"num_input_tokens_seen": 161344,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.420997920997921,
|
||
|
|
"grad_norm": 6.919645309448242,
|
||
|
|
"learning_rate": 4.1995841995842e-06,
|
||
|
|
"loss": 0.2072,
|
||
|
|
"num_input_tokens_seen": 163328,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4261954261954262,
|
||
|
|
"grad_norm": 69.12760925292969,
|
||
|
|
"learning_rate": 4.2515592515592516e-06,
|
||
|
|
"loss": 0.1704,
|
||
|
|
"num_input_tokens_seen": 165312,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4313929313929314,
|
||
|
|
"grad_norm": 5.609197616577148,
|
||
|
|
"learning_rate": 4.303534303534304e-06,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"num_input_tokens_seen": 167360,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4365904365904366,
|
||
|
|
"grad_norm": 99.32202911376953,
|
||
|
|
"learning_rate": 4.355509355509356e-06,
|
||
|
|
"loss": 0.9576,
|
||
|
|
"num_input_tokens_seen": 169344,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4417879417879418,
|
||
|
|
"grad_norm": 10.58239459991455,
|
||
|
|
"learning_rate": 4.4074844074844084e-06,
|
||
|
|
"loss": 0.3222,
|
||
|
|
"num_input_tokens_seen": 171456,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.446985446985447,
|
||
|
|
"grad_norm": 20.188488006591797,
|
||
|
|
"learning_rate": 4.45945945945946e-06,
|
||
|
|
"loss": 0.3442,
|
||
|
|
"num_input_tokens_seen": 173568,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4521829521829522,
|
||
|
|
"grad_norm": 8.674958229064941,
|
||
|
|
"learning_rate": 4.511434511434512e-06,
|
||
|
|
"loss": 0.1851,
|
||
|
|
"num_input_tokens_seen": 175552,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4573804573804574,
|
||
|
|
"grad_norm": 22.08028793334961,
|
||
|
|
"learning_rate": 4.563409563409564e-06,
|
||
|
|
"loss": 0.2573,
|
||
|
|
"num_input_tokens_seen": 177536,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4625779625779626,
|
||
|
|
"grad_norm": 11.568997383117676,
|
||
|
|
"learning_rate": 4.615384615384616e-06,
|
||
|
|
"loss": 0.2972,
|
||
|
|
"num_input_tokens_seen": 179584,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4677754677754678,
|
||
|
|
"grad_norm": 6.849438190460205,
|
||
|
|
"learning_rate": 4.667359667359668e-06,
|
||
|
|
"loss": 0.2247,
|
||
|
|
"num_input_tokens_seen": 181568,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47297297297297297,
|
||
|
|
"grad_norm": 3.9055252075195312,
|
||
|
|
"learning_rate": 4.71933471933472e-06,
|
||
|
|
"loss": 0.2355,
|
||
|
|
"num_input_tokens_seen": 183552,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4781704781704782,
|
||
|
|
"grad_norm": 20.351293563842773,
|
||
|
|
"learning_rate": 4.771309771309771e-06,
|
||
|
|
"loss": 0.1821,
|
||
|
|
"num_input_tokens_seen": 185600,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48336798336798337,
|
||
|
|
"grad_norm": 21.34255599975586,
|
||
|
|
"learning_rate": 4.823284823284824e-06,
|
||
|
|
"loss": 0.1938,
|
||
|
|
"num_input_tokens_seen": 187584,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4885654885654886,
|
||
|
|
"grad_norm": 28.844085693359375,
|
||
|
|
"learning_rate": 4.875259875259876e-06,
|
||
|
|
"loss": 0.2747,
|
||
|
|
"num_input_tokens_seen": 189568,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49376299376299376,
|
||
|
|
"grad_norm": 14.666620254516602,
|
||
|
|
"learning_rate": 4.927234927234928e-06,
|
||
|
|
"loss": 0.2394,
|
||
|
|
"num_input_tokens_seen": 191680,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.498960498960499,
|
||
|
|
"grad_norm": 23.078649520874023,
|
||
|
|
"learning_rate": 4.97920997920998e-06,
|
||
|
|
"loss": 0.2402,
|
||
|
|
"num_input_tokens_seen": 193728,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.501039501039501,
|
||
|
|
"eval_loss": 0.20022711157798767,
|
||
|
|
"eval_runtime": 1.0474,
|
||
|
|
"eval_samples_per_second": 817.231,
|
||
|
|
"eval_steps_per_second": 102.154,
|
||
|
|
"num_input_tokens_seen": 194560,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5041580041580042,
|
||
|
|
"grad_norm": 49.961700439453125,
|
||
|
|
"learning_rate": 4.999994075155936e-06,
|
||
|
|
"loss": 0.1873,
|
||
|
|
"num_input_tokens_seen": 195776,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5093555093555093,
|
||
|
|
"grad_norm": 21.11049461364746,
|
||
|
|
"learning_rate": 4.999957867877242e-06,
|
||
|
|
"loss": 0.1905,
|
||
|
|
"num_input_tokens_seen": 197696,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5145530145530145,
|
||
|
|
"grad_norm": 40.248802185058594,
|
||
|
|
"learning_rate": 4.999888745376028e-06,
|
||
|
|
"loss": 0.1952,
|
||
|
|
"num_input_tokens_seen": 199680,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5197505197505198,
|
||
|
|
"grad_norm": 25.25174903869629,
|
||
|
|
"learning_rate": 4.999786708562382e-06,
|
||
|
|
"loss": 0.2149,
|
||
|
|
"num_input_tokens_seen": 201792,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.524948024948025,
|
||
|
|
"grad_norm": 30.329490661621094,
|
||
|
|
"learning_rate": 4.999651758779753e-06,
|
||
|
|
"loss": 0.2066,
|
||
|
|
"num_input_tokens_seen": 203840,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5301455301455301,
|
||
|
|
"grad_norm": 23.636180877685547,
|
||
|
|
"learning_rate": 4.999483897804933e-06,
|
||
|
|
"loss": 0.2161,
|
||
|
|
"num_input_tokens_seen": 205824,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5353430353430353,
|
||
|
|
"grad_norm": 29.035306930541992,
|
||
|
|
"learning_rate": 4.999283127848029e-06,
|
||
|
|
"loss": 0.1777,
|
||
|
|
"num_input_tokens_seen": 207936,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5405405405405406,
|
||
|
|
"grad_norm": 21.316884994506836,
|
||
|
|
"learning_rate": 4.999049451552443e-06,
|
||
|
|
"loss": 0.1931,
|
||
|
|
"num_input_tokens_seen": 209984,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5457380457380457,
|
||
|
|
"grad_norm": 39.675086975097656,
|
||
|
|
"learning_rate": 4.998782871994828e-06,
|
||
|
|
"loss": 0.3235,
|
||
|
|
"num_input_tokens_seen": 212096,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5509355509355509,
|
||
|
|
"grad_norm": 20.291854858398438,
|
||
|
|
"learning_rate": 4.998483392685055e-06,
|
||
|
|
"loss": 0.2083,
|
||
|
|
"num_input_tokens_seen": 214080,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5561330561330561,
|
||
|
|
"grad_norm": 11.547039985656738,
|
||
|
|
"learning_rate": 4.9981510175661606e-06,
|
||
|
|
"loss": 0.2592,
|
||
|
|
"num_input_tokens_seen": 216128,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5613305613305614,
|
||
|
|
"grad_norm": 14.435676574707031,
|
||
|
|
"learning_rate": 4.9977857510143e-06,
|
||
|
|
"loss": 0.2199,
|
||
|
|
"num_input_tokens_seen": 218176,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5665280665280665,
|
||
|
|
"grad_norm": 11.747395515441895,
|
||
|
|
"learning_rate": 4.997387597838684e-06,
|
||
|
|
"loss": 0.1414,
|
||
|
|
"num_input_tokens_seen": 220096,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5717255717255717,
|
||
|
|
"grad_norm": 39.84230422973633,
|
||
|
|
"learning_rate": 4.996956563281524e-06,
|
||
|
|
"loss": 0.1874,
|
||
|
|
"num_input_tokens_seen": 222080,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5769230769230769,
|
||
|
|
"grad_norm": 41.40126419067383,
|
||
|
|
"learning_rate": 4.996492653017953e-06,
|
||
|
|
"loss": 0.2643,
|
||
|
|
"num_input_tokens_seen": 224000,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5821205821205822,
|
||
|
|
"grad_norm": 26.15458106994629,
|
||
|
|
"learning_rate": 4.995995873155958e-06,
|
||
|
|
"loss": 0.2975,
|
||
|
|
"num_input_tokens_seen": 225984,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5873180873180873,
|
||
|
|
"grad_norm": 17.151378631591797,
|
||
|
|
"learning_rate": 4.995466230236298e-06,
|
||
|
|
"loss": 0.1955,
|
||
|
|
"num_input_tokens_seen": 227840,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5925155925155925,
|
||
|
|
"grad_norm": 15.602777481079102,
|
||
|
|
"learning_rate": 4.994903731232415e-06,
|
||
|
|
"loss": 0.2476,
|
||
|
|
"num_input_tokens_seen": 229824,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5977130977130977,
|
||
|
|
"grad_norm": 6.400282859802246,
|
||
|
|
"learning_rate": 4.994308383550347e-06,
|
||
|
|
"loss": 0.213,
|
||
|
|
"num_input_tokens_seen": 231872,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6029106029106029,
|
||
|
|
"grad_norm": 21.05705451965332,
|
||
|
|
"learning_rate": 4.993680195028626e-06,
|
||
|
|
"loss": 0.2039,
|
||
|
|
"num_input_tokens_seen": 233920,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6081081081081081,
|
||
|
|
"grad_norm": 20.211143493652344,
|
||
|
|
"learning_rate": 4.993019173938178e-06,
|
||
|
|
"loss": 0.2036,
|
||
|
|
"num_input_tokens_seen": 235840,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6133056133056133,
|
||
|
|
"grad_norm": 7.714716911315918,
|
||
|
|
"learning_rate": 4.992325328982212e-06,
|
||
|
|
"loss": 0.2111,
|
||
|
|
"num_input_tokens_seen": 238016,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6185031185031185,
|
||
|
|
"grad_norm": 11.061738967895508,
|
||
|
|
"learning_rate": 4.991598669296105e-06,
|
||
|
|
"loss": 0.1706,
|
||
|
|
"num_input_tokens_seen": 240064,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6237006237006237,
|
||
|
|
"grad_norm": 37.05807113647461,
|
||
|
|
"learning_rate": 4.990839204447287e-06,
|
||
|
|
"loss": 0.2236,
|
||
|
|
"num_input_tokens_seen": 242048,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6288981288981289,
|
||
|
|
"grad_norm": 23.512836456298828,
|
||
|
|
"learning_rate": 4.990046944435105e-06,
|
||
|
|
"loss": 0.1908,
|
||
|
|
"num_input_tokens_seen": 243968,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6340956340956341,
|
||
|
|
"grad_norm": 26.345306396484375,
|
||
|
|
"learning_rate": 4.989221899690704e-06,
|
||
|
|
"loss": 0.2409,
|
||
|
|
"num_input_tokens_seen": 246016,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6392931392931392,
|
||
|
|
"grad_norm": 8.184889793395996,
|
||
|
|
"learning_rate": 4.988364081076877e-06,
|
||
|
|
"loss": 0.2135,
|
||
|
|
"num_input_tokens_seen": 248000,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6444906444906445,
|
||
|
|
"grad_norm": 8.331661224365234,
|
||
|
|
"learning_rate": 4.987473499887932e-06,
|
||
|
|
"loss": 0.203,
|
||
|
|
"num_input_tokens_seen": 250048,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6496881496881497,
|
||
|
|
"grad_norm": 19.97974967956543,
|
||
|
|
"learning_rate": 4.986550167849538e-06,
|
||
|
|
"loss": 0.1867,
|
||
|
|
"num_input_tokens_seen": 252096,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6548856548856549,
|
||
|
|
"grad_norm": 15.157999992370605,
|
||
|
|
"learning_rate": 4.9855940971185705e-06,
|
||
|
|
"loss": 0.1162,
|
||
|
|
"num_input_tokens_seen": 254144,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.66008316008316,
|
||
|
|
"grad_norm": 9.33337116241455,
|
||
|
|
"learning_rate": 4.984605300282955e-06,
|
||
|
|
"loss": 0.2562,
|
||
|
|
"num_input_tokens_seen": 256128,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6652806652806653,
|
||
|
|
"grad_norm": 28.826885223388672,
|
||
|
|
"learning_rate": 4.983583790361497e-06,
|
||
|
|
"loss": 0.1389,
|
||
|
|
"num_input_tokens_seen": 258048,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6704781704781705,
|
||
|
|
"grad_norm": 48.085391998291016,
|
||
|
|
"learning_rate": 4.982529580803714e-06,
|
||
|
|
"loss": 0.3054,
|
||
|
|
"num_input_tokens_seen": 260352,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6756756756756757,
|
||
|
|
"grad_norm": 24.728063583374023,
|
||
|
|
"learning_rate": 4.981442685489659e-06,
|
||
|
|
"loss": 0.2884,
|
||
|
|
"num_input_tokens_seen": 262272,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6808731808731808,
|
||
|
|
"grad_norm": 24.10839080810547,
|
||
|
|
"learning_rate": 4.9803231187297305e-06,
|
||
|
|
"loss": 0.1599,
|
||
|
|
"num_input_tokens_seen": 264320,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6860706860706861,
|
||
|
|
"grad_norm": 10.08352279663086,
|
||
|
|
"learning_rate": 4.979170895264494e-06,
|
||
|
|
"loss": 0.1946,
|
||
|
|
"num_input_tokens_seen": 266240,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6912681912681913,
|
||
|
|
"grad_norm": 17.471120834350586,
|
||
|
|
"learning_rate": 4.977986030264483e-06,
|
||
|
|
"loss": 0.2128,
|
||
|
|
"num_input_tokens_seen": 268224,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6964656964656964,
|
||
|
|
"grad_norm": 19.734243392944336,
|
||
|
|
"learning_rate": 4.9767685393299946e-06,
|
||
|
|
"loss": 0.2326,
|
||
|
|
"num_input_tokens_seen": 270272,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7016632016632016,
|
||
|
|
"grad_norm": 8.745848655700684,
|
||
|
|
"learning_rate": 4.975518438490897e-06,
|
||
|
|
"loss": 0.2276,
|
||
|
|
"num_input_tokens_seen": 272256,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7068607068607069,
|
||
|
|
"grad_norm": 24.683629989624023,
|
||
|
|
"learning_rate": 4.974235744206405e-06,
|
||
|
|
"loss": 0.1786,
|
||
|
|
"num_input_tokens_seen": 274240,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7120582120582121,
|
||
|
|
"grad_norm": 32.86091232299805,
|
||
|
|
"learning_rate": 4.972920473364869e-06,
|
||
|
|
"loss": 0.1923,
|
||
|
|
"num_input_tokens_seen": 276288,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7172557172557172,
|
||
|
|
"grad_norm": 13.548423767089844,
|
||
|
|
"learning_rate": 4.971572643283557e-06,
|
||
|
|
"loss": 0.1661,
|
||
|
|
"num_input_tokens_seen": 278272,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7224532224532224,
|
||
|
|
"grad_norm": 31.974199295043945,
|
||
|
|
"learning_rate": 4.970192271708416e-06,
|
||
|
|
"loss": 0.1867,
|
||
|
|
"num_input_tokens_seen": 280384,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7276507276507277,
|
||
|
|
"grad_norm": 16.395275115966797,
|
||
|
|
"learning_rate": 4.968779376813849e-06,
|
||
|
|
"loss": 0.3333,
|
||
|
|
"num_input_tokens_seen": 282368,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7328482328482329,
|
||
|
|
"grad_norm": 12.498151779174805,
|
||
|
|
"learning_rate": 4.967333977202469e-06,
|
||
|
|
"loss": 0.1327,
|
||
|
|
"num_input_tokens_seen": 284416,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.738045738045738,
|
||
|
|
"grad_norm": 70.73739624023438,
|
||
|
|
"learning_rate": 4.965856091904855e-06,
|
||
|
|
"loss": 0.2235,
|
||
|
|
"num_input_tokens_seen": 286464,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7432432432432432,
|
||
|
|
"grad_norm": 11.769681930541992,
|
||
|
|
"learning_rate": 4.964345740379307e-06,
|
||
|
|
"loss": 0.3413,
|
||
|
|
"num_input_tokens_seen": 288448,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7484407484407485,
|
||
|
|
"grad_norm": 9.002899169921875,
|
||
|
|
"learning_rate": 4.962802942511582e-06,
|
||
|
|
"loss": 0.1906,
|
||
|
|
"num_input_tokens_seen": 290496,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7515592515592515,
|
||
|
|
"eval_loss": 0.20943090319633484,
|
||
|
|
"eval_runtime": 1.0284,
|
||
|
|
"eval_samples_per_second": 832.355,
|
||
|
|
"eval_steps_per_second": 104.044,
|
||
|
|
"num_input_tokens_seen": 291712,
|
||
|
|
"step": 723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7536382536382537,
|
||
|
|
"grad_norm": 30.15701675415039,
|
||
|
|
"learning_rate": 4.961227718614634e-06,
|
||
|
|
"loss": 0.2576,
|
||
|
|
"num_input_tokens_seen": 292480,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7588357588357588,
|
||
|
|
"grad_norm": 16.669214248657227,
|
||
|
|
"learning_rate": 4.959620089428354e-06,
|
||
|
|
"loss": 0.2352,
|
||
|
|
"num_input_tokens_seen": 294464,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.764033264033264,
|
||
|
|
"grad_norm": 24.175790786743164,
|
||
|
|
"learning_rate": 4.957980076119285e-06,
|
||
|
|
"loss": 0.2617,
|
||
|
|
"num_input_tokens_seen": 296448,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7692307692307693,
|
||
|
|
"grad_norm": 14.268982887268066,
|
||
|
|
"learning_rate": 4.956307700280354e-06,
|
||
|
|
"loss": 0.2079,
|
||
|
|
"num_input_tokens_seen": 298432,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7744282744282744,
|
||
|
|
"grad_norm": 6.003777980804443,
|
||
|
|
"learning_rate": 4.954602983930581e-06,
|
||
|
|
"loss": 0.2712,
|
||
|
|
"num_input_tokens_seen": 300480,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7796257796257796,
|
||
|
|
"grad_norm": 9.822731018066406,
|
||
|
|
"learning_rate": 4.95286594951479e-06,
|
||
|
|
"loss": 0.2211,
|
||
|
|
"num_input_tokens_seen": 302400,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7848232848232848,
|
||
|
|
"grad_norm": 13.10158920288086,
|
||
|
|
"learning_rate": 4.951096619903317e-06,
|
||
|
|
"loss": 0.2161,
|
||
|
|
"num_input_tokens_seen": 304320,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7900207900207901,
|
||
|
|
"grad_norm": 6.7825775146484375,
|
||
|
|
"learning_rate": 4.949295018391707e-06,
|
||
|
|
"loss": 0.1828,
|
||
|
|
"num_input_tokens_seen": 306240,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7952182952182952,
|
||
|
|
"grad_norm": 16.962614059448242,
|
||
|
|
"learning_rate": 4.9474611687004025e-06,
|
||
|
|
"loss": 0.2155,
|
||
|
|
"num_input_tokens_seen": 308032,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8004158004158004,
|
||
|
|
"grad_norm": 11.7578125,
|
||
|
|
"learning_rate": 4.945595094974442e-06,
|
||
|
|
"loss": 0.2009,
|
||
|
|
"num_input_tokens_seen": 309952,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8056133056133056,
|
||
|
|
"grad_norm": 11.759556770324707,
|
||
|
|
"learning_rate": 4.94369682178313e-06,
|
||
|
|
"loss": 0.1813,
|
||
|
|
"num_input_tokens_seen": 311936,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8108108108108109,
|
||
|
|
"grad_norm": 10.483379364013672,
|
||
|
|
"learning_rate": 4.941766374119724e-06,
|
||
|
|
"loss": 0.1603,
|
||
|
|
"num_input_tokens_seen": 313920,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.816008316008316,
|
||
|
|
"grad_norm": 33.7660026550293,
|
||
|
|
"learning_rate": 4.939803777401096e-06,
|
||
|
|
"loss": 0.2613,
|
||
|
|
"num_input_tokens_seen": 315968,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8212058212058212,
|
||
|
|
"grad_norm": 12.025551795959473,
|
||
|
|
"learning_rate": 4.937809057467404e-06,
|
||
|
|
"loss": 0.2641,
|
||
|
|
"num_input_tokens_seen": 317952,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8264033264033264,
|
||
|
|
"grad_norm": 14.894133567810059,
|
||
|
|
"learning_rate": 4.935782240581753e-06,
|
||
|
|
"loss": 0.1934,
|
||
|
|
"num_input_tokens_seen": 319872,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8316008316008316,
|
||
|
|
"grad_norm": 7.2731733322143555,
|
||
|
|
"learning_rate": 4.933723353429842e-06,
|
||
|
|
"loss": 0.2498,
|
||
|
|
"num_input_tokens_seen": 321856,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8367983367983368,
|
||
|
|
"grad_norm": 7.900448799133301,
|
||
|
|
"learning_rate": 4.931632423119621e-06,
|
||
|
|
"loss": 0.1671,
|
||
|
|
"num_input_tokens_seen": 323968,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.841995841995842,
|
||
|
|
"grad_norm": 13.05286693572998,
|
||
|
|
"learning_rate": 4.929509477180929e-06,
|
||
|
|
"loss": 0.2092,
|
||
|
|
"num_input_tokens_seen": 325952,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8471933471933472,
|
||
|
|
"grad_norm": 0.7964070439338684,
|
||
|
|
"learning_rate": 4.927354543565131e-06,
|
||
|
|
"loss": 0.0581,
|
||
|
|
"num_input_tokens_seen": 328000,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8523908523908524,
|
||
|
|
"grad_norm": 68.79032135009766,
|
||
|
|
"learning_rate": 4.925167650644752e-06,
|
||
|
|
"loss": 0.1592,
|
||
|
|
"num_input_tokens_seen": 329984,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8575883575883576,
|
||
|
|
"grad_norm": 15.014650344848633,
|
||
|
|
"learning_rate": 4.922948827213107e-06,
|
||
|
|
"loss": 0.4462,
|
||
|
|
"num_input_tokens_seen": 331904,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8627858627858628,
|
||
|
|
"grad_norm": 11.443034172058105,
|
||
|
|
"learning_rate": 4.920698102483913e-06,
|
||
|
|
"loss": 0.4518,
|
||
|
|
"num_input_tokens_seen": 333888,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.867983367983368,
|
||
|
|
"grad_norm": 61.09624481201172,
|
||
|
|
"learning_rate": 4.9184155060909115e-06,
|
||
|
|
"loss": 0.2671,
|
||
|
|
"num_input_tokens_seen": 335872,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8731808731808732,
|
||
|
|
"grad_norm": 69.81952667236328,
|
||
|
|
"learning_rate": 4.916101068087477e-06,
|
||
|
|
"loss": 0.3681,
|
||
|
|
"num_input_tokens_seen": 337856,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8783783783783784,
|
||
|
|
"grad_norm": 26.779844284057617,
|
||
|
|
"learning_rate": 4.9137548189462185e-06,
|
||
|
|
"loss": 0.2011,
|
||
|
|
"num_input_tokens_seen": 339776,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8835758835758836,
|
||
|
|
"grad_norm": 14.02595043182373,
|
||
|
|
"learning_rate": 4.911376789558584e-06,
|
||
|
|
"loss": 0.1852,
|
||
|
|
"num_input_tokens_seen": 341760,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8887733887733887,
|
||
|
|
"grad_norm": 13.688316345214844,
|
||
|
|
"learning_rate": 4.908967011234446e-06,
|
||
|
|
"loss": 0.3553,
|
||
|
|
"num_input_tokens_seen": 343680,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.893970893970894,
|
||
|
|
"grad_norm": 12.0100679397583,
|
||
|
|
"learning_rate": 4.9065255157016955e-06,
|
||
|
|
"loss": 0.2092,
|
||
|
|
"num_input_tokens_seen": 345600,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8991683991683992,
|
||
|
|
"grad_norm": 13.758508682250977,
|
||
|
|
"learning_rate": 4.904052335105822e-06,
|
||
|
|
"loss": 0.2165,
|
||
|
|
"num_input_tokens_seen": 347520,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9043659043659044,
|
||
|
|
"grad_norm": 21.069822311401367,
|
||
|
|
"learning_rate": 4.90154750200949e-06,
|
||
|
|
"loss": 0.1773,
|
||
|
|
"num_input_tokens_seen": 349568,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9095634095634095,
|
||
|
|
"grad_norm": 12.611119270324707,
|
||
|
|
"learning_rate": 4.899011049392111e-06,
|
||
|
|
"loss": 0.1146,
|
||
|
|
"num_input_tokens_seen": 351552,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9147609147609148,
|
||
|
|
"grad_norm": 10.34527587890625,
|
||
|
|
"learning_rate": 4.896443010649408e-06,
|
||
|
|
"loss": 0.1213,
|
||
|
|
"num_input_tokens_seen": 353472,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.91995841995842,
|
||
|
|
"grad_norm": 7.383549690246582,
|
||
|
|
"learning_rate": 4.893843419592977e-06,
|
||
|
|
"loss": 0.123,
|
||
|
|
"num_input_tokens_seen": 355392,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9251559251559252,
|
||
|
|
"grad_norm": 36.267250061035156,
|
||
|
|
"learning_rate": 4.891212310449845e-06,
|
||
|
|
"loss": 0.1794,
|
||
|
|
"num_input_tokens_seen": 357440,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9303534303534303,
|
||
|
|
"grad_norm": 21.83590316772461,
|
||
|
|
"learning_rate": 4.88854971786201e-06,
|
||
|
|
"loss": 0.1822,
|
||
|
|
"num_input_tokens_seen": 359488,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9355509355509356,
|
||
|
|
"grad_norm": 74.82781982421875,
|
||
|
|
"learning_rate": 4.885855676885995e-06,
|
||
|
|
"loss": 0.282,
|
||
|
|
"num_input_tokens_seen": 361408,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9407484407484408,
|
||
|
|
"grad_norm": 27.140975952148438,
|
||
|
|
"learning_rate": 4.88313022299238e-06,
|
||
|
|
"loss": 0.1931,
|
||
|
|
"num_input_tokens_seen": 363392,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9459459459459459,
|
||
|
|
"grad_norm": 45.93625259399414,
|
||
|
|
"learning_rate": 4.880373392065339e-06,
|
||
|
|
"loss": 0.318,
|
||
|
|
"num_input_tokens_seen": 365440,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9511434511434511,
|
||
|
|
"grad_norm": 22.00739860534668,
|
||
|
|
"learning_rate": 4.877585220402167e-06,
|
||
|
|
"loss": 0.1793,
|
||
|
|
"num_input_tokens_seen": 367616,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9563409563409564,
|
||
|
|
"grad_norm": 20.41562843322754,
|
||
|
|
"learning_rate": 4.874765744712796e-06,
|
||
|
|
"loss": 0.1164,
|
||
|
|
"num_input_tokens_seen": 369600,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9615384615384616,
|
||
|
|
"grad_norm": 30.4830379486084,
|
||
|
|
"learning_rate": 4.8719150021193206e-06,
|
||
|
|
"loss": 0.2515,
|
||
|
|
"num_input_tokens_seen": 371520,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9667359667359667,
|
||
|
|
"grad_norm": 19.38152503967285,
|
||
|
|
"learning_rate": 4.869033030155504e-06,
|
||
|
|
"loss": 0.3492,
|
||
|
|
"num_input_tokens_seen": 373568,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9719334719334719,
|
||
|
|
"grad_norm": 14.191635131835938,
|
||
|
|
"learning_rate": 4.866119866766286e-06,
|
||
|
|
"loss": 0.1902,
|
||
|
|
"num_input_tokens_seen": 375488,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9771309771309772,
|
||
|
|
"grad_norm": 24.351335525512695,
|
||
|
|
"learning_rate": 4.86317555030728e-06,
|
||
|
|
"loss": 0.2238,
|
||
|
|
"num_input_tokens_seen": 377728,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9823284823284824,
|
||
|
|
"grad_norm": 7.04970121383667,
|
||
|
|
"learning_rate": 4.860200119544273e-06,
|
||
|
|
"loss": 0.11,
|
||
|
|
"num_input_tokens_seen": 379840,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9875259875259875,
|
||
|
|
"grad_norm": 40.61119079589844,
|
||
|
|
"learning_rate": 4.857193613652711e-06,
|
||
|
|
"loss": 0.2154,
|
||
|
|
"num_input_tokens_seen": 381760,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9927234927234927,
|
||
|
|
"grad_norm": 18.396310806274414,
|
||
|
|
"learning_rate": 4.854156072217185e-06,
|
||
|
|
"loss": 0.1666,
|
||
|
|
"num_input_tokens_seen": 383808,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.997920997920998,
|
||
|
|
"grad_norm": 8.14262866973877,
|
||
|
|
"learning_rate": 4.851087535230911e-06,
|
||
|
|
"loss": 0.2397,
|
||
|
|
"num_input_tokens_seen": 385856,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.002079002079002,
|
||
|
|
"eval_loss": 0.17627178132534027,
|
||
|
|
"eval_runtime": 1.0408,
|
||
|
|
"eval_samples_per_second": 822.411,
|
||
|
|
"eval_steps_per_second": 102.801,
|
||
|
|
"num_input_tokens_seen": 387464,
|
||
|
|
"step": 964
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.003118503118503,
|
||
|
|
"grad_norm": 17.4815731048584,
|
||
|
|
"learning_rate": 4.8479880430952e-06,
|
||
|
|
"loss": 0.176,
|
||
|
|
"num_input_tokens_seen": 387848,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0083160083160083,
|
||
|
|
"grad_norm": 3.335278272628784,
|
||
|
|
"learning_rate": 4.844857636618928e-06,
|
||
|
|
"loss": 0.0833,
|
||
|
|
"num_input_tokens_seen": 389640,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0135135135135136,
|
||
|
|
"grad_norm": 37.46596908569336,
|
||
|
|
"learning_rate": 4.841696357018003e-06,
|
||
|
|
"loss": 0.1134,
|
||
|
|
"num_input_tokens_seen": 391624,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0187110187110187,
|
||
|
|
"grad_norm": 12.097825050354004,
|
||
|
|
"learning_rate": 4.838504245914812e-06,
|
||
|
|
"loss": 0.0776,
|
||
|
|
"num_input_tokens_seen": 393672,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.023908523908524,
|
||
|
|
"grad_norm": 0.35555675625801086,
|
||
|
|
"learning_rate": 4.835281345337684e-06,
|
||
|
|
"loss": 0.0266,
|
||
|
|
"num_input_tokens_seen": 395784,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0291060291060292,
|
||
|
|
"grad_norm": 99.47309112548828,
|
||
|
|
"learning_rate": 4.832027697720329e-06,
|
||
|
|
"loss": 0.2075,
|
||
|
|
"num_input_tokens_seen": 397768,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0343035343035343,
|
||
|
|
"grad_norm": 65.3523178100586,
|
||
|
|
"learning_rate": 4.828743345901285e-06,
|
||
|
|
"loss": 0.4063,
|
||
|
|
"num_input_tokens_seen": 399816,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0395010395010396,
|
||
|
|
"grad_norm": 1.6000525951385498,
|
||
|
|
"learning_rate": 4.825428333123346e-06,
|
||
|
|
"loss": 0.1017,
|
||
|
|
"num_input_tokens_seen": 401928,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0446985446985446,
|
||
|
|
"grad_norm": 5.073741436004639,
|
||
|
|
"learning_rate": 4.822082703033003e-06,
|
||
|
|
"loss": 0.0338,
|
||
|
|
"num_input_tokens_seen": 403912,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.04989604989605,
|
||
|
|
"grad_norm": 31.460180282592773,
|
||
|
|
"learning_rate": 4.818706499679862e-06,
|
||
|
|
"loss": 0.1392,
|
||
|
|
"num_input_tokens_seen": 405832,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0550935550935552,
|
||
|
|
"grad_norm": 13.538461685180664,
|
||
|
|
"learning_rate": 4.815299767516065e-06,
|
||
|
|
"loss": 0.1168,
|
||
|
|
"num_input_tokens_seen": 407880,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0602910602910602,
|
||
|
|
"grad_norm": 48.505496978759766,
|
||
|
|
"learning_rate": 4.811862551395707e-06,
|
||
|
|
"loss": 0.1001,
|
||
|
|
"num_input_tokens_seen": 410120,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0654885654885655,
|
||
|
|
"grad_norm": 25.246902465820312,
|
||
|
|
"learning_rate": 4.808394896574246e-06,
|
||
|
|
"loss": 0.0944,
|
||
|
|
"num_input_tokens_seen": 412168,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0706860706860706,
|
||
|
|
"grad_norm": 40.07487487792969,
|
||
|
|
"learning_rate": 4.8048968487079e-06,
|
||
|
|
"loss": 0.1433,
|
||
|
|
"num_input_tokens_seen": 414472,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0758835758835759,
|
||
|
|
"grad_norm": 77.1120834350586,
|
||
|
|
"learning_rate": 4.801368453853057e-06,
|
||
|
|
"loss": 0.3131,
|
||
|
|
"num_input_tokens_seen": 416520,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0810810810810811,
|
||
|
|
"grad_norm": 22.368253707885742,
|
||
|
|
"learning_rate": 4.79780975846566e-06,
|
||
|
|
"loss": 0.171,
|
||
|
|
"num_input_tokens_seen": 418568,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0862785862785862,
|
||
|
|
"grad_norm": 13.560877799987793,
|
||
|
|
"learning_rate": 4.7942208094006e-06,
|
||
|
|
"loss": 0.1287,
|
||
|
|
"num_input_tokens_seen": 420488,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0914760914760915,
|
||
|
|
"grad_norm": 31.480270385742188,
|
||
|
|
"learning_rate": 4.790601653911094e-06,
|
||
|
|
"loss": 0.1098,
|
||
|
|
"num_input_tokens_seen": 422472,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0966735966735968,
|
||
|
|
"grad_norm": 34.814151763916016,
|
||
|
|
"learning_rate": 4.786952339648071e-06,
|
||
|
|
"loss": 0.297,
|
||
|
|
"num_input_tokens_seen": 424456,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1018711018711018,
|
||
|
|
"grad_norm": 38.07038116455078,
|
||
|
|
"learning_rate": 4.783272914659535e-06,
|
||
|
|
"loss": 0.3308,
|
||
|
|
"num_input_tokens_seen": 426568,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.107068607068607,
|
||
|
|
"grad_norm": 114.09796905517578,
|
||
|
|
"learning_rate": 4.77956342738994e-06,
|
||
|
|
"loss": 0.1061,
|
||
|
|
"num_input_tokens_seen": 428552,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1122661122661124,
|
||
|
|
"grad_norm": 26.920055389404297,
|
||
|
|
"learning_rate": 4.775823926679549e-06,
|
||
|
|
"loss": 0.0996,
|
||
|
|
"num_input_tokens_seen": 430472,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1174636174636174,
|
||
|
|
"grad_norm": 20.18915557861328,
|
||
|
|
"learning_rate": 4.77205446176379e-06,
|
||
|
|
"loss": 0.1315,
|
||
|
|
"num_input_tokens_seen": 432328,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1226611226611227,
|
||
|
|
"grad_norm": 146.2836151123047,
|
||
|
|
"learning_rate": 4.768255082272612e-06,
|
||
|
|
"loss": 0.2841,
|
||
|
|
"num_input_tokens_seen": 434440,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1278586278586278,
|
||
|
|
"grad_norm": 140.84530639648438,
|
||
|
|
"learning_rate": 4.764425838229823e-06,
|
||
|
|
"loss": 0.0783,
|
||
|
|
"num_input_tokens_seen": 436488,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.133056133056133,
|
||
|
|
"grad_norm": 28.586977005004883,
|
||
|
|
"learning_rate": 4.760566780052445e-06,
|
||
|
|
"loss": 0.346,
|
||
|
|
"num_input_tokens_seen": 438472,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1382536382536383,
|
||
|
|
"grad_norm": 42.30632400512695,
|
||
|
|
"learning_rate": 4.756677958550035e-06,
|
||
|
|
"loss": 0.4155,
|
||
|
|
"num_input_tokens_seen": 440456,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1434511434511434,
|
||
|
|
"grad_norm": 41.02433395385742,
|
||
|
|
"learning_rate": 4.752759424924026e-06,
|
||
|
|
"loss": 0.1236,
|
||
|
|
"num_input_tokens_seen": 442440,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1486486486486487,
|
||
|
|
"grad_norm": 22.878646850585938,
|
||
|
|
"learning_rate": 4.7488112307670515e-06,
|
||
|
|
"loss": 0.099,
|
||
|
|
"num_input_tokens_seen": 444424,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1538461538461537,
|
||
|
|
"grad_norm": 18.822031021118164,
|
||
|
|
"learning_rate": 4.7448334280622624e-06,
|
||
|
|
"loss": 0.1891,
|
||
|
|
"num_input_tokens_seen": 446280,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.159043659043659,
|
||
|
|
"grad_norm": 26.573184967041016,
|
||
|
|
"learning_rate": 4.740826069182645e-06,
|
||
|
|
"loss": 0.1802,
|
||
|
|
"num_input_tokens_seen": 448264,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1642411642411643,
|
||
|
|
"grad_norm": 14.406500816345215,
|
||
|
|
"learning_rate": 4.736789206890332e-06,
|
||
|
|
"loss": 0.2325,
|
||
|
|
"num_input_tokens_seen": 450376,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1694386694386694,
|
||
|
|
"grad_norm": 2.926020860671997,
|
||
|
|
"learning_rate": 4.732722894335909e-06,
|
||
|
|
"loss": 0.1142,
|
||
|
|
"num_input_tokens_seen": 452552,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1746361746361746,
|
||
|
|
"grad_norm": 13.685401916503906,
|
||
|
|
"learning_rate": 4.728627185057711e-06,
|
||
|
|
"loss": 0.1432,
|
||
|
|
"num_input_tokens_seen": 454600,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.17983367983368,
|
||
|
|
"grad_norm": 37.38111877441406,
|
||
|
|
"learning_rate": 4.724502132981119e-06,
|
||
|
|
"loss": 0.1061,
|
||
|
|
"num_input_tokens_seen": 456648,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.185031185031185,
|
||
|
|
"grad_norm": 28.67649269104004,
|
||
|
|
"learning_rate": 4.720347792417851e-06,
|
||
|
|
"loss": 0.078,
|
||
|
|
"num_input_tokens_seen": 458632,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1902286902286903,
|
||
|
|
"grad_norm": 83.61138153076172,
|
||
|
|
"learning_rate": 4.716164218065246e-06,
|
||
|
|
"loss": 0.1068,
|
||
|
|
"num_input_tokens_seen": 460680,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1954261954261955,
|
||
|
|
"grad_norm": 14.42905330657959,
|
||
|
|
"learning_rate": 4.711951465005548e-06,
|
||
|
|
"loss": 0.2177,
|
||
|
|
"num_input_tokens_seen": 462728,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2006237006237006,
|
||
|
|
"grad_norm": 9.12094783782959,
|
||
|
|
"learning_rate": 4.707709588705169e-06,
|
||
|
|
"loss": 0.058,
|
||
|
|
"num_input_tokens_seen": 464776,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2058212058212059,
|
||
|
|
"grad_norm": 33.16276550292969,
|
||
|
|
"learning_rate": 4.7034386450139735e-06,
|
||
|
|
"loss": 0.3544,
|
||
|
|
"num_input_tokens_seen": 466696,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.211018711018711,
|
||
|
|
"grad_norm": 29.38105010986328,
|
||
|
|
"learning_rate": 4.699138690164533e-06,
|
||
|
|
"loss": 0.1744,
|
||
|
|
"num_input_tokens_seen": 468616,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2162162162162162,
|
||
|
|
"grad_norm": 41.46559143066406,
|
||
|
|
"learning_rate": 4.694809780771391e-06,
|
||
|
|
"loss": 0.1842,
|
||
|
|
"num_input_tokens_seen": 470728,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2214137214137215,
|
||
|
|
"grad_norm": 15.253453254699707,
|
||
|
|
"learning_rate": 4.690451973830314e-06,
|
||
|
|
"loss": 0.1067,
|
||
|
|
"num_input_tokens_seen": 472776,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2266112266112266,
|
||
|
|
"grad_norm": 32.71086120605469,
|
||
|
|
"learning_rate": 4.6860653267175425e-06,
|
||
|
|
"loss": 0.177,
|
||
|
|
"num_input_tokens_seen": 474824,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2318087318087318,
|
||
|
|
"grad_norm": 14.900812149047852,
|
||
|
|
"learning_rate": 4.681649897189036e-06,
|
||
|
|
"loss": 0.2562,
|
||
|
|
"num_input_tokens_seen": 476744,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.237006237006237,
|
||
|
|
"grad_norm": 11.960613250732422,
|
||
|
|
"learning_rate": 4.677205743379714e-06,
|
||
|
|
"loss": 0.053,
|
||
|
|
"num_input_tokens_seen": 478856,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2422037422037422,
|
||
|
|
"grad_norm": 33.26080322265625,
|
||
|
|
"learning_rate": 4.672732923802685e-06,
|
||
|
|
"loss": 0.1686,
|
||
|
|
"num_input_tokens_seen": 480776,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2474012474012475,
|
||
|
|
"grad_norm": 13.924117088317871,
|
||
|
|
"learning_rate": 4.6682314973484844e-06,
|
||
|
|
"loss": 0.0292,
|
||
|
|
"num_input_tokens_seen": 482952,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2525987525987525,
|
||
|
|
"grad_norm": 18.63437271118164,
|
||
|
|
"learning_rate": 4.663701523284291e-06,
|
||
|
|
"loss": 0.0622,
|
||
|
|
"num_input_tokens_seen": 485192,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2525987525987525,
|
||
|
|
"eval_loss": 0.26757940649986267,
|
||
|
|
"eval_runtime": 1.0561,
|
||
|
|
"eval_samples_per_second": 810.538,
|
||
|
|
"eval_steps_per_second": 101.317,
|
||
|
|
"num_input_tokens_seen": 485192,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2577962577962578,
|
||
|
|
"grad_norm": 0.11255653202533722,
|
||
|
|
"learning_rate": 4.659143061253152e-06,
|
||
|
|
"loss": 0.1299,
|
||
|
|
"num_input_tokens_seen": 487112,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.262993762993763,
|
||
|
|
"grad_norm": 38.90354537963867,
|
||
|
|
"learning_rate": 4.654556171273196e-06,
|
||
|
|
"loss": 0.2685,
|
||
|
|
"num_input_tokens_seen": 489160,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2681912681912682,
|
||
|
|
"grad_norm": 0.6540949940681458,
|
||
|
|
"learning_rate": 4.649940913736841e-06,
|
||
|
|
"loss": 0.2017,
|
||
|
|
"num_input_tokens_seen": 491080,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2733887733887734,
|
||
|
|
"grad_norm": 0.8767725825309753,
|
||
|
|
"learning_rate": 4.645297349410005e-06,
|
||
|
|
"loss": 0.0607,
|
||
|
|
"num_input_tokens_seen": 493064,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2785862785862787,
|
||
|
|
"grad_norm": 1.3416281938552856,
|
||
|
|
"learning_rate": 4.640625539431298e-06,
|
||
|
|
"loss": 0.1537,
|
||
|
|
"num_input_tokens_seen": 494984,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2837837837837838,
|
||
|
|
"grad_norm": 11.072347640991211,
|
||
|
|
"learning_rate": 4.635925545311224e-06,
|
||
|
|
"loss": 0.2946,
|
||
|
|
"num_input_tokens_seen": 496968,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.288981288981289,
|
||
|
|
"grad_norm": 5.309337615966797,
|
||
|
|
"learning_rate": 4.631197428931365e-06,
|
||
|
|
"loss": 0.0799,
|
||
|
|
"num_input_tokens_seen": 498824,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2941787941787941,
|
||
|
|
"grad_norm": 42.28898239135742,
|
||
|
|
"learning_rate": 4.626441252543572e-06,
|
||
|
|
"loss": 0.0804,
|
||
|
|
"num_input_tokens_seen": 500808,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2993762993762994,
|
||
|
|
"grad_norm": 99.69770812988281,
|
||
|
|
"learning_rate": 4.621657078769143e-06,
|
||
|
|
"loss": 0.251,
|
||
|
|
"num_input_tokens_seen": 502856,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3045738045738045,
|
||
|
|
"grad_norm": 48.22378921508789,
|
||
|
|
"learning_rate": 4.616844970597996e-06,
|
||
|
|
"loss": 0.0856,
|
||
|
|
"num_input_tokens_seen": 504712,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3097713097713097,
|
||
|
|
"grad_norm": 22.802143096923828,
|
||
|
|
"learning_rate": 4.612004991387843e-06,
|
||
|
|
"loss": 0.3719,
|
||
|
|
"num_input_tokens_seen": 506696,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.314968814968815,
|
||
|
|
"grad_norm": 20.26570701599121,
|
||
|
|
"learning_rate": 4.607137204863356e-06,
|
||
|
|
"loss": 0.0936,
|
||
|
|
"num_input_tokens_seen": 508680,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.32016632016632,
|
||
|
|
"grad_norm": 4.629741191864014,
|
||
|
|
"learning_rate": 4.602241675115326e-06,
|
||
|
|
"loss": 0.1072,
|
||
|
|
"num_input_tokens_seen": 510728,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3253638253638254,
|
||
|
|
"grad_norm": 2.296597957611084,
|
||
|
|
"learning_rate": 4.597318466599819e-06,
|
||
|
|
"loss": 0.0841,
|
||
|
|
"num_input_tokens_seen": 512712,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3305613305613306,
|
||
|
|
"grad_norm": 0.3281061351299286,
|
||
|
|
"learning_rate": 4.592367644137329e-06,
|
||
|
|
"loss": 0.1067,
|
||
|
|
"num_input_tokens_seen": 514696,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3357588357588357,
|
||
|
|
"grad_norm": 28.257986068725586,
|
||
|
|
"learning_rate": 4.587389272911923e-06,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"num_input_tokens_seen": 516808,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.340956340956341,
|
||
|
|
"grad_norm": 49.04169464111328,
|
||
|
|
"learning_rate": 4.582383418470386e-06,
|
||
|
|
"loss": 0.2118,
|
||
|
|
"num_input_tokens_seen": 518792,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3461538461538463,
|
||
|
|
"grad_norm": 75.7163314819336,
|
||
|
|
"learning_rate": 4.5773501467213525e-06,
|
||
|
|
"loss": 0.1325,
|
||
|
|
"num_input_tokens_seen": 520840,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3513513513513513,
|
||
|
|
"grad_norm": 0.3487839698791504,
|
||
|
|
"learning_rate": 4.572289523934444e-06,
|
||
|
|
"loss": 0.0526,
|
||
|
|
"num_input_tokens_seen": 522760,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3565488565488566,
|
||
|
|
"grad_norm": 21.736961364746094,
|
||
|
|
"learning_rate": 4.567201616739393e-06,
|
||
|
|
"loss": 0.2152,
|
||
|
|
"num_input_tokens_seen": 524872,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3617463617463619,
|
||
|
|
"grad_norm": 66.38067626953125,
|
||
|
|
"learning_rate": 4.562086492125167e-06,
|
||
|
|
"loss": 0.1978,
|
||
|
|
"num_input_tokens_seen": 526920,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.366943866943867,
|
||
|
|
"grad_norm": 11.493204116821289,
|
||
|
|
"learning_rate": 4.5569442174390885e-06,
|
||
|
|
"loss": 0.1374,
|
||
|
|
"num_input_tokens_seen": 528968,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3721413721413722,
|
||
|
|
"grad_norm": 0.7820735573768616,
|
||
|
|
"learning_rate": 4.551774860385944e-06,
|
||
|
|
"loss": 0.0818,
|
||
|
|
"num_input_tokens_seen": 530888,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3773388773388773,
|
||
|
|
"grad_norm": 0.8318536281585693,
|
||
|
|
"learning_rate": 4.546578489027095e-06,
|
||
|
|
"loss": 0.1644,
|
||
|
|
"num_input_tokens_seen": 532872,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3825363825363826,
|
||
|
|
"grad_norm": 10.547067642211914,
|
||
|
|
"learning_rate": 4.541355171779582e-06,
|
||
|
|
"loss": 0.118,
|
||
|
|
"num_input_tokens_seen": 534920,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3877338877338876,
|
||
|
|
"grad_norm": 1.9730658531188965,
|
||
|
|
"learning_rate": 4.536104977415225e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"num_input_tokens_seen": 536840,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.392931392931393,
|
||
|
|
"grad_norm": 21.502456665039062,
|
||
|
|
"learning_rate": 4.530827975059715e-06,
|
||
|
|
"loss": 0.3705,
|
||
|
|
"num_input_tokens_seen": 538760,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3981288981288982,
|
||
|
|
"grad_norm": 0.3489514887332916,
|
||
|
|
"learning_rate": 4.525524234191705e-06,
|
||
|
|
"loss": 0.2364,
|
||
|
|
"num_input_tokens_seen": 540680,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4033264033264032,
|
||
|
|
"grad_norm": 44.12948989868164,
|
||
|
|
"learning_rate": 4.520193824641898e-06,
|
||
|
|
"loss": 0.1405,
|
||
|
|
"num_input_tokens_seen": 542664,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4085239085239085,
|
||
|
|
"grad_norm": 91.0547103881836,
|
||
|
|
"learning_rate": 4.51483681659212e-06,
|
||
|
|
"loss": 0.1596,
|
||
|
|
"num_input_tokens_seen": 544712,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4137214137214138,
|
||
|
|
"grad_norm": 16.955787658691406,
|
||
|
|
"learning_rate": 4.5094532805744075e-06,
|
||
|
|
"loss": 0.2662,
|
||
|
|
"num_input_tokens_seen": 546824,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4189189189189189,
|
||
|
|
"grad_norm": 24.00728416442871,
|
||
|
|
"learning_rate": 4.504043287470068e-06,
|
||
|
|
"loss": 0.0791,
|
||
|
|
"num_input_tokens_seen": 548936,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4241164241164241,
|
||
|
|
"grad_norm": 1.3119056224822998,
|
||
|
|
"learning_rate": 4.498606908508754e-06,
|
||
|
|
"loss": 0.1218,
|
||
|
|
"num_input_tokens_seen": 550920,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4293139293139294,
|
||
|
|
"grad_norm": 0.5111730694770813,
|
||
|
|
"learning_rate": 4.493144215267519e-06,
|
||
|
|
"loss": 0.0307,
|
||
|
|
"num_input_tokens_seen": 552904,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4345114345114345,
|
||
|
|
"grad_norm": 5.568216800689697,
|
||
|
|
"learning_rate": 4.4876552796698814e-06,
|
||
|
|
"loss": 0.1616,
|
||
|
|
"num_input_tokens_seen": 554824,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4397089397089398,
|
||
|
|
"grad_norm": 21.439517974853516,
|
||
|
|
"learning_rate": 4.482140173984875e-06,
|
||
|
|
"loss": 0.214,
|
||
|
|
"num_input_tokens_seen": 556872,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.444906444906445,
|
||
|
|
"grad_norm": 4.337334156036377,
|
||
|
|
"learning_rate": 4.476598970826093e-06,
|
||
|
|
"loss": 0.1453,
|
||
|
|
"num_input_tokens_seen": 558984,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.45010395010395,
|
||
|
|
"grad_norm": 15.975934982299805,
|
||
|
|
"learning_rate": 4.471031743150744e-06,
|
||
|
|
"loss": 0.2061,
|
||
|
|
"num_input_tokens_seen": 560968,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4553014553014554,
|
||
|
|
"grad_norm": 65.18135070800781,
|
||
|
|
"learning_rate": 4.465438564258673e-06,
|
||
|
|
"loss": 0.2358,
|
||
|
|
"num_input_tokens_seen": 562952,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4604989604989604,
|
||
|
|
"grad_norm": 0.4641796052455902,
|
||
|
|
"learning_rate": 4.459819507791415e-06,
|
||
|
|
"loss": 0.0357,
|
||
|
|
"num_input_tokens_seen": 565064,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4656964656964657,
|
||
|
|
"grad_norm": 0.4620436728000641,
|
||
|
|
"learning_rate": 4.454174647731213e-06,
|
||
|
|
"loss": 0.1194,
|
||
|
|
"num_input_tokens_seen": 567112,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4708939708939708,
|
||
|
|
"grad_norm": 0.40077999234199524,
|
||
|
|
"learning_rate": 4.448504058400052e-06,
|
||
|
|
"loss": 0.2261,
|
||
|
|
"num_input_tokens_seen": 569160,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.476091476091476,
|
||
|
|
"grad_norm": 31.147504806518555,
|
||
|
|
"learning_rate": 4.4428078144586715e-06,
|
||
|
|
"loss": 0.1794,
|
||
|
|
"num_input_tokens_seen": 571336,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4812889812889813,
|
||
|
|
"grad_norm": 60.33259582519531,
|
||
|
|
"learning_rate": 4.437085990905591e-06,
|
||
|
|
"loss": 0.2622,
|
||
|
|
"num_input_tokens_seen": 573384,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4864864864864864,
|
||
|
|
"grad_norm": 42.84424591064453,
|
||
|
|
"learning_rate": 4.431338663076119e-06,
|
||
|
|
"loss": 0.1625,
|
||
|
|
"num_input_tokens_seen": 575304,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4916839916839917,
|
||
|
|
"grad_norm": 1.8637181520462036,
|
||
|
|
"learning_rate": 4.42556590664136e-06,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"num_input_tokens_seen": 577160,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.496881496881497,
|
||
|
|
"grad_norm": 34.14229965209961,
|
||
|
|
"learning_rate": 4.41976779760722e-06,
|
||
|
|
"loss": 0.11,
|
||
|
|
"num_input_tokens_seen": 579208,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.502079002079002,
|
||
|
|
"grad_norm": 48.55718994140625,
|
||
|
|
"learning_rate": 4.413944412313405e-06,
|
||
|
|
"loss": 0.0911,
|
||
|
|
"num_input_tokens_seen": 581256,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.503118503118503,
|
||
|
|
"eval_loss": 0.3145564794540405,
|
||
|
|
"eval_runtime": 1.1073,
|
||
|
|
"eval_samples_per_second": 773.075,
|
||
|
|
"eval_steps_per_second": 96.634,
|
||
|
|
"num_input_tokens_seen": 581704,
|
||
|
|
"step": 1446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5072765072765073,
|
||
|
|
"grad_norm": 0.2164192944765091,
|
||
|
|
"learning_rate": 4.408095827432416e-06,
|
||
|
|
"loss": 0.1191,
|
||
|
|
"num_input_tokens_seen": 583304,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5124740124740126,
|
||
|
|
"grad_norm": 50.20052719116211,
|
||
|
|
"learning_rate": 4.40222211996854e-06,
|
||
|
|
"loss": 0.3479,
|
||
|
|
"num_input_tokens_seen": 585224,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5176715176715176,
|
||
|
|
"grad_norm": 31.401309967041016,
|
||
|
|
"learning_rate": 4.396323367256836e-06,
|
||
|
|
"loss": 0.2617,
|
||
|
|
"num_input_tokens_seen": 587272,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5228690228690227,
|
||
|
|
"grad_norm": 32.85145568847656,
|
||
|
|
"learning_rate": 4.390399646962117e-06,
|
||
|
|
"loss": 0.1985,
|
||
|
|
"num_input_tokens_seen": 589320,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5280665280665282,
|
||
|
|
"grad_norm": 13.456771850585938,
|
||
|
|
"learning_rate": 4.384451037077924e-06,
|
||
|
|
"loss": 0.1369,
|
||
|
|
"num_input_tokens_seen": 591304,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5332640332640333,
|
||
|
|
"grad_norm": 1.2188056707382202,
|
||
|
|
"learning_rate": 4.378477615925506e-06,
|
||
|
|
"loss": 0.1433,
|
||
|
|
"num_input_tokens_seen": 593224,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5384615384615383,
|
||
|
|
"grad_norm": 28.065401077270508,
|
||
|
|
"learning_rate": 4.372479462152781e-06,
|
||
|
|
"loss": 0.1273,
|
||
|
|
"num_input_tokens_seen": 595336,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5436590436590436,
|
||
|
|
"grad_norm": 21.94362449645996,
|
||
|
|
"learning_rate": 4.366456654733308e-06,
|
||
|
|
"loss": 0.2715,
|
||
|
|
"num_input_tokens_seen": 597256,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5488565488565489,
|
||
|
|
"grad_norm": 11.613262176513672,
|
||
|
|
"learning_rate": 4.360409272965242e-06,
|
||
|
|
"loss": 0.1859,
|
||
|
|
"num_input_tokens_seen": 599304,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.554054054054054,
|
||
|
|
"grad_norm": 3.856565237045288,
|
||
|
|
"learning_rate": 4.354337396470291e-06,
|
||
|
|
"loss": 0.0745,
|
||
|
|
"num_input_tokens_seen": 601288,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5592515592515592,
|
||
|
|
"grad_norm": 52.13639831542969,
|
||
|
|
"learning_rate": 4.348241105192668e-06,
|
||
|
|
"loss": 0.1641,
|
||
|
|
"num_input_tokens_seen": 603272,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5644490644490645,
|
||
|
|
"grad_norm": 4.8234357833862305,
|
||
|
|
"learning_rate": 4.34212047939804e-06,
|
||
|
|
"loss": 0.1365,
|
||
|
|
"num_input_tokens_seen": 605256,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5696465696465696,
|
||
|
|
"grad_norm": 17.75581169128418,
|
||
|
|
"learning_rate": 4.335975599672469e-06,
|
||
|
|
"loss": 0.0868,
|
||
|
|
"num_input_tokens_seen": 607304,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5748440748440748,
|
||
|
|
"grad_norm": 6.615152835845947,
|
||
|
|
"learning_rate": 4.329806546921354e-06,
|
||
|
|
"loss": 0.1281,
|
||
|
|
"num_input_tokens_seen": 609224,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5800415800415801,
|
||
|
|
"grad_norm": 29.3903865814209,
|
||
|
|
"learning_rate": 4.3236134023683565e-06,
|
||
|
|
"loss": 0.0465,
|
||
|
|
"num_input_tokens_seen": 611336,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5852390852390852,
|
||
|
|
"grad_norm": 66.3274154663086,
|
||
|
|
"learning_rate": 4.3173962475543475e-06,
|
||
|
|
"loss": 0.1156,
|
||
|
|
"num_input_tokens_seen": 613320,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5904365904365905,
|
||
|
|
"grad_norm": 45.9361686706543,
|
||
|
|
"learning_rate": 4.311155164336318e-06,
|
||
|
|
"loss": 0.2405,
|
||
|
|
"num_input_tokens_seen": 615176,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5956340956340958,
|
||
|
|
"grad_norm": 15.004530906677246,
|
||
|
|
"learning_rate": 4.3048902348863116e-06,
|
||
|
|
"loss": 0.1673,
|
||
|
|
"num_input_tokens_seen": 617224,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6008316008316008,
|
||
|
|
"grad_norm": 46.72109603881836,
|
||
|
|
"learning_rate": 4.298601541690336e-06,
|
||
|
|
"loss": 0.1683,
|
||
|
|
"num_input_tokens_seen": 619208,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6060291060291059,
|
||
|
|
"grad_norm": 25.378122329711914,
|
||
|
|
"learning_rate": 4.292289167547281e-06,
|
||
|
|
"loss": 0.221,
|
||
|
|
"num_input_tokens_seen": 621192,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6112266112266114,
|
||
|
|
"grad_norm": 15.957310676574707,
|
||
|
|
"learning_rate": 4.285953195567827e-06,
|
||
|
|
"loss": 0.1458,
|
||
|
|
"num_input_tokens_seen": 623176,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6164241164241164,
|
||
|
|
"grad_norm": 20.942054748535156,
|
||
|
|
"learning_rate": 4.279593709173352e-06,
|
||
|
|
"loss": 0.246,
|
||
|
|
"num_input_tokens_seen": 625160,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6216216216216215,
|
||
|
|
"grad_norm": 0.6662601232528687,
|
||
|
|
"learning_rate": 4.27321079209483e-06,
|
||
|
|
"loss": 0.1381,
|
||
|
|
"num_input_tokens_seen": 627144,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6268191268191268,
|
||
|
|
"grad_norm": 24.978178024291992,
|
||
|
|
"learning_rate": 4.266804528371732e-06,
|
||
|
|
"loss": 0.1634,
|
||
|
|
"num_input_tokens_seen": 629192,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.632016632016632,
|
||
|
|
"grad_norm": 7.083024024963379,
|
||
|
|
"learning_rate": 4.260375002350917e-06,
|
||
|
|
"loss": 0.1174,
|
||
|
|
"num_input_tokens_seen": 631240,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.637214137214137,
|
||
|
|
"grad_norm": 20.857555389404297,
|
||
|
|
"learning_rate": 4.253922298685525e-06,
|
||
|
|
"loss": 0.2274,
|
||
|
|
"num_input_tokens_seen": 633224,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6424116424116424,
|
||
|
|
"grad_norm": 46.30590057373047,
|
||
|
|
"learning_rate": 4.2474465023338586e-06,
|
||
|
|
"loss": 0.1367,
|
||
|
|
"num_input_tokens_seen": 635208,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6476091476091477,
|
||
|
|
"grad_norm": 0.8250938057899475,
|
||
|
|
"learning_rate": 4.2409476985582645e-06,
|
||
|
|
"loss": 0.1048,
|
||
|
|
"num_input_tokens_seen": 637256,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6528066528066527,
|
||
|
|
"grad_norm": 2.617978572845459,
|
||
|
|
"learning_rate": 4.234425972924014e-06,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"num_input_tokens_seen": 639176,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.658004158004158,
|
||
|
|
"grad_norm": 0.5611851811408997,
|
||
|
|
"learning_rate": 4.227881411298175e-06,
|
||
|
|
"loss": 0.1551,
|
||
|
|
"num_input_tokens_seen": 641224,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6632016632016633,
|
||
|
|
"grad_norm": 0.4693892002105713,
|
||
|
|
"learning_rate": 4.221314099848481e-06,
|
||
|
|
"loss": 0.1125,
|
||
|
|
"num_input_tokens_seen": 643144,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6683991683991684,
|
||
|
|
"grad_norm": 38.350101470947266,
|
||
|
|
"learning_rate": 4.214724125042195e-06,
|
||
|
|
"loss": 0.1457,
|
||
|
|
"num_input_tokens_seen": 644936,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6735966735966736,
|
||
|
|
"grad_norm": 36.42043685913086,
|
||
|
|
"learning_rate": 4.208111573644975e-06,
|
||
|
|
"loss": 0.1623,
|
||
|
|
"num_input_tokens_seen": 646984,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.678794178794179,
|
||
|
|
"grad_norm": 0.15917447209358215,
|
||
|
|
"learning_rate": 4.2014765327197285e-06,
|
||
|
|
"loss": 0.2052,
|
||
|
|
"num_input_tokens_seen": 649032,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.683991683991684,
|
||
|
|
"grad_norm": 25.833524703979492,
|
||
|
|
"learning_rate": 4.194819089625466e-06,
|
||
|
|
"loss": 0.2047,
|
||
|
|
"num_input_tokens_seen": 651080,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.689189189189189,
|
||
|
|
"grad_norm": 13.379853248596191,
|
||
|
|
"learning_rate": 4.188139332016154e-06,
|
||
|
|
"loss": 0.2123,
|
||
|
|
"num_input_tokens_seen": 653000,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6943866943866945,
|
||
|
|
"grad_norm": 10.135590553283691,
|
||
|
|
"learning_rate": 4.181437347839559e-06,
|
||
|
|
"loss": 0.2089,
|
||
|
|
"num_input_tokens_seen": 654920,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6995841995841996,
|
||
|
|
"grad_norm": 3.057936191558838,
|
||
|
|
"learning_rate": 4.174713225336087e-06,
|
||
|
|
"loss": 0.1685,
|
||
|
|
"num_input_tokens_seen": 656904,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7047817047817047,
|
||
|
|
"grad_norm": 29.033493041992188,
|
||
|
|
"learning_rate": 4.167967053037625e-06,
|
||
|
|
"loss": 0.105,
|
||
|
|
"num_input_tokens_seen": 658952,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.70997920997921,
|
||
|
|
"grad_norm": 0.9139642715454102,
|
||
|
|
"learning_rate": 4.161198919766375e-06,
|
||
|
|
"loss": 0.0899,
|
||
|
|
"num_input_tokens_seen": 660872,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7151767151767152,
|
||
|
|
"grad_norm": 37.07249069213867,
|
||
|
|
"learning_rate": 4.154408914633685e-06,
|
||
|
|
"loss": 0.2054,
|
||
|
|
"num_input_tokens_seen": 662856,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7203742203742203,
|
||
|
|
"grad_norm": 6.607808589935303,
|
||
|
|
"learning_rate": 4.147597127038873e-06,
|
||
|
|
"loss": 0.2025,
|
||
|
|
"num_input_tokens_seen": 664904,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7255717255717256,
|
||
|
|
"grad_norm": 20.834936141967773,
|
||
|
|
"learning_rate": 4.140763646668051e-06,
|
||
|
|
"loss": 0.141,
|
||
|
|
"num_input_tokens_seen": 666888,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7307692307692308,
|
||
|
|
"grad_norm": 8.577420234680176,
|
||
|
|
"learning_rate": 4.133908563492949e-06,
|
||
|
|
"loss": 0.0252,
|
||
|
|
"num_input_tokens_seen": 668936,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.735966735966736,
|
||
|
|
"grad_norm": 86.40946960449219,
|
||
|
|
"learning_rate": 4.12703196776972e-06,
|
||
|
|
"loss": 0.2066,
|
||
|
|
"num_input_tokens_seen": 670856,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7411642411642412,
|
||
|
|
"grad_norm": 30.79783058166504,
|
||
|
|
"learning_rate": 4.120133950037763e-06,
|
||
|
|
"loss": 0.3627,
|
||
|
|
"num_input_tokens_seen": 672840,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7463617463617465,
|
||
|
|
"grad_norm": 40.20522689819336,
|
||
|
|
"learning_rate": 4.113214601118523e-06,
|
||
|
|
"loss": 0.2218,
|
||
|
|
"num_input_tokens_seen": 674824,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7515592515592515,
|
||
|
|
"grad_norm": 26.033588409423828,
|
||
|
|
"learning_rate": 4.106274012114302e-06,
|
||
|
|
"loss": 0.1042,
|
||
|
|
"num_input_tokens_seen": 676808,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7536382536382535,
|
||
|
|
"eval_loss": 0.2114141583442688,
|
||
|
|
"eval_runtime": 1.0685,
|
||
|
|
"eval_samples_per_second": 801.127,
|
||
|
|
"eval_steps_per_second": 100.141,
|
||
|
|
"num_input_tokens_seen": 677576,
|
||
|
|
"step": 1687
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7567567567567568,
|
||
|
|
"grad_norm": 20.88821792602539,
|
||
|
|
"learning_rate": 4.099312274407049e-06,
|
||
|
|
"loss": 0.1712,
|
||
|
|
"num_input_tokens_seen": 678728,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.761954261954262,
|
||
|
|
"grad_norm": 36.20766067504883,
|
||
|
|
"learning_rate": 4.092329479657168e-06,
|
||
|
|
"loss": 0.1031,
|
||
|
|
"num_input_tokens_seen": 680776,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7671517671517671,
|
||
|
|
"grad_norm": 1.624861478805542,
|
||
|
|
"learning_rate": 4.085325719802307e-06,
|
||
|
|
"loss": 0.1288,
|
||
|
|
"num_input_tokens_seen": 683016,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7723492723492722,
|
||
|
|
"grad_norm": 5.351211071014404,
|
||
|
|
"learning_rate": 4.0783010870561445e-06,
|
||
|
|
"loss": 0.0556,
|
||
|
|
"num_input_tokens_seen": 685256,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7775467775467777,
|
||
|
|
"grad_norm": 45.75218963623047,
|
||
|
|
"learning_rate": 4.07125567390718e-06,
|
||
|
|
"loss": 0.3125,
|
||
|
|
"num_input_tokens_seen": 687304,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7827442827442828,
|
||
|
|
"grad_norm": 26.084054946899414,
|
||
|
|
"learning_rate": 4.064189573117512e-06,
|
||
|
|
"loss": 0.2158,
|
||
|
|
"num_input_tokens_seen": 689224,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7879417879417878,
|
||
|
|
"grad_norm": 1.3422380685806274,
|
||
|
|
"learning_rate": 4.057102877721621e-06,
|
||
|
|
"loss": 0.1701,
|
||
|
|
"num_input_tokens_seen": 691400,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.793139293139293,
|
||
|
|
"grad_norm": 14.056601524353027,
|
||
|
|
"learning_rate": 4.049995681025143e-06,
|
||
|
|
"loss": 0.1154,
|
||
|
|
"num_input_tokens_seen": 693320,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7983367983367984,
|
||
|
|
"grad_norm": 18.775554656982422,
|
||
|
|
"learning_rate": 4.0428680766036386e-06,
|
||
|
|
"loss": 0.1654,
|
||
|
|
"num_input_tokens_seen": 695432,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8035343035343034,
|
||
|
|
"grad_norm": 18.788192749023438,
|
||
|
|
"learning_rate": 4.035720158301363e-06,
|
||
|
|
"loss": 0.2169,
|
||
|
|
"num_input_tokens_seen": 697544,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8087318087318087,
|
||
|
|
"grad_norm": 15.670763969421387,
|
||
|
|
"learning_rate": 4.028552020230031e-06,
|
||
|
|
"loss": 0.1438,
|
||
|
|
"num_input_tokens_seen": 699592,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.813929313929314,
|
||
|
|
"grad_norm": 27.88116455078125,
|
||
|
|
"learning_rate": 4.021363756767577e-06,
|
||
|
|
"loss": 0.2247,
|
||
|
|
"num_input_tokens_seen": 701576,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.819126819126819,
|
||
|
|
"grad_norm": 17.78391456604004,
|
||
|
|
"learning_rate": 4.014155462556913e-06,
|
||
|
|
"loss": 0.2586,
|
||
|
|
"num_input_tokens_seen": 703688,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8243243243243243,
|
||
|
|
"grad_norm": 16.56380844116211,
|
||
|
|
"learning_rate": 4.006927232504682e-06,
|
||
|
|
"loss": 0.2187,
|
||
|
|
"num_input_tokens_seen": 705736,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8295218295218296,
|
||
|
|
"grad_norm": 6.663086414337158,
|
||
|
|
"learning_rate": 3.999679161780006e-06,
|
||
|
|
"loss": 0.043,
|
||
|
|
"num_input_tokens_seen": 707720,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8347193347193347,
|
||
|
|
"grad_norm": 0.7404634952545166,
|
||
|
|
"learning_rate": 3.99241134581324e-06,
|
||
|
|
"loss": 0.08,
|
||
|
|
"num_input_tokens_seen": 709896,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.83991683991684,
|
||
|
|
"grad_norm": 39.46183395385742,
|
||
|
|
"learning_rate": 3.985123880294708e-06,
|
||
|
|
"loss": 0.1669,
|
||
|
|
"num_input_tokens_seen": 711944,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8451143451143452,
|
||
|
|
"grad_norm": 7.113590717315674,
|
||
|
|
"learning_rate": 3.977816861173446e-06,
|
||
|
|
"loss": 0.1912,
|
||
|
|
"num_input_tokens_seen": 713992,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8503118503118503,
|
||
|
|
"grad_norm": 48.9863395690918,
|
||
|
|
"learning_rate": 3.970490384655939e-06,
|
||
|
|
"loss": 0.1846,
|
||
|
|
"num_input_tokens_seen": 715976,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8555093555093554,
|
||
|
|
"grad_norm": 31.94883918762207,
|
||
|
|
"learning_rate": 3.963144547204856e-06,
|
||
|
|
"loss": 0.105,
|
||
|
|
"num_input_tokens_seen": 718024,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8607068607068609,
|
||
|
|
"grad_norm": 11.180656433105469,
|
||
|
|
"learning_rate": 3.955779445537776e-06,
|
||
|
|
"loss": 0.2342,
|
||
|
|
"num_input_tokens_seen": 720072,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.865904365904366,
|
||
|
|
"grad_norm": 5.974298000335693,
|
||
|
|
"learning_rate": 3.948395176625918e-06,
|
||
|
|
"loss": 0.2314,
|
||
|
|
"num_input_tokens_seen": 722120,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.871101871101871,
|
||
|
|
"grad_norm": 3.235103130340576,
|
||
|
|
"learning_rate": 3.940991837692861e-06,
|
||
|
|
"loss": 0.1187,
|
||
|
|
"num_input_tokens_seen": 724168,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8762993762993763,
|
||
|
|
"grad_norm": 25.299224853515625,
|
||
|
|
"learning_rate": 3.933569526213268e-06,
|
||
|
|
"loss": 0.1292,
|
||
|
|
"num_input_tokens_seen": 726280,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8814968814968815,
|
||
|
|
"grad_norm": 2.7856338024139404,
|
||
|
|
"learning_rate": 3.926128339911599e-06,
|
||
|
|
"loss": 0.0843,
|
||
|
|
"num_input_tokens_seen": 728264,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8866943866943866,
|
||
|
|
"grad_norm": 0.9576424360275269,
|
||
|
|
"learning_rate": 3.918668376760827e-06,
|
||
|
|
"loss": 0.1791,
|
||
|
|
"num_input_tokens_seen": 730312,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8918918918918919,
|
||
|
|
"grad_norm": 55.718650817871094,
|
||
|
|
"learning_rate": 3.9111897349811455e-06,
|
||
|
|
"loss": 0.1365,
|
||
|
|
"num_input_tokens_seen": 732296,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8970893970893972,
|
||
|
|
"grad_norm": 20.79973793029785,
|
||
|
|
"learning_rate": 3.903692513038677e-06,
|
||
|
|
"loss": 0.1369,
|
||
|
|
"num_input_tokens_seen": 734088,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9022869022869022,
|
||
|
|
"grad_norm": 51.692867279052734,
|
||
|
|
"learning_rate": 3.896176809644178e-06,
|
||
|
|
"loss": 0.2305,
|
||
|
|
"num_input_tokens_seen": 736072,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9074844074844075,
|
||
|
|
"grad_norm": 3.8833014965057373,
|
||
|
|
"learning_rate": 3.8886427237517345e-06,
|
||
|
|
"loss": 0.2062,
|
||
|
|
"num_input_tokens_seen": 738120,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9126819126819128,
|
||
|
|
"grad_norm": 26.46794891357422,
|
||
|
|
"learning_rate": 3.881090354557463e-06,
|
||
|
|
"loss": 0.2077,
|
||
|
|
"num_input_tokens_seen": 740168,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9178794178794178,
|
||
|
|
"grad_norm": 11.211897850036621,
|
||
|
|
"learning_rate": 3.8735198014982066e-06,
|
||
|
|
"loss": 0.1425,
|
||
|
|
"num_input_tokens_seen": 742280,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9230769230769231,
|
||
|
|
"grad_norm": 26.17072296142578,
|
||
|
|
"learning_rate": 3.865931164250219e-06,
|
||
|
|
"loss": 0.0702,
|
||
|
|
"num_input_tokens_seen": 744328,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9282744282744284,
|
||
|
|
"grad_norm": 13.180768966674805,
|
||
|
|
"learning_rate": 3.858324542727859e-06,
|
||
|
|
"loss": 0.1732,
|
||
|
|
"num_input_tokens_seen": 746440,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9334719334719335,
|
||
|
|
"grad_norm": 41.96379470825195,
|
||
|
|
"learning_rate": 3.8507000370822675e-06,
|
||
|
|
"loss": 0.1543,
|
||
|
|
"num_input_tokens_seen": 748488,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9386694386694385,
|
||
|
|
"grad_norm": 14.179240226745605,
|
||
|
|
"learning_rate": 3.84305774770006e-06,
|
||
|
|
"loss": 0.1298,
|
||
|
|
"num_input_tokens_seen": 750344,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.943866943866944,
|
||
|
|
"grad_norm": 10.15986442565918,
|
||
|
|
"learning_rate": 3.835397775201991e-06,
|
||
|
|
"loss": 0.0507,
|
||
|
|
"num_input_tokens_seen": 752328,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.949064449064449,
|
||
|
|
"grad_norm": 14.734403610229492,
|
||
|
|
"learning_rate": 3.827720220441642e-06,
|
||
|
|
"loss": 0.2625,
|
||
|
|
"num_input_tokens_seen": 754312,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9542619542619541,
|
||
|
|
"grad_norm": 38.59032440185547,
|
||
|
|
"learning_rate": 3.820025184504085e-06,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"num_input_tokens_seen": 756232,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9594594594594594,
|
||
|
|
"grad_norm": 22.36400604248047,
|
||
|
|
"learning_rate": 3.812312768704557e-06,
|
||
|
|
"loss": 0.2626,
|
||
|
|
"num_input_tokens_seen": 758280,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9646569646569647,
|
||
|
|
"grad_norm": 5.247732162475586,
|
||
|
|
"learning_rate": 3.80458307458712e-06,
|
||
|
|
"loss": 0.1128,
|
||
|
|
"num_input_tokens_seen": 760328,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9698544698544698,
|
||
|
|
"grad_norm": 14.368209838867188,
|
||
|
|
"learning_rate": 3.7968362039233315e-06,
|
||
|
|
"loss": 0.1213,
|
||
|
|
"num_input_tokens_seen": 762248,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.975051975051975,
|
||
|
|
"grad_norm": 22.700754165649414,
|
||
|
|
"learning_rate": 3.7890722587108985e-06,
|
||
|
|
"loss": 0.077,
|
||
|
|
"num_input_tokens_seen": 764168,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9802494802494803,
|
||
|
|
"grad_norm": 1.1314526796340942,
|
||
|
|
"learning_rate": 3.7812913411723377e-06,
|
||
|
|
"loss": 0.0655,
|
||
|
|
"num_input_tokens_seen": 766216,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9854469854469854,
|
||
|
|
"grad_norm": 22.53055763244629,
|
||
|
|
"learning_rate": 3.773493553753628e-06,
|
||
|
|
"loss": 0.0962,
|
||
|
|
"num_input_tokens_seen": 768264,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9906444906444907,
|
||
|
|
"grad_norm": 19.964323043823242,
|
||
|
|
"learning_rate": 3.7656789991228638e-06,
|
||
|
|
"loss": 0.0219,
|
||
|
|
"num_input_tokens_seen": 770184,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.995841995841996,
|
||
|
|
"grad_norm": 3.517256259918213,
|
||
|
|
"learning_rate": 3.7578477801689e-06,
|
||
|
|
"loss": 0.1279,
|
||
|
|
"num_input_tokens_seen": 772168,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.001039501039501,
|
||
|
|
"grad_norm": 0.14674918353557587,
|
||
|
|
"learning_rate": 3.7500000000000005e-06,
|
||
|
|
"loss": 0.096,
|
||
|
|
"num_input_tokens_seen": 774160,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.004158004158004,
|
||
|
|
"eval_loss": 0.3561875522136688,
|
||
|
|
"eval_runtime": 1.0449,
|
||
|
|
"eval_samples_per_second": 819.204,
|
||
|
|
"eval_steps_per_second": 102.401,
|
||
|
|
"num_input_tokens_seen": 775312,
|
||
|
|
"step": 1928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.006237006237006,
|
||
|
|
"grad_norm": 1.6722761392593384,
|
||
|
|
"learning_rate": 3.7421357619424793e-06,
|
||
|
|
"loss": 0.0698,
|
||
|
|
"num_input_tokens_seen": 776144,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0114345114345116,
|
||
|
|
"grad_norm": 0.06727920472621918,
|
||
|
|
"learning_rate": 3.7342551695393375e-06,
|
||
|
|
"loss": 0.0941,
|
||
|
|
"num_input_tokens_seen": 778128,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0166320166320166,
|
||
|
|
"grad_norm": 0.06038059666752815,
|
||
|
|
"learning_rate": 3.7263583265489077e-06,
|
||
|
|
"loss": 0.0863,
|
||
|
|
"num_input_tokens_seen": 780176,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0218295218295217,
|
||
|
|
"grad_norm": 52.84983444213867,
|
||
|
|
"learning_rate": 3.718445336943478e-06,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"num_input_tokens_seen": 782160,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.027027027027027,
|
||
|
|
"grad_norm": 35.81016540527344,
|
||
|
|
"learning_rate": 3.7105163049079305e-06,
|
||
|
|
"loss": 0.0675,
|
||
|
|
"num_input_tokens_seen": 784208,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0322245322245323,
|
||
|
|
"grad_norm": 0.14884886145591736,
|
||
|
|
"learning_rate": 3.702571334838365e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 786256,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0374220374220373,
|
||
|
|
"grad_norm": 0.03602315112948418,
|
||
|
|
"learning_rate": 3.6946105313407287e-06,
|
||
|
|
"loss": 0.1288,
|
||
|
|
"num_input_tokens_seen": 788240,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.042619542619543,
|
||
|
|
"grad_norm": 152.0932159423828,
|
||
|
|
"learning_rate": 3.6866339992294347e-06,
|
||
|
|
"loss": 0.1179,
|
||
|
|
"num_input_tokens_seen": 790288,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.047817047817048,
|
||
|
|
"grad_norm": 0.03992204740643501,
|
||
|
|
"learning_rate": 3.678641843525986e-06,
|
||
|
|
"loss": 0.0768,
|
||
|
|
"num_input_tokens_seen": 792272,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.053014553014553,
|
||
|
|
"grad_norm": 0.021054543554782867,
|
||
|
|
"learning_rate": 3.670634169457587e-06,
|
||
|
|
"loss": 0.0297,
|
||
|
|
"num_input_tokens_seen": 794384,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0582120582120584,
|
||
|
|
"grad_norm": 0.09833300858736038,
|
||
|
|
"learning_rate": 3.662611082455766e-06,
|
||
|
|
"loss": 0.1305,
|
||
|
|
"num_input_tokens_seen": 796368,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0634095634095635,
|
||
|
|
"grad_norm": 1.8203060626983643,
|
||
|
|
"learning_rate": 3.6545726881549792e-06,
|
||
|
|
"loss": 0.0029,
|
||
|
|
"num_input_tokens_seen": 798480,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0686070686070686,
|
||
|
|
"grad_norm": 13.759748458862305,
|
||
|
|
"learning_rate": 3.6465190923912275e-06,
|
||
|
|
"loss": 0.0937,
|
||
|
|
"num_input_tokens_seen": 800528,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0738045738045736,
|
||
|
|
"grad_norm": 0.23287345468997955,
|
||
|
|
"learning_rate": 3.6384504012006544e-06,
|
||
|
|
"loss": 0.1904,
|
||
|
|
"num_input_tokens_seen": 802768,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.079002079002079,
|
||
|
|
"grad_norm": 15.67501163482666,
|
||
|
|
"learning_rate": 3.6303667208181576e-06,
|
||
|
|
"loss": 0.1647,
|
||
|
|
"num_input_tokens_seen": 804752,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.084199584199584,
|
||
|
|
"grad_norm": 0.5168598294258118,
|
||
|
|
"learning_rate": 3.622268157675986e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"num_input_tokens_seen": 806672,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0893970893970892,
|
||
|
|
"grad_norm": 0.3060367703437805,
|
||
|
|
"learning_rate": 3.614154818402339e-06,
|
||
|
|
"loss": 0.0186,
|
||
|
|
"num_input_tokens_seen": 808656,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0945945945945947,
|
||
|
|
"grad_norm": 0.8559133410453796,
|
||
|
|
"learning_rate": 3.6060268098199656e-06,
|
||
|
|
"loss": 0.0494,
|
||
|
|
"num_input_tokens_seen": 810640,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0997920997921,
|
||
|
|
"grad_norm": 0.40390461683273315,
|
||
|
|
"learning_rate": 3.5978842389447523e-06,
|
||
|
|
"loss": 0.0657,
|
||
|
|
"num_input_tokens_seen": 812688,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.104989604989605,
|
||
|
|
"grad_norm": 0.16403798758983612,
|
||
|
|
"learning_rate": 3.5897272129843198e-06,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"num_input_tokens_seen": 814800,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1101871101871104,
|
||
|
|
"grad_norm": 0.8833001255989075,
|
||
|
|
"learning_rate": 3.5815558393366064e-06,
|
||
|
|
"loss": 0.0252,
|
||
|
|
"num_input_tokens_seen": 816912,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1153846153846154,
|
||
|
|
"grad_norm": 0.044099193066358566,
|
||
|
|
"learning_rate": 3.57337022558846e-06,
|
||
|
|
"loss": 0.1156,
|
||
|
|
"num_input_tokens_seen": 818896,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1205821205821205,
|
||
|
|
"grad_norm": 20.8973445892334,
|
||
|
|
"learning_rate": 3.5651704795142137e-06,
|
||
|
|
"loss": 0.0855,
|
||
|
|
"num_input_tokens_seen": 820880,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.125779625779626,
|
||
|
|
"grad_norm": 21.765148162841797,
|
||
|
|
"learning_rate": 3.5569567090742763e-06,
|
||
|
|
"loss": 0.1594,
|
||
|
|
"num_input_tokens_seen": 822864,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.130977130977131,
|
||
|
|
"grad_norm": 2.817866325378418,
|
||
|
|
"learning_rate": 3.548729022413701e-06,
|
||
|
|
"loss": 0.0265,
|
||
|
|
"num_input_tokens_seen": 825040,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.136174636174636,
|
||
|
|
"grad_norm": 0.0856303721666336,
|
||
|
|
"learning_rate": 3.5404875278607693e-06,
|
||
|
|
"loss": 0.0995,
|
||
|
|
"num_input_tokens_seen": 827024,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.141372141372141,
|
||
|
|
"grad_norm": 0.09817512333393097,
|
||
|
|
"learning_rate": 3.5322323339255602e-06,
|
||
|
|
"loss": 0.072,
|
||
|
|
"num_input_tokens_seen": 829136,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1465696465696467,
|
||
|
|
"grad_norm": 4.946967601776123,
|
||
|
|
"learning_rate": 3.5239635492985248e-06,
|
||
|
|
"loss": 0.0483,
|
||
|
|
"num_input_tokens_seen": 831184,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1517671517671517,
|
||
|
|
"grad_norm": 0.04570393264293671,
|
||
|
|
"learning_rate": 3.5156812828490507e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 833168,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.156964656964657,
|
||
|
|
"grad_norm": 0.031534019857645035,
|
||
|
|
"learning_rate": 3.5073856436240335e-06,
|
||
|
|
"loss": 0.0685,
|
||
|
|
"num_input_tokens_seen": 835216,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1621621621621623,
|
||
|
|
"grad_norm": 0.12049651145935059,
|
||
|
|
"learning_rate": 3.4990767408464383e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 837136,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1673596673596673,
|
||
|
|
"grad_norm": 0.0549314022064209,
|
||
|
|
"learning_rate": 3.4907546839138627e-06,
|
||
|
|
"loss": 0.1832,
|
||
|
|
"num_input_tokens_seen": 839120,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1725571725571724,
|
||
|
|
"grad_norm": 0.11109847575426102,
|
||
|
|
"learning_rate": 3.4824195823970954e-06,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"num_input_tokens_seen": 841104,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.177754677754678,
|
||
|
|
"grad_norm": 9.288878440856934,
|
||
|
|
"learning_rate": 3.4740715460386732e-06,
|
||
|
|
"loss": 0.0894,
|
||
|
|
"num_input_tokens_seen": 843152,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.182952182952183,
|
||
|
|
"grad_norm": 0.1733572781085968,
|
||
|
|
"learning_rate": 3.46571068475144e-06,
|
||
|
|
"loss": 0.0972,
|
||
|
|
"num_input_tokens_seen": 845136,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.188149688149688,
|
||
|
|
"grad_norm": 0.4034070670604706,
|
||
|
|
"learning_rate": 3.457337108617094e-06,
|
||
|
|
"loss": 0.1887,
|
||
|
|
"num_input_tokens_seen": 847120,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1933471933471935,
|
||
|
|
"grad_norm": 26.97930908203125,
|
||
|
|
"learning_rate": 3.4489509278847415e-06,
|
||
|
|
"loss": 0.2052,
|
||
|
|
"num_input_tokens_seen": 849168,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1985446985446986,
|
||
|
|
"grad_norm": 74.03186798095703,
|
||
|
|
"learning_rate": 3.440552252969446e-06,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"num_input_tokens_seen": 851152,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2037422037422036,
|
||
|
|
"grad_norm": 3.1255908012390137,
|
||
|
|
"learning_rate": 3.432141194450772e-06,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"num_input_tokens_seen": 853008,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.208939708939709,
|
||
|
|
"grad_norm": 0.36267173290252686,
|
||
|
|
"learning_rate": 3.4237178630713312e-06,
|
||
|
|
"loss": 0.0651,
|
||
|
|
"num_input_tokens_seen": 855120,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.214137214137214,
|
||
|
|
"grad_norm": 0.01146312803030014,
|
||
|
|
"learning_rate": 3.4152823697353237e-06,
|
||
|
|
"loss": 0.1599,
|
||
|
|
"num_input_tokens_seen": 857232,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2193347193347193,
|
||
|
|
"grad_norm": 28.126773834228516,
|
||
|
|
"learning_rate": 3.4068348255070764e-06,
|
||
|
|
"loss": 0.057,
|
||
|
|
"num_input_tokens_seen": 859344,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2245322245322248,
|
||
|
|
"grad_norm": 15.560206413269043,
|
||
|
|
"learning_rate": 3.3983753416095844e-06,
|
||
|
|
"loss": 0.0868,
|
||
|
|
"num_input_tokens_seen": 861328,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.22972972972973,
|
||
|
|
"grad_norm": 13.887256622314453,
|
||
|
|
"learning_rate": 3.3899040294230413e-06,
|
||
|
|
"loss": 0.2098,
|
||
|
|
"num_input_tokens_seen": 863376,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.234927234927235,
|
||
|
|
"grad_norm": 0.028023365885019302,
|
||
|
|
"learning_rate": 3.381421000483378e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"num_input_tokens_seen": 865424,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.24012474012474,
|
||
|
|
"grad_norm": 32.4400520324707,
|
||
|
|
"learning_rate": 3.37292636648079e-06,
|
||
|
|
"loss": 0.0981,
|
||
|
|
"num_input_tokens_seen": 867472,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2453222453222454,
|
||
|
|
"grad_norm": 72.8097915649414,
|
||
|
|
"learning_rate": 3.3644202392582703e-06,
|
||
|
|
"loss": 0.1542,
|
||
|
|
"num_input_tokens_seen": 869584,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2505197505197505,
|
||
|
|
"grad_norm": 1.8866627216339111,
|
||
|
|
"learning_rate": 3.3559027308101344e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"num_input_tokens_seen": 871568,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2546777546777546,
|
||
|
|
"eval_loss": 0.30345332622528076,
|
||
|
|
"eval_runtime": 1.0686,
|
||
|
|
"eval_samples_per_second": 801.043,
|
||
|
|
"eval_steps_per_second": 100.13,
|
||
|
|
"num_input_tokens_seen": 873104,
|
||
|
|
"step": 2169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2557172557172556,
|
||
|
|
"grad_norm": 12.88599967956543,
|
||
|
|
"learning_rate": 3.3473739532805464e-06,
|
||
|
|
"loss": 0.0945,
|
||
|
|
"num_input_tokens_seen": 873488,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.260914760914761,
|
||
|
|
"grad_norm": 2.817438840866089,
|
||
|
|
"learning_rate": 3.3388340189620427e-06,
|
||
|
|
"loss": 0.1038,
|
||
|
|
"num_input_tokens_seen": 875472,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.266112266112266,
|
||
|
|
"grad_norm": 0.07076973468065262,
|
||
|
|
"learning_rate": 3.3302830402940534e-06,
|
||
|
|
"loss": 0.0275,
|
||
|
|
"num_input_tokens_seen": 877392,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.271309771309771,
|
||
|
|
"grad_norm": 0.515289306640625,
|
||
|
|
"learning_rate": 3.3217211298614225e-06,
|
||
|
|
"loss": 0.1037,
|
||
|
|
"num_input_tokens_seen": 879504,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2765072765072767,
|
||
|
|
"grad_norm": 4.857708930969238,
|
||
|
|
"learning_rate": 3.313148400392925e-06,
|
||
|
|
"loss": 0.0551,
|
||
|
|
"num_input_tokens_seen": 881360,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2817047817047817,
|
||
|
|
"grad_norm": 0.10649969428777695,
|
||
|
|
"learning_rate": 3.3045649647597814e-06,
|
||
|
|
"loss": 0.094,
|
||
|
|
"num_input_tokens_seen": 883280,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.286902286902287,
|
||
|
|
"grad_norm": 0.1783861368894577,
|
||
|
|
"learning_rate": 3.2959709359741743e-06,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"num_input_tokens_seen": 885328,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2920997920997923,
|
||
|
|
"grad_norm": 23.481298446655273,
|
||
|
|
"learning_rate": 3.2873664271877588e-06,
|
||
|
|
"loss": 0.0732,
|
||
|
|
"num_input_tokens_seen": 887312,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2972972972972974,
|
||
|
|
"grad_norm": 54.74378967285156,
|
||
|
|
"learning_rate": 3.2787515516901717e-06,
|
||
|
|
"loss": 0.0574,
|
||
|
|
"num_input_tokens_seen": 889296,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3024948024948024,
|
||
|
|
"grad_norm": 0.0618121400475502,
|
||
|
|
"learning_rate": 3.2701264229075443e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 891408,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3076923076923075,
|
||
|
|
"grad_norm": 0.05290354788303375,
|
||
|
|
"learning_rate": 3.261491154401001e-06,
|
||
|
|
"loss": 0.001,
|
||
|
|
"num_input_tokens_seen": 893392,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.312889812889813,
|
||
|
|
"grad_norm": 21.230501174926758,
|
||
|
|
"learning_rate": 3.2528458598651735e-06,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"num_input_tokens_seen": 895440,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.318087318087318,
|
||
|
|
"grad_norm": 2.141026258468628,
|
||
|
|
"learning_rate": 3.2441906531266963e-06,
|
||
|
|
"loss": 0.1493,
|
||
|
|
"num_input_tokens_seen": 897616,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.323284823284823,
|
||
|
|
"grad_norm": 35.35129165649414,
|
||
|
|
"learning_rate": 3.2355256481427145e-06,
|
||
|
|
"loss": 0.0359,
|
||
|
|
"num_input_tokens_seen": 899536,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3284823284823286,
|
||
|
|
"grad_norm": 0.017625411972403526,
|
||
|
|
"learning_rate": 3.2268509589993745e-06,
|
||
|
|
"loss": 0.0408,
|
||
|
|
"num_input_tokens_seen": 901648,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3336798336798337,
|
||
|
|
"grad_norm": 3.886178731918335,
|
||
|
|
"learning_rate": 3.218166699910332e-06,
|
||
|
|
"loss": 0.1054,
|
||
|
|
"num_input_tokens_seen": 903696,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3388773388773387,
|
||
|
|
"grad_norm": 27.935588836669922,
|
||
|
|
"learning_rate": 3.209472985215243e-06,
|
||
|
|
"loss": 0.1455,
|
||
|
|
"num_input_tokens_seen": 905552,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3440748440748442,
|
||
|
|
"grad_norm": 35.09407043457031,
|
||
|
|
"learning_rate": 3.2007699293782557e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"num_input_tokens_seen": 907472,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3492723492723493,
|
||
|
|
"grad_norm": 58.58681869506836,
|
||
|
|
"learning_rate": 3.1920576469865115e-06,
|
||
|
|
"loss": 0.1043,
|
||
|
|
"num_input_tokens_seen": 909584,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3544698544698544,
|
||
|
|
"grad_norm": 45.3444938659668,
|
||
|
|
"learning_rate": 3.183336252748627e-06,
|
||
|
|
"loss": 0.0544,
|
||
|
|
"num_input_tokens_seen": 911632,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.35966735966736,
|
||
|
|
"grad_norm": 0.2568526566028595,
|
||
|
|
"learning_rate": 3.1746058614931918e-06,
|
||
|
|
"loss": 0.0396,
|
||
|
|
"num_input_tokens_seen": 913616,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.364864864864865,
|
||
|
|
"grad_norm": 55.77798080444336,
|
||
|
|
"learning_rate": 3.16586658816725e-06,
|
||
|
|
"loss": 0.0559,
|
||
|
|
"num_input_tokens_seen": 915728,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.37006237006237,
|
||
|
|
"grad_norm": 0.06812699884176254,
|
||
|
|
"learning_rate": 3.157118547834793e-06,
|
||
|
|
"loss": 0.1154,
|
||
|
|
"num_input_tokens_seen": 917776,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.375259875259875,
|
||
|
|
"grad_norm": 0.2111903578042984,
|
||
|
|
"learning_rate": 3.1483618556752373e-06,
|
||
|
|
"loss": 0.1803,
|
||
|
|
"num_input_tokens_seen": 919952,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3804573804573805,
|
||
|
|
"grad_norm": 0.02562599442899227,
|
||
|
|
"learning_rate": 3.139596626981916e-06,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"num_input_tokens_seen": 921872,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3856548856548856,
|
||
|
|
"grad_norm": 0.2788306176662445,
|
||
|
|
"learning_rate": 3.1308229771605546e-06,
|
||
|
|
"loss": 0.1079,
|
||
|
|
"num_input_tokens_seen": 923856,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.390852390852391,
|
||
|
|
"grad_norm": 10.78072738647461,
|
||
|
|
"learning_rate": 3.1220410217277546e-06,
|
||
|
|
"loss": 0.1516,
|
||
|
|
"num_input_tokens_seen": 925968,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.396049896049896,
|
||
|
|
"grad_norm": 3.1442511081695557,
|
||
|
|
"learning_rate": 3.1132508763094715e-06,
|
||
|
|
"loss": 0.0496,
|
||
|
|
"num_input_tokens_seen": 927888,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.401247401247401,
|
||
|
|
"grad_norm": 0.10060002654790878,
|
||
|
|
"learning_rate": 3.1044526566394924e-06,
|
||
|
|
"loss": 0.0691,
|
||
|
|
"num_input_tokens_seen": 929808,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4064449064449063,
|
||
|
|
"grad_norm": 0.07174642384052277,
|
||
|
|
"learning_rate": 3.0956464785579125e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_input_tokens_seen": 931728,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4116424116424118,
|
||
|
|
"grad_norm": 0.3574046790599823,
|
||
|
|
"learning_rate": 3.0868324580096113e-06,
|
||
|
|
"loss": 0.0309,
|
||
|
|
"num_input_tokens_seen": 933840,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.416839916839917,
|
||
|
|
"grad_norm": 0.8769842982292175,
|
||
|
|
"learning_rate": 3.078010711042723e-06,
|
||
|
|
"loss": 0.1115,
|
||
|
|
"num_input_tokens_seen": 935824,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.422037422037422,
|
||
|
|
"grad_norm": 0.044372253119945526,
|
||
|
|
"learning_rate": 3.069181353807111e-06,
|
||
|
|
"loss": 0.043,
|
||
|
|
"num_input_tokens_seen": 937872,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4272349272349274,
|
||
|
|
"grad_norm": 0.3668450713157654,
|
||
|
|
"learning_rate": 3.0603445025528377e-06,
|
||
|
|
"loss": 0.098,
|
||
|
|
"num_input_tokens_seen": 939984,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4324324324324325,
|
||
|
|
"grad_norm": 0.3943478465080261,
|
||
|
|
"learning_rate": 3.051500273628633e-06,
|
||
|
|
"loss": 0.0482,
|
||
|
|
"num_input_tokens_seen": 941968,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4376299376299375,
|
||
|
|
"grad_norm": 40.15293502807617,
|
||
|
|
"learning_rate": 3.042648783480366e-06,
|
||
|
|
"loss": 0.0265,
|
||
|
|
"num_input_tokens_seen": 943952,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.442827442827443,
|
||
|
|
"grad_norm": 0.6149891018867493,
|
||
|
|
"learning_rate": 3.0337901486495073e-06,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"num_input_tokens_seen": 945872,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.448024948024948,
|
||
|
|
"grad_norm": 0.031426846981048584,
|
||
|
|
"learning_rate": 3.0249244857715977e-06,
|
||
|
|
"loss": 0.1045,
|
||
|
|
"num_input_tokens_seen": 947856,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.453222453222453,
|
||
|
|
"grad_norm": 0.029239047318696976,
|
||
|
|
"learning_rate": 3.01605191157471e-06,
|
||
|
|
"loss": 0.0835,
|
||
|
|
"num_input_tokens_seen": 949840,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4584199584199586,
|
||
|
|
"grad_norm": 43.117408752441406,
|
||
|
|
"learning_rate": 3.0071725428779152e-06,
|
||
|
|
"loss": 0.0307,
|
||
|
|
"num_input_tokens_seen": 951760,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4636174636174637,
|
||
|
|
"grad_norm": 5.0166168212890625,
|
||
|
|
"learning_rate": 2.9982864965897423e-06,
|
||
|
|
"loss": 0.0294,
|
||
|
|
"num_input_tokens_seen": 953680,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4688149688149688,
|
||
|
|
"grad_norm": 0.023427043110132217,
|
||
|
|
"learning_rate": 2.9893938897066392e-06,
|
||
|
|
"loss": 0.0349,
|
||
|
|
"num_input_tokens_seen": 955600,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.474012474012474,
|
||
|
|
"grad_norm": 0.23864233493804932,
|
||
|
|
"learning_rate": 2.9804948393114325e-06,
|
||
|
|
"loss": 0.2071,
|
||
|
|
"num_input_tokens_seen": 957456,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4792099792099793,
|
||
|
|
"grad_norm": 0.4926133453845978,
|
||
|
|
"learning_rate": 2.9715894625717868e-06,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"num_input_tokens_seen": 959504,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4844074844074844,
|
||
|
|
"grad_norm": 0.32122310996055603,
|
||
|
|
"learning_rate": 2.9626778767386604e-06,
|
||
|
|
"loss": 0.0277,
|
||
|
|
"num_input_tokens_seen": 961488,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4896049896049894,
|
||
|
|
"grad_norm": 0.13178545236587524,
|
||
|
|
"learning_rate": 2.953760199144764e-06,
|
||
|
|
"loss": 0.1288,
|
||
|
|
"num_input_tokens_seen": 963408,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.494802494802495,
|
||
|
|
"grad_norm": 0.05135256052017212,
|
||
|
|
"learning_rate": 2.9448365472030116e-06,
|
||
|
|
"loss": 0.0595,
|
||
|
|
"num_input_tokens_seen": 965392,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5,
|
||
|
|
"grad_norm": 0.09723832458257675,
|
||
|
|
"learning_rate": 2.935907038404981e-06,
|
||
|
|
"loss": 0.0664,
|
||
|
|
"num_input_tokens_seen": 967440,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.505197505197505,
|
||
|
|
"grad_norm": 90.10528564453125,
|
||
|
|
"learning_rate": 2.9269717903193603e-06,
|
||
|
|
"loss": 0.0894,
|
||
|
|
"num_input_tokens_seen": 969360,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.505197505197505,
|
||
|
|
"eval_loss": 0.3648892641067505,
|
||
|
|
"eval_runtime": 1.0725,
|
||
|
|
"eval_samples_per_second": 798.114,
|
||
|
|
"eval_steps_per_second": 99.764,
|
||
|
|
"num_input_tokens_seen": 969360,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.51039501039501,
|
||
|
|
"grad_norm": 116.97930908203125,
|
||
|
|
"learning_rate": 2.918030920590403e-06,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"num_input_tokens_seen": 971472,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5155925155925156,
|
||
|
|
"grad_norm": 0.011047163046896458,
|
||
|
|
"learning_rate": 2.9090845469363804e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_input_tokens_seen": 973456,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5207900207900207,
|
||
|
|
"grad_norm": 0.1614125818014145,
|
||
|
|
"learning_rate": 2.9001327871480296e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 975504,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.525987525987526,
|
||
|
|
"grad_norm": 0.01074185874313116,
|
||
|
|
"learning_rate": 2.8911757590870028e-06,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"num_input_tokens_seen": 977552,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5311850311850312,
|
||
|
|
"grad_norm": 173.61000061035156,
|
||
|
|
"learning_rate": 2.8822135806843156e-06,
|
||
|
|
"loss": 0.1355,
|
||
|
|
"num_input_tokens_seen": 979536,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5363825363825363,
|
||
|
|
"grad_norm": 0.009233055636286736,
|
||
|
|
"learning_rate": 2.873246369938797e-06,
|
||
|
|
"loss": 0.084,
|
||
|
|
"num_input_tokens_seen": 981584,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5415800415800414,
|
||
|
|
"grad_norm": 3.7363264560699463,
|
||
|
|
"learning_rate": 2.8642742449155287e-06,
|
||
|
|
"loss": 0.0365,
|
||
|
|
"num_input_tokens_seen": 983632,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.546777546777547,
|
||
|
|
"grad_norm": 13.669214248657227,
|
||
|
|
"learning_rate": 2.855297323744301e-06,
|
||
|
|
"loss": 0.1776,
|
||
|
|
"num_input_tokens_seen": 985680,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.551975051975052,
|
||
|
|
"grad_norm": 17.678695678710938,
|
||
|
|
"learning_rate": 2.8463157246180465e-06,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"num_input_tokens_seen": 987664,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5571725571725574,
|
||
|
|
"grad_norm": 0.007595015689730644,
|
||
|
|
"learning_rate": 2.8373295657912947e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 989648,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5623700623700625,
|
||
|
|
"grad_norm": 0.08272235840559006,
|
||
|
|
"learning_rate": 2.828338965578603e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 991696,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5675675675675675,
|
||
|
|
"grad_norm": 14.857572555541992,
|
||
|
|
"learning_rate": 2.8193440423530117e-06,
|
||
|
|
"loss": 0.2142,
|
||
|
|
"num_input_tokens_seen": 993616,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5727650727650726,
|
||
|
|
"grad_norm": 155.0998077392578,
|
||
|
|
"learning_rate": 2.810344914544475e-06,
|
||
|
|
"loss": 0.0503,
|
||
|
|
"num_input_tokens_seen": 995664,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.577962577962578,
|
||
|
|
"grad_norm": 0.12371411919593811,
|
||
|
|
"learning_rate": 2.8013417006383078e-06,
|
||
|
|
"loss": 0.1017,
|
||
|
|
"num_input_tokens_seen": 997648,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.583160083160083,
|
||
|
|
"grad_norm": 0.05877931788563728,
|
||
|
|
"learning_rate": 2.792334519173624e-06,
|
||
|
|
"loss": 0.0796,
|
||
|
|
"num_input_tokens_seen": 999696,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5883575883575882,
|
||
|
|
"grad_norm": 0.09183234721422195,
|
||
|
|
"learning_rate": 2.7833234887417745e-06,
|
||
|
|
"loss": 0.1002,
|
||
|
|
"num_input_tokens_seen": 1001680,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5935550935550937,
|
||
|
|
"grad_norm": 0.1903308629989624,
|
||
|
|
"learning_rate": 2.774308727984787e-06,
|
||
|
|
"loss": 0.0836,
|
||
|
|
"num_input_tokens_seen": 1003728,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.598752598752599,
|
||
|
|
"grad_norm": 0.0603751540184021,
|
||
|
|
"learning_rate": 2.7652903555938047e-06,
|
||
|
|
"loss": 0.0495,
|
||
|
|
"num_input_tokens_seen": 1005584,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.603950103950104,
|
||
|
|
"grad_norm": 0.11998436599969864,
|
||
|
|
"learning_rate": 2.756268490307524e-06,
|
||
|
|
"loss": 0.061,
|
||
|
|
"num_input_tokens_seen": 1007696,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.609147609147609,
|
||
|
|
"grad_norm": 15.387810707092285,
|
||
|
|
"learning_rate": 2.747243250910625e-06,
|
||
|
|
"loss": 0.2945,
|
||
|
|
"num_input_tokens_seen": 1009680,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6143451143451144,
|
||
|
|
"grad_norm": 0.8963765501976013,
|
||
|
|
"learning_rate": 2.7382147562322175e-06,
|
||
|
|
"loss": 0.0414,
|
||
|
|
"num_input_tokens_seen": 1011728,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6195426195426195,
|
||
|
|
"grad_norm": 0.18006020784378052,
|
||
|
|
"learning_rate": 2.729183125144269e-06,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"num_input_tokens_seen": 1013840,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.624740124740125,
|
||
|
|
"grad_norm": 33.4202880859375,
|
||
|
|
"learning_rate": 2.7201484765600426e-06,
|
||
|
|
"loss": 0.1403,
|
||
|
|
"num_input_tokens_seen": 1015824,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.62993762993763,
|
||
|
|
"grad_norm": 0.056018222123384476,
|
||
|
|
"learning_rate": 2.71111092943253e-06,
|
||
|
|
"loss": 0.1792,
|
||
|
|
"num_input_tokens_seen": 1017744,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.635135135135135,
|
||
|
|
"grad_norm": 0.27936848998069763,
|
||
|
|
"learning_rate": 2.702070602752887e-06,
|
||
|
|
"loss": 0.0616,
|
||
|
|
"num_input_tokens_seen": 1019728,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.64033264033264,
|
||
|
|
"grad_norm": 0.3187178075313568,
|
||
|
|
"learning_rate": 2.693027615548864e-06,
|
||
|
|
"loss": 0.0836,
|
||
|
|
"num_input_tokens_seen": 1021840,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6455301455301456,
|
||
|
|
"grad_norm": 18.821897506713867,
|
||
|
|
"learning_rate": 2.6839820868832433e-06,
|
||
|
|
"loss": 0.0909,
|
||
|
|
"num_input_tokens_seen": 1023824,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6507276507276507,
|
||
|
|
"grad_norm": 0.6045968532562256,
|
||
|
|
"learning_rate": 2.6749341358522675e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"num_input_tokens_seen": 1025616,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6559251559251558,
|
||
|
|
"grad_norm": 0.18768300116062164,
|
||
|
|
"learning_rate": 2.665883881584072e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"num_input_tokens_seen": 1027664,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6611226611226613,
|
||
|
|
"grad_norm": 0.04695185646414757,
|
||
|
|
"learning_rate": 2.6568314432371183e-06,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"num_input_tokens_seen": 1029648,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6663201663201663,
|
||
|
|
"grad_norm": 0.04115242138504982,
|
||
|
|
"learning_rate": 2.647776939998625e-06,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"num_input_tokens_seen": 1031632,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6715176715176714,
|
||
|
|
"grad_norm": 0.029054885730147362,
|
||
|
|
"learning_rate": 2.6387204910829954e-06,
|
||
|
|
"loss": 0.0416,
|
||
|
|
"num_input_tokens_seen": 1033488,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6767151767151764,
|
||
|
|
"grad_norm": 12.681103706359863,
|
||
|
|
"learning_rate": 2.629662215730253e-06,
|
||
|
|
"loss": 0.0011,
|
||
|
|
"num_input_tokens_seen": 1035536,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.681912681912682,
|
||
|
|
"grad_norm": 0.039936427026987076,
|
||
|
|
"learning_rate": 2.620602233204467e-06,
|
||
|
|
"loss": 0.0636,
|
||
|
|
"num_input_tokens_seen": 1037584,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.687110187110187,
|
||
|
|
"grad_norm": 17.539627075195312,
|
||
|
|
"learning_rate": 2.6115406627921823e-06,
|
||
|
|
"loss": 0.1506,
|
||
|
|
"num_input_tokens_seen": 1039568,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6923076923076925,
|
||
|
|
"grad_norm": 0.6394942998886108,
|
||
|
|
"learning_rate": 2.6024776238008543e-06,
|
||
|
|
"loss": 0.0269,
|
||
|
|
"num_input_tokens_seen": 1041616,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6975051975051976,
|
||
|
|
"grad_norm": 0.01985405571758747,
|
||
|
|
"learning_rate": 2.5934132355572713e-06,
|
||
|
|
"loss": 0.1038,
|
||
|
|
"num_input_tokens_seen": 1043664,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7027027027027026,
|
||
|
|
"grad_norm": 12.785253524780273,
|
||
|
|
"learning_rate": 2.5843476174059874e-06,
|
||
|
|
"loss": 0.159,
|
||
|
|
"num_input_tokens_seen": 1045520,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7079002079002077,
|
||
|
|
"grad_norm": 0.04270019382238388,
|
||
|
|
"learning_rate": 2.575280888707748e-06,
|
||
|
|
"loss": 0.1412,
|
||
|
|
"num_input_tokens_seen": 1047376,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.713097713097713,
|
||
|
|
"grad_norm": 7.434192657470703,
|
||
|
|
"learning_rate": 2.5662131688379244e-06,
|
||
|
|
"loss": 0.0029,
|
||
|
|
"num_input_tokens_seen": 1049360,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7182952182952183,
|
||
|
|
"grad_norm": 0.07113603502511978,
|
||
|
|
"learning_rate": 2.557144577184933e-06,
|
||
|
|
"loss": 0.054,
|
||
|
|
"num_input_tokens_seen": 1051344,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7234927234927238,
|
||
|
|
"grad_norm": 0.07835633307695389,
|
||
|
|
"learning_rate": 2.5480752331486742e-06,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"num_input_tokens_seen": 1053264,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.728690228690229,
|
||
|
|
"grad_norm": 0.1012062355875969,
|
||
|
|
"learning_rate": 2.539005256138948e-06,
|
||
|
|
"loss": 0.0494,
|
||
|
|
"num_input_tokens_seen": 1055248,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.733887733887734,
|
||
|
|
"grad_norm": 87.71424865722656,
|
||
|
|
"learning_rate": 2.529934765573893e-06,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"num_input_tokens_seen": 1057104,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.739085239085239,
|
||
|
|
"grad_norm": 0.09936363995075226,
|
||
|
|
"learning_rate": 2.520863880878408e-06,
|
||
|
|
"loss": 0.0379,
|
||
|
|
"num_input_tokens_seen": 1059024,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7442827442827444,
|
||
|
|
"grad_norm": 1.4544926881790161,
|
||
|
|
"learning_rate": 2.511792721482581e-06,
|
||
|
|
"loss": 0.2379,
|
||
|
|
"num_input_tokens_seen": 1060944,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7494802494802495,
|
||
|
|
"grad_norm": 1.8390294313430786,
|
||
|
|
"learning_rate": 2.502721406820116e-06,
|
||
|
|
"loss": 0.038,
|
||
|
|
"num_input_tokens_seen": 1062992,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7546777546777546,
|
||
|
|
"grad_norm": 0.23238161206245422,
|
||
|
|
"learning_rate": 2.493650056326763e-06,
|
||
|
|
"loss": 0.0705,
|
||
|
|
"num_input_tokens_seen": 1064848,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7557172557172556,
|
||
|
|
"eval_loss": 0.306118369102478,
|
||
|
|
"eval_runtime": 1.0992,
|
||
|
|
"eval_samples_per_second": 778.717,
|
||
|
|
"eval_steps_per_second": 97.34,
|
||
|
|
"num_input_tokens_seen": 1065232,
|
||
|
|
"step": 2651
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.75987525987526,
|
||
|
|
"grad_norm": 1.4501862525939941,
|
||
|
|
"learning_rate": 2.4845787894387427e-06,
|
||
|
|
"loss": 0.2106,
|
||
|
|
"num_input_tokens_seen": 1066832,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.765072765072765,
|
||
|
|
"grad_norm": 0.20231160521507263,
|
||
|
|
"learning_rate": 2.4755077255911746e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"num_input_tokens_seen": 1068880,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.77027027027027,
|
||
|
|
"grad_norm": 12.596285820007324,
|
||
|
|
"learning_rate": 2.466436984216507e-06,
|
||
|
|
"loss": 0.151,
|
||
|
|
"num_input_tokens_seen": 1070864,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7754677754677752,
|
||
|
|
"grad_norm": 2.4909775257110596,
|
||
|
|
"learning_rate": 2.4573666847429383e-06,
|
||
|
|
"loss": 0.1102,
|
||
|
|
"num_input_tokens_seen": 1072848,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7806652806652807,
|
||
|
|
"grad_norm": 0.3123326301574707,
|
||
|
|
"learning_rate": 2.4482969465928545e-06,
|
||
|
|
"loss": 0.0628,
|
||
|
|
"num_input_tokens_seen": 1074832,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.785862785862786,
|
||
|
|
"grad_norm": 0.03955717012286186,
|
||
|
|
"learning_rate": 2.4392278891812457e-06,
|
||
|
|
"loss": 0.002,
|
||
|
|
"num_input_tokens_seen": 1076944,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7910602910602913,
|
||
|
|
"grad_norm": 1.0874260663986206,
|
||
|
|
"learning_rate": 2.430159631914141e-06,
|
||
|
|
"loss": 0.0233,
|
||
|
|
"num_input_tokens_seen": 1078800,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7962577962577964,
|
||
|
|
"grad_norm": 0.6165662407875061,
|
||
|
|
"learning_rate": 2.421092294187037e-06,
|
||
|
|
"loss": 0.1463,
|
||
|
|
"num_input_tokens_seen": 1080912,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8014553014553014,
|
||
|
|
"grad_norm": 0.12875588238239288,
|
||
|
|
"learning_rate": 2.41202599538332e-06,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"num_input_tokens_seen": 1082960,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8066528066528065,
|
||
|
|
"grad_norm": 0.024786395952105522,
|
||
|
|
"learning_rate": 2.402960854872697e-06,
|
||
|
|
"loss": 0.0591,
|
||
|
|
"num_input_tokens_seen": 1085008,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.811850311850312,
|
||
|
|
"grad_norm": 0.05379832535982132,
|
||
|
|
"learning_rate": 2.39389699200963e-06,
|
||
|
|
"loss": 0.0729,
|
||
|
|
"num_input_tokens_seen": 1087184,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.817047817047817,
|
||
|
|
"grad_norm": 0.04001461714506149,
|
||
|
|
"learning_rate": 2.3848345261317523e-06,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"num_input_tokens_seen": 1089104,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.822245322245322,
|
||
|
|
"grad_norm": 0.09780512005090714,
|
||
|
|
"learning_rate": 2.3757735765583083e-06,
|
||
|
|
"loss": 0.1587,
|
||
|
|
"num_input_tokens_seen": 1091024,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8274428274428276,
|
||
|
|
"grad_norm": 0.06699176877737045,
|
||
|
|
"learning_rate": 2.3667142625885774e-06,
|
||
|
|
"loss": 0.0685,
|
||
|
|
"num_input_tokens_seen": 1093008,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8326403326403327,
|
||
|
|
"grad_norm": 0.03752860054373741,
|
||
|
|
"learning_rate": 2.357656703500303e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1094992,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8378378378378377,
|
||
|
|
"grad_norm": 0.09200336039066315,
|
||
|
|
"learning_rate": 2.3486010185481247e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1097040,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8430353430353428,
|
||
|
|
"grad_norm": 0.3943188786506653,
|
||
|
|
"learning_rate": 2.3395473269620055e-06,
|
||
|
|
"loss": 0.1532,
|
||
|
|
"num_input_tokens_seen": 1098960,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8482328482328483,
|
||
|
|
"grad_norm": 0.03474006429314613,
|
||
|
|
"learning_rate": 2.330495747945665e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1101200,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8534303534303533,
|
||
|
|
"grad_norm": 0.25783851742744446,
|
||
|
|
"learning_rate": 2.321446400675005e-06,
|
||
|
|
"loss": 0.1635,
|
||
|
|
"num_input_tokens_seen": 1103120,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.858627858627859,
|
||
|
|
"grad_norm": 27.41282081604004,
|
||
|
|
"learning_rate": 2.3123994042965454e-06,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"num_input_tokens_seen": 1105168,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.863825363825364,
|
||
|
|
"grad_norm": 0.015091204084455967,
|
||
|
|
"learning_rate": 2.3033548779258535e-06,
|
||
|
|
"loss": 0.0463,
|
||
|
|
"num_input_tokens_seen": 1107152,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.869022869022869,
|
||
|
|
"grad_norm": 48.32891845703125,
|
||
|
|
"learning_rate": 2.2943129406459754e-06,
|
||
|
|
"loss": 0.2765,
|
||
|
|
"num_input_tokens_seen": 1109200,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.874220374220374,
|
||
|
|
"grad_norm": 0.029947001487016678,
|
||
|
|
"learning_rate": 2.2852737115058684e-06,
|
||
|
|
"loss": 0.2216,
|
||
|
|
"num_input_tokens_seen": 1111248,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8794178794178795,
|
||
|
|
"grad_norm": 54.1898307800293,
|
||
|
|
"learning_rate": 2.2762373095188344e-06,
|
||
|
|
"loss": 0.1188,
|
||
|
|
"num_input_tokens_seen": 1113232,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8846153846153846,
|
||
|
|
"grad_norm": 10.942605972290039,
|
||
|
|
"learning_rate": 2.2672038536609487e-06,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"num_input_tokens_seen": 1115216,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.88981288981289,
|
||
|
|
"grad_norm": 0.1424887627363205,
|
||
|
|
"learning_rate": 2.2581734628695034e-06,
|
||
|
|
"loss": 0.0011,
|
||
|
|
"num_input_tokens_seen": 1117264,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.895010395010395,
|
||
|
|
"grad_norm": 16.02339744567871,
|
||
|
|
"learning_rate": 2.2491462560414287e-06,
|
||
|
|
"loss": 0.1068,
|
||
|
|
"num_input_tokens_seen": 1119376,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9002079002079,
|
||
|
|
"grad_norm": 1.1276624202728271,
|
||
|
|
"learning_rate": 2.2401223520317363e-06,
|
||
|
|
"loss": 0.1178,
|
||
|
|
"num_input_tokens_seen": 1121424,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9054054054054053,
|
||
|
|
"grad_norm": 13.925151824951172,
|
||
|
|
"learning_rate": 2.2311018696519532e-06,
|
||
|
|
"loss": 0.0582,
|
||
|
|
"num_input_tokens_seen": 1123472,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9106029106029108,
|
||
|
|
"grad_norm": 0.04868851974606514,
|
||
|
|
"learning_rate": 2.2220849276685533e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1125584,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.915800415800416,
|
||
|
|
"grad_norm": 0.11241783946752548,
|
||
|
|
"learning_rate": 2.2130716448014e-06,
|
||
|
|
"loss": 0.0783,
|
||
|
|
"num_input_tokens_seen": 1127568,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.920997920997921,
|
||
|
|
"grad_norm": 0.37742850184440613,
|
||
|
|
"learning_rate": 2.2040621397221762e-06,
|
||
|
|
"loss": 0.0946,
|
||
|
|
"num_input_tokens_seen": 1129552,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9261954261954264,
|
||
|
|
"grad_norm": 0.31173890829086304,
|
||
|
|
"learning_rate": 2.1950565310528264e-06,
|
||
|
|
"loss": 0.0011,
|
||
|
|
"num_input_tokens_seen": 1131472,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9313929313929314,
|
||
|
|
"grad_norm": 0.06812157481908798,
|
||
|
|
"learning_rate": 2.186054937363996e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1133392,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9365904365904365,
|
||
|
|
"grad_norm": 0.7941020727157593,
|
||
|
|
"learning_rate": 2.1770574771734644e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1135440,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9417879417879416,
|
||
|
|
"grad_norm": 23.382444381713867,
|
||
|
|
"learning_rate": 2.168064268944591e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"num_input_tokens_seen": 1137424,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.946985446985447,
|
||
|
|
"grad_norm": 0.1391032636165619,
|
||
|
|
"learning_rate": 2.1590754310847513e-06,
|
||
|
|
"loss": 0.018,
|
||
|
|
"num_input_tokens_seen": 1139408,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.952182952182952,
|
||
|
|
"grad_norm": 0.01357136107981205,
|
||
|
|
"learning_rate": 2.150091081943777e-06,
|
||
|
|
"loss": 0.1722,
|
||
|
|
"num_input_tokens_seen": 1141456,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9573804573804576,
|
||
|
|
"grad_norm": 0.020888514816761017,
|
||
|
|
"learning_rate": 2.141111339812405e-06,
|
||
|
|
"loss": 0.1002,
|
||
|
|
"num_input_tokens_seen": 1143440,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9625779625779627,
|
||
|
|
"grad_norm": 0.11883700639009476,
|
||
|
|
"learning_rate": 2.1321363229207097e-06,
|
||
|
|
"loss": 0.0783,
|
||
|
|
"num_input_tokens_seen": 1145360,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9677754677754677,
|
||
|
|
"grad_norm": 0.4169588088989258,
|
||
|
|
"learning_rate": 2.123166149436556e-06,
|
||
|
|
"loss": 0.1061,
|
||
|
|
"num_input_tokens_seen": 1147280,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.972972972972973,
|
||
|
|
"grad_norm": 0.14435216784477234,
|
||
|
|
"learning_rate": 2.114200937464035e-06,
|
||
|
|
"loss": 0.1705,
|
||
|
|
"num_input_tokens_seen": 1149200,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9781704781704783,
|
||
|
|
"grad_norm": 0.27636605501174927,
|
||
|
|
"learning_rate": 2.1052408050419153e-06,
|
||
|
|
"loss": 0.003,
|
||
|
|
"num_input_tokens_seen": 1151184,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9833679833679834,
|
||
|
|
"grad_norm": 0.32042670249938965,
|
||
|
|
"learning_rate": 2.0962858701420867e-06,
|
||
|
|
"loss": 0.0952,
|
||
|
|
"num_input_tokens_seen": 1153232,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9885654885654884,
|
||
|
|
"grad_norm": 12.104057312011719,
|
||
|
|
"learning_rate": 2.087336250668006e-06,
|
||
|
|
"loss": 0.1992,
|
||
|
|
"num_input_tokens_seen": 1155216,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.993762993762994,
|
||
|
|
"grad_norm": 0.12431977689266205,
|
||
|
|
"learning_rate": 2.0783920644531443e-06,
|
||
|
|
"loss": 0.1408,
|
||
|
|
"num_input_tokens_seen": 1157264,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.998960498960499,
|
||
|
|
"grad_norm": 0.302804559469223,
|
||
|
|
"learning_rate": 2.069453429259439e-06,
|
||
|
|
"loss": 0.2101,
|
||
|
|
"num_input_tokens_seen": 1159312,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.004158004158004,
|
||
|
|
"grad_norm": 0.18154755234718323,
|
||
|
|
"learning_rate": 2.06052046277574e-06,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"num_input_tokens_seen": 1161248,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.006237006237006,
|
||
|
|
"eval_loss": 0.2698093056678772,
|
||
|
|
"eval_runtime": 1.0525,
|
||
|
|
"eval_samples_per_second": 813.316,
|
||
|
|
"eval_steps_per_second": 101.664,
|
||
|
|
"num_input_tokens_seen": 1162016,
|
||
|
|
"step": 2892
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0093555093555096,
|
||
|
|
"grad_norm": 0.132065549492836,
|
||
|
|
"learning_rate": 2.051593282616262e-06,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"num_input_tokens_seen": 1163168,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0145530145530146,
|
||
|
|
"grad_norm": 0.12736886739730835,
|
||
|
|
"learning_rate": 2.0426720063190335e-06,
|
||
|
|
"loss": 0.0559,
|
||
|
|
"num_input_tokens_seen": 1165088,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0197505197505197,
|
||
|
|
"grad_norm": 0.15903866291046143,
|
||
|
|
"learning_rate": 2.0337567513443518e-06,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"num_input_tokens_seen": 1167136,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.024948024948025,
|
||
|
|
"grad_norm": 0.06871409714221954,
|
||
|
|
"learning_rate": 2.0248476350732368e-06,
|
||
|
|
"loss": 0.046,
|
||
|
|
"num_input_tokens_seen": 1169120,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0301455301455302,
|
||
|
|
"grad_norm": 6.423719882965088,
|
||
|
|
"learning_rate": 2.0159447748058803e-06,
|
||
|
|
"loss": 0.0235,
|
||
|
|
"num_input_tokens_seen": 1171040,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0353430353430353,
|
||
|
|
"grad_norm": 0.08531547337770462,
|
||
|
|
"learning_rate": 2.007048287760113e-06,
|
||
|
|
"loss": 0.1135,
|
||
|
|
"num_input_tokens_seen": 1173024,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0405405405405403,
|
||
|
|
"grad_norm": 0.07908215373754501,
|
||
|
|
"learning_rate": 1.998158291069845e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1174944,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.045738045738046,
|
||
|
|
"grad_norm": 15.975863456726074,
|
||
|
|
"learning_rate": 1.989274901783538e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"num_input_tokens_seen": 1177056,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.050935550935551,
|
||
|
|
"grad_norm": 0.03592640534043312,
|
||
|
|
"learning_rate": 1.9803982368626582e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1178976,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.056133056133056,
|
||
|
|
"grad_norm": 0.09219586849212646,
|
||
|
|
"learning_rate": 1.9715284131801353e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1181024,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0613305613305615,
|
||
|
|
"grad_norm": 0.022503485903143883,
|
||
|
|
"learning_rate": 1.9626655475188237e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1183008,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0665280665280665,
|
||
|
|
"grad_norm": 0.01857338473200798,
|
||
|
|
"learning_rate": 1.953809756569971e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1185056,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0717255717255716,
|
||
|
|
"grad_norm": 0.03196537122130394,
|
||
|
|
"learning_rate": 1.9449611569316716e-06,
|
||
|
|
"loss": 0.0623,
|
||
|
|
"num_input_tokens_seen": 1186976,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.076923076923077,
|
||
|
|
"grad_norm": 0.016185106709599495,
|
||
|
|
"learning_rate": 1.936119865107341e-06,
|
||
|
|
"loss": 0.1065,
|
||
|
|
"num_input_tokens_seen": 1188960,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.082120582120582,
|
||
|
|
"grad_norm": 0.03204691782593727,
|
||
|
|
"learning_rate": 1.9272859975041757e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1190944,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.087318087318087,
|
||
|
|
"grad_norm": 0.03665002062916756,
|
||
|
|
"learning_rate": 1.918459670431622e-06,
|
||
|
|
"loss": 0.0381,
|
||
|
|
"num_input_tokens_seen": 1192928,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0925155925155927,
|
||
|
|
"grad_norm": 0.01817925274372101,
|
||
|
|
"learning_rate": 1.9096410000998478e-06,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"num_input_tokens_seen": 1194848,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0977130977130978,
|
||
|
|
"grad_norm": 0.28329595923423767,
|
||
|
|
"learning_rate": 1.9008301026182064e-06,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"num_input_tokens_seen": 1196768,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.102910602910603,
|
||
|
|
"grad_norm": 0.016043463721871376,
|
||
|
|
"learning_rate": 1.892027093993716e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1198688,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.108108108108108,
|
||
|
|
"grad_norm": 0.028100663796067238,
|
||
|
|
"learning_rate": 1.883232090129523e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1200672,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1133056133056134,
|
||
|
|
"grad_norm": 0.06433451920747757,
|
||
|
|
"learning_rate": 1.8744452068233826e-06,
|
||
|
|
"loss": 0.0713,
|
||
|
|
"num_input_tokens_seen": 1202720,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1185031185031185,
|
||
|
|
"grad_norm": 0.017661597579717636,
|
||
|
|
"learning_rate": 1.8656665597661334e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1204768,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1237006237006235,
|
||
|
|
"grad_norm": 0.02003006637096405,
|
||
|
|
"learning_rate": 1.8568962645401702e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1206944,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.128898128898129,
|
||
|
|
"grad_norm": 0.017040062695741653,
|
||
|
|
"learning_rate": 1.8481344366179284e-06,
|
||
|
|
"loss": 0.095,
|
||
|
|
"num_input_tokens_seen": 1209056,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.134095634095634,
|
||
|
|
"grad_norm": 0.030846811830997467,
|
||
|
|
"learning_rate": 1.8393811913603583e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1210976,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.139293139293139,
|
||
|
|
"grad_norm": 0.029971648007631302,
|
||
|
|
"learning_rate": 1.8306366440154067e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1213024,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1444906444906446,
|
||
|
|
"grad_norm": 0.033417243510484695,
|
||
|
|
"learning_rate": 1.8219009097165042e-06,
|
||
|
|
"loss": 0.0302,
|
||
|
|
"num_input_tokens_seen": 1215136,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1496881496881497,
|
||
|
|
"grad_norm": 0.3327726423740387,
|
||
|
|
"learning_rate": 1.8131741034810436e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1217056,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1548856548856548,
|
||
|
|
"grad_norm": 0.030325112864375114,
|
||
|
|
"learning_rate": 1.8044563402088686e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1219168,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1600831600831603,
|
||
|
|
"grad_norm": 0.0346427820622921,
|
||
|
|
"learning_rate": 1.7957477346807622e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1221088,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1652806652806653,
|
||
|
|
"grad_norm": 0.013777323067188263,
|
||
|
|
"learning_rate": 1.7870484015569306e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1223264,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1704781704781704,
|
||
|
|
"grad_norm": 0.006741571240127087,
|
||
|
|
"learning_rate": 1.7783584553755007e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1225440,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.175675675675676,
|
||
|
|
"grad_norm": 0.1481812596321106,
|
||
|
|
"learning_rate": 1.769678010551003e-06,
|
||
|
|
"loss": 0.0864,
|
||
|
|
"num_input_tokens_seen": 1227424,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.180873180873181,
|
||
|
|
"grad_norm": 0.15245255827903748,
|
||
|
|
"learning_rate": 1.7610071813728741e-06,
|
||
|
|
"loss": 0.0793,
|
||
|
|
"num_input_tokens_seen": 1229344,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.186070686070686,
|
||
|
|
"grad_norm": 6.337021827697754,
|
||
|
|
"learning_rate": 1.7523460820039466e-06,
|
||
|
|
"loss": 0.0974,
|
||
|
|
"num_input_tokens_seen": 1231456,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1912681912681915,
|
||
|
|
"grad_norm": 0.27218034863471985,
|
||
|
|
"learning_rate": 1.7436948264789465e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1233440,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1964656964656966,
|
||
|
|
"grad_norm": 0.02221021242439747,
|
||
|
|
"learning_rate": 1.7350535287029957e-06,
|
||
|
|
"loss": 0.0779,
|
||
|
|
"num_input_tokens_seen": 1235552,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2016632016632016,
|
||
|
|
"grad_norm": 0.04011327400803566,
|
||
|
|
"learning_rate": 1.7264223024501064e-06,
|
||
|
|
"loss": 0.152,
|
||
|
|
"num_input_tokens_seen": 1237536,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2068607068607067,
|
||
|
|
"grad_norm": 0.02025497704744339,
|
||
|
|
"learning_rate": 1.717801261361685e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1239584,
|
||
|
|
"step": 3085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.212058212058212,
|
||
|
|
"grad_norm": 1.1895182132720947,
|
||
|
|
"learning_rate": 1.7091905189450425e-06,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"num_input_tokens_seen": 1241504,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2172557172557172,
|
||
|
|
"grad_norm": 251.4841766357422,
|
||
|
|
"learning_rate": 1.700590188571887e-06,
|
||
|
|
"loss": 0.0375,
|
||
|
|
"num_input_tokens_seen": 1243552,
|
||
|
|
"step": 3095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2224532224532223,
|
||
|
|
"grad_norm": 0.022575953975319862,
|
||
|
|
"learning_rate": 1.6920003834768438e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1245600,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.227650727650728,
|
||
|
|
"grad_norm": 0.019538020715117455,
|
||
|
|
"learning_rate": 1.6834212167559578e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1247712,
|
||
|
|
"step": 3105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.232848232848233,
|
||
|
|
"grad_norm": 102.0543212890625,
|
||
|
|
"learning_rate": 1.6748528013652032e-06,
|
||
|
|
"loss": 0.031,
|
||
|
|
"num_input_tokens_seen": 1249696,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.238045738045738,
|
||
|
|
"grad_norm": 0.1734342724084854,
|
||
|
|
"learning_rate": 1.6662952501190032e-06,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"num_input_tokens_seen": 1251808,
|
||
|
|
"step": 3115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2432432432432434,
|
||
|
|
"grad_norm": 106.23870086669922,
|
||
|
|
"learning_rate": 1.6577486756887376e-06,
|
||
|
|
"loss": 0.0462,
|
||
|
|
"num_input_tokens_seen": 1253728,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2484407484407485,
|
||
|
|
"grad_norm": 0.021452955901622772,
|
||
|
|
"learning_rate": 1.6492131906012608e-06,
|
||
|
|
"loss": 0.0289,
|
||
|
|
"num_input_tokens_seen": 1255840,
|
||
|
|
"step": 3125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2536382536382535,
|
||
|
|
"grad_norm": 0.07759949564933777,
|
||
|
|
"learning_rate": 1.640688907237425e-06,
|
||
|
|
"loss": 0.0469,
|
||
|
|
"num_input_tokens_seen": 1257888,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2567567567567566,
|
||
|
|
"eval_loss": 0.36025160551071167,
|
||
|
|
"eval_runtime": 1.086,
|
||
|
|
"eval_samples_per_second": 788.217,
|
||
|
|
"eval_steps_per_second": 98.527,
|
||
|
|
"num_input_tokens_seen": 1259168,
|
||
|
|
"step": 3133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.258835758835759,
|
||
|
|
"grad_norm": 17.87605857849121,
|
||
|
|
"learning_rate": 1.632175937830594e-06,
|
||
|
|
"loss": 0.0767,
|
||
|
|
"num_input_tokens_seen": 1259936,
|
||
|
|
"step": 3135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.264033264033264,
|
||
|
|
"grad_norm": 0.020596951246261597,
|
||
|
|
"learning_rate": 1.6236743944651703e-06,
|
||
|
|
"loss": 0.0504,
|
||
|
|
"num_input_tokens_seen": 1262112,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.269230769230769,
|
||
|
|
"grad_norm": 20.961088180541992,
|
||
|
|
"learning_rate": 1.6151843890751172e-06,
|
||
|
|
"loss": 0.0185,
|
||
|
|
"num_input_tokens_seen": 1263904,
|
||
|
|
"step": 3145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.274428274428274,
|
||
|
|
"grad_norm": 0.048750557005405426,
|
||
|
|
"learning_rate": 1.6067060334424836e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"num_input_tokens_seen": 1265952,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2796257796257797,
|
||
|
|
"grad_norm": 0.010466455481946468,
|
||
|
|
"learning_rate": 1.5982394391959382e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1267872,
|
||
|
|
"step": 3155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.284823284823285,
|
||
|
|
"grad_norm": 70.06320190429688,
|
||
|
|
"learning_rate": 1.5897847178092902e-06,
|
||
|
|
"loss": 0.0937,
|
||
|
|
"num_input_tokens_seen": 1269792,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.29002079002079,
|
||
|
|
"grad_norm": 1.3579819202423096,
|
||
|
|
"learning_rate": 1.5813419806000329e-06,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"num_input_tokens_seen": 1271776,
|
||
|
|
"step": 3165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2952182952182953,
|
||
|
|
"grad_norm": 0.01165761612355709,
|
||
|
|
"learning_rate": 1.5729113387278675e-06,
|
||
|
|
"loss": 0.0785,
|
||
|
|
"num_input_tokens_seen": 1273760,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3004158004158004,
|
||
|
|
"grad_norm": 0.04458131268620491,
|
||
|
|
"learning_rate": 1.5644929031932455e-06,
|
||
|
|
"loss": 0.1213,
|
||
|
|
"num_input_tokens_seen": 1275808,
|
||
|
|
"step": 3175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3056133056133055,
|
||
|
|
"grad_norm": 0.052004504948854446,
|
||
|
|
"learning_rate": 1.556086784835908e-06,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"num_input_tokens_seen": 1277792,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.310810810810811,
|
||
|
|
"grad_norm": 0.08415602892637253,
|
||
|
|
"learning_rate": 1.547693094333421e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1279776,
|
||
|
|
"step": 3185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.316008316008316,
|
||
|
|
"grad_norm": 38.53419494628906,
|
||
|
|
"learning_rate": 1.5393119421997252e-06,
|
||
|
|
"loss": 0.1482,
|
||
|
|
"num_input_tokens_seen": 1281760,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.321205821205821,
|
||
|
|
"grad_norm": 0.08246491849422455,
|
||
|
|
"learning_rate": 1.5309434387836737e-06,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"num_input_tokens_seen": 1283744,
|
||
|
|
"step": 3195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3264033264033266,
|
||
|
|
"grad_norm": 0.12744662165641785,
|
||
|
|
"learning_rate": 1.5225876942675844e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1285792,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3316008316008316,
|
||
|
|
"grad_norm": 19.751220703125,
|
||
|
|
"learning_rate": 1.514244818665788e-06,
|
||
|
|
"loss": 0.0525,
|
||
|
|
"num_input_tokens_seen": 1287776,
|
||
|
|
"step": 3205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3367983367983367,
|
||
|
|
"grad_norm": 0.04199739545583725,
|
||
|
|
"learning_rate": 1.505914921823178e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1289696,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3419958419958418,
|
||
|
|
"grad_norm": 0.19491150975227356,
|
||
|
|
"learning_rate": 1.497598113413766e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_input_tokens_seen": 1291680,
|
||
|
|
"step": 3215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3471933471933473,
|
||
|
|
"grad_norm": 0.019337935373187065,
|
||
|
|
"learning_rate": 1.489294502939238e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1293536,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3523908523908523,
|
||
|
|
"grad_norm": 0.01680462807416916,
|
||
|
|
"learning_rate": 1.4810041997275094e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1295712,
|
||
|
|
"step": 3225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.357588357588358,
|
||
|
|
"grad_norm": 0.025287525728344917,
|
||
|
|
"learning_rate": 1.4727273129312918e-06,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_input_tokens_seen": 1297760,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.362785862785863,
|
||
|
|
"grad_norm": 0.015904569998383522,
|
||
|
|
"learning_rate": 1.4644639515266484e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1299808,
|
||
|
|
"step": 3235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.367983367983368,
|
||
|
|
"grad_norm": 0.02150922454893589,
|
||
|
|
"learning_rate": 1.4562142243115646e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1301920,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.373180873180873,
|
||
|
|
"grad_norm": 0.014152280054986477,
|
||
|
|
"learning_rate": 1.4479782399045152e-06,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"num_input_tokens_seen": 1303904,
|
||
|
|
"step": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3783783783783785,
|
||
|
|
"grad_norm": 0.002619291888549924,
|
||
|
|
"learning_rate": 1.43975610674303e-06,
|
||
|
|
"loss": 0.0424,
|
||
|
|
"num_input_tokens_seen": 1305888,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3835758835758836,
|
||
|
|
"grad_norm": 16.205442428588867,
|
||
|
|
"learning_rate": 1.4315479330822711e-06,
|
||
|
|
"loss": 0.1061,
|
||
|
|
"num_input_tokens_seen": 1308064,
|
||
|
|
"step": 3255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3887733887733886,
|
||
|
|
"grad_norm": 0.017430748790502548,
|
||
|
|
"learning_rate": 1.4233538269936042e-06,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"num_input_tokens_seen": 1310048,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.393970893970894,
|
||
|
|
"grad_norm": 0.022695308551192284,
|
||
|
|
"learning_rate": 1.415173896363178e-06,
|
||
|
|
"loss": 0.1162,
|
||
|
|
"num_input_tokens_seen": 1311968,
|
||
|
|
"step": 3265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.399168399168399,
|
||
|
|
"grad_norm": 0.009034757502377033,
|
||
|
|
"learning_rate": 1.4070082488905034e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1313888,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4043659043659042,
|
||
|
|
"grad_norm": 0.014441024512052536,
|
||
|
|
"learning_rate": 1.3988569920870315e-06,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"num_input_tokens_seen": 1316064,
|
||
|
|
"step": 3275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4095634095634098,
|
||
|
|
"grad_norm": 0.019770730286836624,
|
||
|
|
"learning_rate": 1.3907202332747454e-06,
|
||
|
|
"loss": 0.0011,
|
||
|
|
"num_input_tokens_seen": 1318112,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.414760914760915,
|
||
|
|
"grad_norm": 0.18654315173625946,
|
||
|
|
"learning_rate": 1.3825980795847401e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1319968,
|
||
|
|
"step": 3285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.41995841995842,
|
||
|
|
"grad_norm": 0.016684433445334435,
|
||
|
|
"learning_rate": 1.3744906379558165e-06,
|
||
|
|
"loss": 0.038,
|
||
|
|
"num_input_tokens_seen": 1322016,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4251559251559254,
|
||
|
|
"grad_norm": 0.012927900068461895,
|
||
|
|
"learning_rate": 1.3663980151330734e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_input_tokens_seen": 1323936,
|
||
|
|
"step": 3295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4303534303534304,
|
||
|
|
"grad_norm": 0.034205105155706406,
|
||
|
|
"learning_rate": 1.358320317666496e-06,
|
||
|
|
"loss": 0.0241,
|
||
|
|
"num_input_tokens_seen": 1325920,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4355509355509355,
|
||
|
|
"grad_norm": 0.016045430675148964,
|
||
|
|
"learning_rate": 1.350257651909562e-06,
|
||
|
|
"loss": 0.0668,
|
||
|
|
"num_input_tokens_seen": 1327840,
|
||
|
|
"step": 3305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4407484407484406,
|
||
|
|
"grad_norm": 0.01693640649318695,
|
||
|
|
"learning_rate": 1.3422101240178365e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1329760,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.445945945945946,
|
||
|
|
"grad_norm": 0.12216309458017349,
|
||
|
|
"learning_rate": 1.3341778399475714e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1331744,
|
||
|
|
"step": 3315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.451143451143451,
|
||
|
|
"grad_norm": 72.96051025390625,
|
||
|
|
"learning_rate": 1.3261609054543178e-06,
|
||
|
|
"loss": 0.0278,
|
||
|
|
"num_input_tokens_seen": 1333792,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.456340956340956,
|
||
|
|
"grad_norm": 0.03497939929366112,
|
||
|
|
"learning_rate": 1.3181594260915263e-06,
|
||
|
|
"loss": 0.0412,
|
||
|
|
"num_input_tokens_seen": 1335776,
|
||
|
|
"step": 3325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4615384615384617,
|
||
|
|
"grad_norm": 0.28867605328559875,
|
||
|
|
"learning_rate": 1.3101735072091624e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1337824,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4667359667359667,
|
||
|
|
"grad_norm": 0.010723100043833256,
|
||
|
|
"learning_rate": 1.3022032539523177e-06,
|
||
|
|
"loss": 0.0311,
|
||
|
|
"num_input_tokens_seen": 1339872,
|
||
|
|
"step": 3335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.471933471933472,
|
||
|
|
"grad_norm": 0.06669893115758896,
|
||
|
|
"learning_rate": 1.2942487712598234e-06,
|
||
|
|
"loss": 0.0937,
|
||
|
|
"num_input_tokens_seen": 1341920,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4771309771309773,
|
||
|
|
"grad_norm": 41.53193664550781,
|
||
|
|
"learning_rate": 1.2863101638628716e-06,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"num_input_tokens_seen": 1343904,
|
||
|
|
"step": 3345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4823284823284824,
|
||
|
|
"grad_norm": 0.002587628783658147,
|
||
|
|
"learning_rate": 1.2783875362836373e-06,
|
||
|
|
"loss": 0.0738,
|
||
|
|
"num_input_tokens_seen": 1345952,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4875259875259874,
|
||
|
|
"grad_norm": 0.01352632511407137,
|
||
|
|
"learning_rate": 1.2704809928338957e-06,
|
||
|
|
"loss": 0.0394,
|
||
|
|
"num_input_tokens_seen": 1348128,
|
||
|
|
"step": 3355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.492723492723493,
|
||
|
|
"grad_norm": 0.01383188832551241,
|
||
|
|
"learning_rate": 1.2625906376136582e-06,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"num_input_tokens_seen": 1350048,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.497920997920998,
|
||
|
|
"grad_norm": 0.03481636196374893,
|
||
|
|
"learning_rate": 1.2547165745097927e-06,
|
||
|
|
"loss": 0.1121,
|
||
|
|
"num_input_tokens_seen": 1351968,
|
||
|
|
"step": 3365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.503118503118503,
|
||
|
|
"grad_norm": 0.007815233431756496,
|
||
|
|
"learning_rate": 1.2468589071946632e-06,
|
||
|
|
"loss": 0.0682,
|
||
|
|
"num_input_tokens_seen": 1353952,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.507276507276507,
|
||
|
|
"eval_loss": 0.4127735495567322,
|
||
|
|
"eval_runtime": 1.0486,
|
||
|
|
"eval_samples_per_second": 816.351,
|
||
|
|
"eval_steps_per_second": 102.044,
|
||
|
|
"num_input_tokens_seen": 1355552,
|
||
|
|
"step": 3374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.508316008316008,
|
||
|
|
"grad_norm": 0.027879195287823677,
|
||
|
|
"learning_rate": 1.2390177391247616e-06,
|
||
|
|
"loss": 0.0726,
|
||
|
|
"num_input_tokens_seen": 1356000,
|
||
|
|
"step": 3375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5135135135135136,
|
||
|
|
"grad_norm": 26.8095760345459,
|
||
|
|
"learning_rate": 1.2311931735393417e-06,
|
||
|
|
"loss": 0.1161,
|
||
|
|
"num_input_tokens_seen": 1357984,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5187110187110187,
|
||
|
|
"grad_norm": 0.03682544827461243,
|
||
|
|
"learning_rate": 1.2233853134590698e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1359904,
|
||
|
|
"step": 3385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.523908523908524,
|
||
|
|
"grad_norm": 0.015444116666913033,
|
||
|
|
"learning_rate": 1.2155942616846562e-06,
|
||
|
|
"loss": 0.0385,
|
||
|
|
"num_input_tokens_seen": 1361952,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.529106029106029,
|
||
|
|
"grad_norm": 0.0405961312353611,
|
||
|
|
"learning_rate": 1.2078201207955122e-06,
|
||
|
|
"loss": 0.1318,
|
||
|
|
"num_input_tokens_seen": 1364000,
|
||
|
|
"step": 3395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5343035343035343,
|
||
|
|
"grad_norm": 0.05229797586798668,
|
||
|
|
"learning_rate": 1.2000629931483947e-06,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_input_tokens_seen": 1366112,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5395010395010393,
|
||
|
|
"grad_norm": 0.09235959500074387,
|
||
|
|
"learning_rate": 1.1923229808760565e-06,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"num_input_tokens_seen": 1368096,
|
||
|
|
"step": 3405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.544698544698545,
|
||
|
|
"grad_norm": 0.044506847858428955,
|
||
|
|
"learning_rate": 1.1846001858859054e-06,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"num_input_tokens_seen": 1370208,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.54989604989605,
|
||
|
|
"grad_norm": 0.27768710255622864,
|
||
|
|
"learning_rate": 1.1768947098586628e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1372192,
|
||
|
|
"step": 3415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.555093555093555,
|
||
|
|
"grad_norm": 0.05268951505422592,
|
||
|
|
"learning_rate": 1.1692066542470202e-06,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"num_input_tokens_seen": 1374240,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5602910602910605,
|
||
|
|
"grad_norm": 0.04955555871129036,
|
||
|
|
"learning_rate": 1.1615361202743088e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1376160,
|
||
|
|
"step": 3425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5654885654885655,
|
||
|
|
"grad_norm": 0.020622428506612778,
|
||
|
|
"learning_rate": 1.1538832089331628e-06,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_input_tokens_seen": 1378208,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5706860706860706,
|
||
|
|
"grad_norm": 0.04335801303386688,
|
||
|
|
"learning_rate": 1.1462480209841928e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1380192,
|
||
|
|
"step": 3435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5758835758835756,
|
||
|
|
"grad_norm": 0.053323231637477875,
|
||
|
|
"learning_rate": 1.1386306569546578e-06,
|
||
|
|
"loss": 0.0491,
|
||
|
|
"num_input_tokens_seen": 1382368,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.581081081081081,
|
||
|
|
"grad_norm": 0.09630803763866425,
|
||
|
|
"learning_rate": 1.1310312171371394e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1384608,
|
||
|
|
"step": 3445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.586278586278586,
|
||
|
|
"grad_norm": 0.24890074133872986,
|
||
|
|
"learning_rate": 1.123449801588226e-06,
|
||
|
|
"loss": 0.1426,
|
||
|
|
"num_input_tokens_seen": 1386592,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5914760914760917,
|
||
|
|
"grad_norm": 51.86346435546875,
|
||
|
|
"learning_rate": 1.1158865101271906e-06,
|
||
|
|
"loss": 0.098,
|
||
|
|
"num_input_tokens_seen": 1388448,
|
||
|
|
"step": 3455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5966735966735968,
|
||
|
|
"grad_norm": 0.017590023577213287,
|
||
|
|
"learning_rate": 1.1083414423346807e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1390560,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.601871101871102,
|
||
|
|
"grad_norm": 0.01884053274989128,
|
||
|
|
"learning_rate": 1.100814697551406e-06,
|
||
|
|
"loss": 0.0977,
|
||
|
|
"num_input_tokens_seen": 1392736,
|
||
|
|
"step": 3465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.607068607068607,
|
||
|
|
"grad_norm": 173.05203247070312,
|
||
|
|
"learning_rate": 1.0933063748768254e-06,
|
||
|
|
"loss": 0.1036,
|
||
|
|
"num_input_tokens_seen": 1394720,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6122661122661124,
|
||
|
|
"grad_norm": 0.04371850937604904,
|
||
|
|
"learning_rate": 1.0858165731678514e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1396640,
|
||
|
|
"step": 3475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6174636174636174,
|
||
|
|
"grad_norm": 78.75630187988281,
|
||
|
|
"learning_rate": 1.0783453910375423e-06,
|
||
|
|
"loss": 0.0528,
|
||
|
|
"num_input_tokens_seen": 1398752,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6226611226611225,
|
||
|
|
"grad_norm": 17.215036392211914,
|
||
|
|
"learning_rate": 1.0708929268538034e-06,
|
||
|
|
"loss": 0.0787,
|
||
|
|
"num_input_tokens_seen": 1400800,
|
||
|
|
"step": 3485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.627858627858628,
|
||
|
|
"grad_norm": 0.05456389859318733,
|
||
|
|
"learning_rate": 1.0634592787380964e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1402720,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.633056133056133,
|
||
|
|
"grad_norm": 0.06369329243898392,
|
||
|
|
"learning_rate": 1.0560445445641423e-06,
|
||
|
|
"loss": 0.0827,
|
||
|
|
"num_input_tokens_seen": 1404704,
|
||
|
|
"step": 3495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.638253638253638,
|
||
|
|
"grad_norm": 0.02703475020825863,
|
||
|
|
"learning_rate": 1.048648821956637e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1406560,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.643451143451143,
|
||
|
|
"grad_norm": 0.0234812144190073,
|
||
|
|
"learning_rate": 1.0412722082899647e-06,
|
||
|
|
"loss": 0.0586,
|
||
|
|
"num_input_tokens_seen": 1408544,
|
||
|
|
"step": 3505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6486486486486487,
|
||
|
|
"grad_norm": 0.03321904316544533,
|
||
|
|
"learning_rate": 1.033914800686912e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1410464,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6538461538461537,
|
||
|
|
"grad_norm": 0.021713286638259888,
|
||
|
|
"learning_rate": 1.0265766960173964e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1412448,
|
||
|
|
"step": 3515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6590436590436592,
|
||
|
|
"grad_norm": 19.148372650146484,
|
||
|
|
"learning_rate": 1.019257990897185e-06,
|
||
|
|
"loss": 0.042,
|
||
|
|
"num_input_tokens_seen": 1414688,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6642411642411643,
|
||
|
|
"grad_norm": 13.719977378845215,
|
||
|
|
"learning_rate": 1.0119587816866258e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"num_input_tokens_seen": 1416672,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6694386694386694,
|
||
|
|
"grad_norm": 0.012155055068433285,
|
||
|
|
"learning_rate": 1.0046791644893757e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1418592,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6746361746361744,
|
||
|
|
"grad_norm": 0.015267685987055302,
|
||
|
|
"learning_rate": 9.97419235151137e-07,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1420576,
|
||
|
|
"step": 3535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.67983367983368,
|
||
|
|
"grad_norm": 0.4185558259487152,
|
||
|
|
"learning_rate": 9.901790892583973e-07,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1422560,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.685031185031185,
|
||
|
|
"grad_norm": 0.01660173013806343,
|
||
|
|
"learning_rate": 9.829588221371694e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1424608,
|
||
|
|
"step": 3545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6902286902286905,
|
||
|
|
"grad_norm": 0.06823495030403137,
|
||
|
|
"learning_rate": 9.757585288517329e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1426784,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6954261954261955,
|
||
|
|
"grad_norm": 0.010435913689434528,
|
||
|
|
"learning_rate": 9.6857830420339e-07,
|
||
|
|
"loss": 0.0507,
|
||
|
|
"num_input_tokens_seen": 1428896,
|
||
|
|
"step": 3555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7006237006237006,
|
||
|
|
"grad_norm": 0.03763195872306824,
|
||
|
|
"learning_rate": 9.614182427292076e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1430880,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7058212058212057,
|
||
|
|
"grad_norm": 0.07442791014909744,
|
||
|
|
"learning_rate": 9.54278438700785e-07,
|
||
|
|
"loss": 0.0706,
|
||
|
|
"num_input_tokens_seen": 1432864,
|
||
|
|
"step": 3565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.711018711018711,
|
||
|
|
"grad_norm": 13.558998107910156,
|
||
|
|
"learning_rate": 9.471589861229999e-07,
|
||
|
|
"loss": 0.0558,
|
||
|
|
"num_input_tokens_seen": 1434912,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7162162162162162,
|
||
|
|
"grad_norm": 0.03634670376777649,
|
||
|
|
"learning_rate": 9.400599787327774e-07,
|
||
|
|
"loss": 0.0451,
|
||
|
|
"num_input_tokens_seen": 1436832,
|
||
|
|
"step": 3575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7214137214137213,
|
||
|
|
"grad_norm": 0.015272362157702446,
|
||
|
|
"learning_rate": 9.329815099978567e-07,
|
||
|
|
"loss": 0.0456,
|
||
|
|
"num_input_tokens_seen": 1438752,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.726611226611227,
|
||
|
|
"grad_norm": 0.06222844123840332,
|
||
|
|
"learning_rate": 9.259236731155583e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1440672,
|
||
|
|
"step": 3585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.731808731808732,
|
||
|
|
"grad_norm": 0.31334197521209717,
|
||
|
|
"learning_rate": 9.188865610115572e-07,
|
||
|
|
"loss": 0.0311,
|
||
|
|
"num_input_tokens_seen": 1442784,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.737006237006237,
|
||
|
|
"grad_norm": 51.054107666015625,
|
||
|
|
"learning_rate": 9.118702663386583e-07,
|
||
|
|
"loss": 0.0596,
|
||
|
|
"num_input_tokens_seen": 1444960,
|
||
|
|
"step": 3595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.742203742203742,
|
||
|
|
"grad_norm": 33.01020431518555,
|
||
|
|
"learning_rate": 9.048748814755783e-07,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"num_input_tokens_seen": 1446880,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7474012474012475,
|
||
|
|
"grad_norm": 0.032987091690301895,
|
||
|
|
"learning_rate": 8.979004985257294e-07,
|
||
|
|
"loss": 0.0394,
|
||
|
|
"num_input_tokens_seen": 1448992,
|
||
|
|
"step": 3605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7525987525987525,
|
||
|
|
"grad_norm": 0.09595970064401627,
|
||
|
|
"learning_rate": 8.909472093160066e-07,
|
||
|
|
"loss": 0.0295,
|
||
|
|
"num_input_tokens_seen": 1450976,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.757796257796258,
|
||
|
|
"grad_norm": 1.557525396347046,
|
||
|
|
"learning_rate": 8.840151053955773e-07,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"num_input_tokens_seen": 1453088,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.757796257796258,
|
||
|
|
"eval_loss": 0.36968719959259033,
|
||
|
|
"eval_runtime": 1.2334,
|
||
|
|
"eval_samples_per_second": 694.025,
|
||
|
|
"eval_steps_per_second": 86.753,
|
||
|
|
"num_input_tokens_seen": 1453088,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.762993762993763,
|
||
|
|
"grad_norm": 0.020010627806186676,
|
||
|
|
"learning_rate": 8.771042780346767e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"num_input_tokens_seen": 1455136,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.768191268191268,
|
||
|
|
"grad_norm": 12.859967231750488,
|
||
|
|
"learning_rate": 8.702148182234043e-07,
|
||
|
|
"loss": 0.1087,
|
||
|
|
"num_input_tokens_seen": 1457120,
|
||
|
|
"step": 3625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.773388773388773,
|
||
|
|
"grad_norm": 0.03449089452624321,
|
||
|
|
"learning_rate": 8.633468166705336e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1459168,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7785862785862787,
|
||
|
|
"grad_norm": 0.013074683956801891,
|
||
|
|
"learning_rate": 8.565003638023065e-07,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"num_input_tokens_seen": 1461152,
|
||
|
|
"step": 3635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7837837837837838,
|
||
|
|
"grad_norm": 0.00507075572386384,
|
||
|
|
"learning_rate": 8.496755497612491e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1463136,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.788981288981289,
|
||
|
|
"grad_norm": 0.010262695141136646,
|
||
|
|
"learning_rate": 8.42872464404986e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1465120,
|
||
|
|
"step": 3645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7941787941787943,
|
||
|
|
"grad_norm": 4.041860103607178,
|
||
|
|
"learning_rate": 8.360911973050537e-07,
|
||
|
|
"loss": 0.0322,
|
||
|
|
"num_input_tokens_seen": 1467104,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7993762993762994,
|
||
|
|
"grad_norm": 0.005001334939152002,
|
||
|
|
"learning_rate": 8.29331837745724e-07,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1469152,
|
||
|
|
"step": 3655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8045738045738045,
|
||
|
|
"grad_norm": 17.126569747924805,
|
||
|
|
"learning_rate": 8.225944747228257e-07,
|
||
|
|
"loss": 0.1215,
|
||
|
|
"num_input_tokens_seen": 1471264,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8097713097713095,
|
||
|
|
"grad_norm": 0.0037782315630465746,
|
||
|
|
"learning_rate": 8.158791969425739e-07,
|
||
|
|
"loss": 0.0868,
|
||
|
|
"num_input_tokens_seen": 1473248,
|
||
|
|
"step": 3665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.814968814968815,
|
||
|
|
"grad_norm": 0.027992993593215942,
|
||
|
|
"learning_rate": 8.091860928204048e-07,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_input_tokens_seen": 1475360,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.82016632016632,
|
||
|
|
"grad_norm": 0.006942141801118851,
|
||
|
|
"learning_rate": 8.025152504798078e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1477472,
|
||
|
|
"step": 3675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8253638253638256,
|
||
|
|
"grad_norm": 19.416587829589844,
|
||
|
|
"learning_rate": 7.958667577511684e-07,
|
||
|
|
"loss": 0.0912,
|
||
|
|
"num_input_tokens_seen": 1479328,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8305613305613306,
|
||
|
|
"grad_norm": 0.010084366425871849,
|
||
|
|
"learning_rate": 7.892407021706064e-07,
|
||
|
|
"loss": 0.0447,
|
||
|
|
"num_input_tokens_seen": 1481248,
|
||
|
|
"step": 3685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8357588357588357,
|
||
|
|
"grad_norm": 0.02589116431772709,
|
||
|
|
"learning_rate": 7.826371709788314e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1483168,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8409563409563408,
|
||
|
|
"grad_norm": 0.12098560482263565,
|
||
|
|
"learning_rate": 7.760562511199881e-07,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1485152,
|
||
|
|
"step": 3695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8461538461538463,
|
||
|
|
"grad_norm": 0.0336734913289547,
|
||
|
|
"learning_rate": 7.694980292405122e-07,
|
||
|
|
"loss": 0.0407,
|
||
|
|
"num_input_tokens_seen": 1487200,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8513513513513513,
|
||
|
|
"grad_norm": 0.08973251283168793,
|
||
|
|
"learning_rate": 7.629625916879932e-07,
|
||
|
|
"loss": 0.0294,
|
||
|
|
"num_input_tokens_seen": 1489184,
|
||
|
|
"step": 3705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.856548856548857,
|
||
|
|
"grad_norm": 17.128236770629883,
|
||
|
|
"learning_rate": 7.564500245100326e-07,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"num_input_tokens_seen": 1491168,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.861746361746362,
|
||
|
|
"grad_norm": 0.03917059302330017,
|
||
|
|
"learning_rate": 7.49960413453115e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1493216,
|
||
|
|
"step": 3715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.866943866943867,
|
||
|
|
"grad_norm": 0.022577917203307152,
|
||
|
|
"learning_rate": 7.434938439614781e-07,
|
||
|
|
"loss": 0.0738,
|
||
|
|
"num_input_tokens_seen": 1495200,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.872141372141372,
|
||
|
|
"grad_norm": 0.04259275645017624,
|
||
|
|
"learning_rate": 7.370504011759855e-07,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"num_input_tokens_seen": 1497184,
|
||
|
|
"step": 3725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8773388773388775,
|
||
|
|
"grad_norm": 39.698997497558594,
|
||
|
|
"learning_rate": 7.306301699330065e-07,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"num_input_tokens_seen": 1499040,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8825363825363826,
|
||
|
|
"grad_norm": 21.861370086669922,
|
||
|
|
"learning_rate": 7.242332347633052e-07,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"num_input_tokens_seen": 1501024,
|
||
|
|
"step": 3735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8877338877338876,
|
||
|
|
"grad_norm": 0.0236463975161314,
|
||
|
|
"learning_rate": 7.17859679890916e-07,
|
||
|
|
"loss": 0.042,
|
||
|
|
"num_input_tokens_seen": 1503072,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.892931392931393,
|
||
|
|
"grad_norm": 0.09350544959306717,
|
||
|
|
"learning_rate": 7.115095892320456e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1505248,
|
||
|
|
"step": 3745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.898128898128898,
|
||
|
|
"grad_norm": 0.004034217447042465,
|
||
|
|
"learning_rate": 7.051830463939605e-07,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"num_input_tokens_seen": 1507296,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9033264033264032,
|
||
|
|
"grad_norm": 0.026631083339452744,
|
||
|
|
"learning_rate": 6.988801346738911e-07,
|
||
|
|
"loss": 0.0226,
|
||
|
|
"num_input_tokens_seen": 1509344,
|
||
|
|
"step": 3755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9085239085239083,
|
||
|
|
"grad_norm": 0.008157435804605484,
|
||
|
|
"learning_rate": 6.926009370579334e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1511456,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.913721413721414,
|
||
|
|
"grad_norm": 72.86700439453125,
|
||
|
|
"learning_rate": 6.863455362199542e-07,
|
||
|
|
"loss": 0.0235,
|
||
|
|
"num_input_tokens_seen": 1513440,
|
||
|
|
"step": 3765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.918918918918919,
|
||
|
|
"grad_norm": 0.05969979614019394,
|
||
|
|
"learning_rate": 6.801140145205071e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1515488,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9241164241164244,
|
||
|
|
"grad_norm": 4.924336910247803,
|
||
|
|
"learning_rate": 6.739064540057425e-07,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"num_input_tokens_seen": 1517408,
|
||
|
|
"step": 3775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9293139293139294,
|
||
|
|
"grad_norm": 0.07060942053794861,
|
||
|
|
"learning_rate": 6.677229364063329e-07,
|
||
|
|
"loss": 0.0335,
|
||
|
|
"num_input_tokens_seen": 1519392,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9345114345114345,
|
||
|
|
"grad_norm": 0.025277776643633842,
|
||
|
|
"learning_rate": 6.615635431363943e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1521440,
|
||
|
|
"step": 3785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9397089397089395,
|
||
|
|
"grad_norm": 22.37493896484375,
|
||
|
|
"learning_rate": 6.554283552924118e-07,
|
||
|
|
"loss": 0.0844,
|
||
|
|
"num_input_tokens_seen": 1523488,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.944906444906445,
|
||
|
|
"grad_norm": 0.008414591662585735,
|
||
|
|
"learning_rate": 6.493174536521768e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1525600,
|
||
|
|
"step": 3795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.95010395010395,
|
||
|
|
"grad_norm": 4.057095527648926,
|
||
|
|
"learning_rate": 6.43230918673721e-07,
|
||
|
|
"loss": 0.0715,
|
||
|
|
"num_input_tokens_seen": 1527584,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.955301455301455,
|
||
|
|
"grad_norm": 0.2397640198469162,
|
||
|
|
"learning_rate": 6.371688304942544e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1529504,
|
||
|
|
"step": 3805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9604989604989607,
|
||
|
|
"grad_norm": 0.024253297597169876,
|
||
|
|
"learning_rate": 6.311312689291166e-07,
|
||
|
|
"loss": 0.0805,
|
||
|
|
"num_input_tokens_seen": 1531424,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9656964656964657,
|
||
|
|
"grad_norm": 0.006427168846130371,
|
||
|
|
"learning_rate": 6.251183134707183e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1533408,
|
||
|
|
"step": 3815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.970893970893971,
|
||
|
|
"grad_norm": 22.389490127563477,
|
||
|
|
"learning_rate": 6.191300432875017e-07,
|
||
|
|
"loss": 0.1432,
|
||
|
|
"num_input_tokens_seen": 1535392,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.976091476091476,
|
||
|
|
"grad_norm": 42.83168029785156,
|
||
|
|
"learning_rate": 6.13166537222894e-07,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"num_input_tokens_seen": 1537312,
|
||
|
|
"step": 3825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9812889812889813,
|
||
|
|
"grad_norm": 34.80426788330078,
|
||
|
|
"learning_rate": 6.072278737942691e-07,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"num_input_tokens_seen": 1539360,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9864864864864864,
|
||
|
|
"grad_norm": 0.005531808827072382,
|
||
|
|
"learning_rate": 6.013141311919168e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"num_input_tokens_seen": 1541280,
|
||
|
|
"step": 3835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.991683991683992,
|
||
|
|
"grad_norm": 0.09399595111608505,
|
||
|
|
"learning_rate": 5.954253872780102e-07,
|
||
|
|
"loss": 0.0644,
|
||
|
|
"num_input_tokens_seen": 1543136,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.996881496881497,
|
||
|
|
"grad_norm": 0.004355916753411293,
|
||
|
|
"learning_rate": 5.895617195855827e-07,
|
||
|
|
"loss": 0.1091,
|
||
|
|
"num_input_tokens_seen": 1545120,
|
||
|
|
"step": 3845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.002079002079002,
|
||
|
|
"grad_norm": 0.013024209067225456,
|
||
|
|
"learning_rate": 5.837232053175065e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1547056,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.007276507276507,
|
||
|
|
"grad_norm": 0.05919545143842697,
|
||
|
|
"learning_rate": 5.77909921345475e-07,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"num_input_tokens_seen": 1548976,
|
||
|
|
"step": 3855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.008316008316008,
|
||
|
|
"eval_loss": 0.3716074526309967,
|
||
|
|
"eval_runtime": 1.0785,
|
||
|
|
"eval_samples_per_second": 793.715,
|
||
|
|
"eval_steps_per_second": 99.214,
|
||
|
|
"num_input_tokens_seen": 1549360,
|
||
|
|
"step": 3856
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.012474012474012,
|
||
|
|
"grad_norm": 0.22275064885616302,
|
||
|
|
"learning_rate": 5.721219442089925e-07,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"num_input_tokens_seen": 1550960,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.017671517671518,
|
||
|
|
"grad_norm": 11.842212677001953,
|
||
|
|
"learning_rate": 5.663593501143663e-07,
|
||
|
|
"loss": 0.011,
|
||
|
|
"num_input_tokens_seen": 1552944,
|
||
|
|
"step": 3865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.022869022869023,
|
||
|
|
"grad_norm": 0.035551466047763824,
|
||
|
|
"learning_rate": 5.606222149337004e-07,
|
||
|
|
"loss": 0.0378,
|
||
|
|
"num_input_tokens_seen": 1554992,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.028066528066528,
|
||
|
|
"grad_norm": 0.21466241776943207,
|
||
|
|
"learning_rate": 5.549106142039018e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1557104,
|
||
|
|
"step": 3875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.033264033264033,
|
||
|
|
"grad_norm": 0.010968453250825405,
|
||
|
|
"learning_rate": 5.492246231256798e-07,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_input_tokens_seen": 1559088,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.038461538461538,
|
||
|
|
"grad_norm": 0.0740390494465828,
|
||
|
|
"learning_rate": 5.435643165625615e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1561008,
|
||
|
|
"step": 3885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.043659043659043,
|
||
|
|
"grad_norm": 0.03413901478052139,
|
||
|
|
"learning_rate": 5.379297690399035e-07,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1563056,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.048856548856548,
|
||
|
|
"grad_norm": 0.023828689008951187,
|
||
|
|
"learning_rate": 5.323210547439089e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1565040,
|
||
|
|
"step": 3895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.054054054054054,
|
||
|
|
"grad_norm": 0.02368989959359169,
|
||
|
|
"learning_rate": 5.267382475206548e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1567024,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0592515592515594,
|
||
|
|
"grad_norm": 0.1620592474937439,
|
||
|
|
"learning_rate": 5.21181420875117e-07,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1569136,
|
||
|
|
"step": 3905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0644490644490645,
|
||
|
|
"grad_norm": 0.013055311515927315,
|
||
|
|
"learning_rate": 5.15650647970202e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1571120,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.06964656964657,
|
||
|
|
"grad_norm": 0.005612197332084179,
|
||
|
|
"learning_rate": 5.101460016257858e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1573040,
|
||
|
|
"step": 3915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.074844074844075,
|
||
|
|
"grad_norm": 0.016595976427197456,
|
||
|
|
"learning_rate": 5.046675543177531e-07,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_input_tokens_seen": 1574896,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.08004158004158,
|
||
|
|
"grad_norm": 0.05645221471786499,
|
||
|
|
"learning_rate": 4.992153781770448e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1576880,
|
||
|
|
"step": 3925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.085239085239086,
|
||
|
|
"grad_norm": 0.02893124334514141,
|
||
|
|
"learning_rate": 4.937895449887076e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1578864,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.090436590436591,
|
||
|
|
"grad_norm": 0.010248606093227863,
|
||
|
|
"learning_rate": 4.883901261909466e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1580848,
|
||
|
|
"step": 3935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.095634095634096,
|
||
|
|
"grad_norm": 0.019447464495897293,
|
||
|
|
"learning_rate": 4.830171928741901e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1582704,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.100831600831601,
|
||
|
|
"grad_norm": 0.15405897796154022,
|
||
|
|
"learning_rate": 4.776708157801463e-07,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_input_tokens_seen": 1584816,
|
||
|
|
"step": 3945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.106029106029106,
|
||
|
|
"grad_norm": 8.753682136535645,
|
||
|
|
"learning_rate": 4.723510653008809e-07,
|
||
|
|
"loss": 0.0387,
|
||
|
|
"num_input_tokens_seen": 1586800,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.111226611226611,
|
||
|
|
"grad_norm": 0.06123171001672745,
|
||
|
|
"learning_rate": 4.6705801147788136e-07,
|
||
|
|
"loss": 0.081,
|
||
|
|
"num_input_tokens_seen": 1588720,
|
||
|
|
"step": 3955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.116424116424117,
|
||
|
|
"grad_norm": 0.004952425602823496,
|
||
|
|
"learning_rate": 4.617917240011394e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1590576,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.121621621621622,
|
||
|
|
"grad_norm": 0.00792229175567627,
|
||
|
|
"learning_rate": 4.5655227220823355e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1592496,
|
||
|
|
"step": 3965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.126819126819127,
|
||
|
|
"grad_norm": 0.013923810794949532,
|
||
|
|
"learning_rate": 4.513397250834159e-07,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"num_input_tokens_seen": 1594544,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.132016632016632,
|
||
|
|
"grad_norm": 0.029175899922847748,
|
||
|
|
"learning_rate": 4.461541512567011e-07,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1596400,
|
||
|
|
"step": 3975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.137214137214137,
|
||
|
|
"grad_norm": 0.04299869015812874,
|
||
|
|
"learning_rate": 4.409956190029674e-07,
|
||
|
|
"loss": 0.0585,
|
||
|
|
"num_input_tokens_seen": 1598320,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.142411642411642,
|
||
|
|
"grad_norm": 36.72762680053711,
|
||
|
|
"learning_rate": 4.358641962410537e-07,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"num_input_tokens_seen": 1600368,
|
||
|
|
"step": 3985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.147609147609147,
|
||
|
|
"grad_norm": 0.005658295005559921,
|
||
|
|
"learning_rate": 4.3075995053286716e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1602352,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.152806652806653,
|
||
|
|
"grad_norm": 0.00978625938296318,
|
||
|
|
"learning_rate": 4.2568294908249486e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1604336,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.158004158004158,
|
||
|
|
"grad_norm": 0.005897314287722111,
|
||
|
|
"learning_rate": 4.2063325873531485e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1606256,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.163201663201663,
|
||
|
|
"grad_norm": 0.059251993894577026,
|
||
|
|
"learning_rate": 4.156109459771215e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1608304,
|
||
|
|
"step": 4005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.168399168399168,
|
||
|
|
"grad_norm": 0.004152240231633186,
|
||
|
|
"learning_rate": 4.106160769332443e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1610480,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.173596673596673,
|
||
|
|
"grad_norm": 0.047246526926755905,
|
||
|
|
"learning_rate": 4.056487173676843e-07,
|
||
|
|
"loss": 0.0382,
|
||
|
|
"num_input_tokens_seen": 1612528,
|
||
|
|
"step": 4015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1787941787941785,
|
||
|
|
"grad_norm": 0.026120582595467567,
|
||
|
|
"learning_rate": 4.0070893268224055e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1614576,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.183991683991684,
|
||
|
|
"grad_norm": 0.012839434668421745,
|
||
|
|
"learning_rate": 3.9579678791565323e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1616624,
|
||
|
|
"step": 4025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1891891891891895,
|
||
|
|
"grad_norm": 16.232559204101562,
|
||
|
|
"learning_rate": 3.9091234774274873e-07,
|
||
|
|
"loss": 0.0378,
|
||
|
|
"num_input_tokens_seen": 1618672,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1943866943866945,
|
||
|
|
"grad_norm": 0.0076831188052892685,
|
||
|
|
"learning_rate": 3.8605567647358426e-07,
|
||
|
|
"loss": 0.0029,
|
||
|
|
"num_input_tokens_seen": 1620784,
|
||
|
|
"step": 4035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1995841995842,
|
||
|
|
"grad_norm": 0.009812161326408386,
|
||
|
|
"learning_rate": 3.812268380526046e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1622768,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.204781704781705,
|
||
|
|
"grad_norm": 0.12099117040634155,
|
||
|
|
"learning_rate": 3.764258960577971e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1624688,
|
||
|
|
"step": 4045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.20997920997921,
|
||
|
|
"grad_norm": 0.005353657063096762,
|
||
|
|
"learning_rate": 3.7165291369985616e-07,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1626672,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.215176715176715,
|
||
|
|
"grad_norm": 0.001504407380707562,
|
||
|
|
"learning_rate": 3.6690795382135184e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1628848,
|
||
|
|
"step": 4055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.220374220374221,
|
||
|
|
"grad_norm": 0.009774814359843731,
|
||
|
|
"learning_rate": 3.6219107889590154e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1630832,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.225571725571726,
|
||
|
|
"grad_norm": 0.00985631812363863,
|
||
|
|
"learning_rate": 3.575023510273462e-07,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1632880,
|
||
|
|
"step": 4065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.230769230769231,
|
||
|
|
"grad_norm": 0.01718440279364586,
|
||
|
|
"learning_rate": 3.528418319489349e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1634992,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.235966735966736,
|
||
|
|
"grad_norm": 0.021337008103728294,
|
||
|
|
"learning_rate": 3.48209583022511e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1636912,
|
||
|
|
"step": 4075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.241164241164241,
|
||
|
|
"grad_norm": 0.03264433145523071,
|
||
|
|
"learning_rate": 3.436056652377043e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1638832,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.246361746361746,
|
||
|
|
"grad_norm": 0.028791099786758423,
|
||
|
|
"learning_rate": 3.3903013921112753e-07,
|
||
|
|
"loss": 0.056,
|
||
|
|
"num_input_tokens_seen": 1641072,
|
||
|
|
"step": 4085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.251559251559252,
|
||
|
|
"grad_norm": 0.00902112852782011,
|
||
|
|
"learning_rate": 3.3448306518557795e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1642992,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.256756756756757,
|
||
|
|
"grad_norm": 0.0031842426396906376,
|
||
|
|
"learning_rate": 3.299645030292467e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1645040,
|
||
|
|
"step": 4095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.258835758835759,
|
||
|
|
"eval_loss": 0.4492134153842926,
|
||
|
|
"eval_runtime": 1.0401,
|
||
|
|
"eval_samples_per_second": 823.003,
|
||
|
|
"eval_steps_per_second": 102.875,
|
||
|
|
"num_input_tokens_seen": 1645808,
|
||
|
|
"step": 4097
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.261954261954262,
|
||
|
|
"grad_norm": 0.008271156810224056,
|
||
|
|
"learning_rate": 3.254745122349279e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1647024,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.267151767151767,
|
||
|
|
"grad_norm": 0.009126213379204273,
|
||
|
|
"learning_rate": 3.2101315191923667e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1649008,
|
||
|
|
"step": 4105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.272349272349272,
|
||
|
|
"grad_norm": 0.008243863470852375,
|
||
|
|
"learning_rate": 3.1658048082182926e-07,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1651056,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.277546777546777,
|
||
|
|
"grad_norm": 0.016346026211977005,
|
||
|
|
"learning_rate": 3.1217655730463094e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1653104,
|
||
|
|
"step": 4115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.282744282744282,
|
||
|
|
"grad_norm": 0.014476928859949112,
|
||
|
|
"learning_rate": 3.078014393510695e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1655344,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.287941787941788,
|
||
|
|
"grad_norm": 0.00862564891576767,
|
||
|
|
"learning_rate": 3.0345518456530666e-07,
|
||
|
|
"loss": 0.042,
|
||
|
|
"num_input_tokens_seen": 1657392,
|
||
|
|
"step": 4125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.293139293139293,
|
||
|
|
"grad_norm": 0.011305141262710094,
|
||
|
|
"learning_rate": 2.9913785017148563e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1659312,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.298336798336798,
|
||
|
|
"grad_norm": 17.00044822692871,
|
||
|
|
"learning_rate": 2.9484949301297166e-07,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"num_input_tokens_seen": 1661424,
|
||
|
|
"step": 4135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.303534303534303,
|
||
|
|
"grad_norm": 0.0021855896338820457,
|
||
|
|
"learning_rate": 2.905901695516092e-07,
|
||
|
|
"loss": 0.0239,
|
||
|
|
"num_input_tokens_seen": 1663408,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3087318087318085,
|
||
|
|
"grad_norm": 0.005250105168670416,
|
||
|
|
"learning_rate": 2.8635993586697555e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1665328,
|
||
|
|
"step": 4145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.313929313929314,
|
||
|
|
"grad_norm": 0.02172735519707203,
|
||
|
|
"learning_rate": 2.8215884765564197e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1667312,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3191268191268195,
|
||
|
|
"grad_norm": 0.3306088447570801,
|
||
|
|
"learning_rate": 2.779869602304416e-07,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1669296,
|
||
|
|
"step": 4155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.324324324324325,
|
||
|
|
"grad_norm": 0.0034492157865315676,
|
||
|
|
"learning_rate": 2.73844328519742e-07,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1671280,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.32952182952183,
|
||
|
|
"grad_norm": 0.3147349953651428,
|
||
|
|
"learning_rate": 2.6973100706672e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1673456,
|
||
|
|
"step": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.334719334719335,
|
||
|
|
"grad_norm": 0.0011071843327954412,
|
||
|
|
"learning_rate": 2.656470500286451e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1675504,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.33991683991684,
|
||
|
|
"grad_norm": 0.00639357278123498,
|
||
|
|
"learning_rate": 2.615925111761647e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1677488,
|
||
|
|
"step": 4175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.345114345114345,
|
||
|
|
"grad_norm": 0.00608447939157486,
|
||
|
|
"learning_rate": 2.575674438925974e-07,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"num_input_tokens_seen": 1679536,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.350311850311851,
|
||
|
|
"grad_norm": 84.28992462158203,
|
||
|
|
"learning_rate": 2.535719011732321e-07,
|
||
|
|
"loss": 0.0875,
|
||
|
|
"num_input_tokens_seen": 1681520,
|
||
|
|
"step": 4185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.355509355509356,
|
||
|
|
"grad_norm": 13.161194801330566,
|
||
|
|
"learning_rate": 2.4960593562462496e-07,
|
||
|
|
"loss": 0.0372,
|
||
|
|
"num_input_tokens_seen": 1683568,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.360706860706861,
|
||
|
|
"grad_norm": 0.006262065842747688,
|
||
|
|
"learning_rate": 2.4566959946391246e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1685488,
|
||
|
|
"step": 4195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.365904365904366,
|
||
|
|
"grad_norm": 0.010419082827866077,
|
||
|
|
"learning_rate": 2.4176294451811936e-07,
|
||
|
|
"loss": 0.0341,
|
||
|
|
"num_input_tokens_seen": 1687408,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.371101871101871,
|
||
|
|
"grad_norm": 0.0028410113882273436,
|
||
|
|
"learning_rate": 2.378860222234794e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1689520,
|
||
|
|
"step": 4205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.376299376299376,
|
||
|
|
"grad_norm": 0.00360031402669847,
|
||
|
|
"learning_rate": 2.3403888362475784e-07,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1691568,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.381496881496881,
|
||
|
|
"grad_norm": 0.006181271746754646,
|
||
|
|
"learning_rate": 2.3022157937457628e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1693616,
|
||
|
|
"step": 4215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.386694386694387,
|
||
|
|
"grad_norm": 0.014252823777496815,
|
||
|
|
"learning_rate": 2.2643415973275017e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1695600,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.391891891891892,
|
||
|
|
"grad_norm": 0.008665296249091625,
|
||
|
|
"learning_rate": 2.226766745656231e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1697584,
|
||
|
|
"step": 4225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.397089397089397,
|
||
|
|
"grad_norm": 0.004776356276124716,
|
||
|
|
"learning_rate": 2.1894917334541355e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1699568,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.402286902286902,
|
||
|
|
"grad_norm": 0.01509240921586752,
|
||
|
|
"learning_rate": 2.15251705149562e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"num_input_tokens_seen": 1701744,
|
||
|
|
"step": 4235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.407484407484407,
|
||
|
|
"grad_norm": 0.002179246162995696,
|
||
|
|
"learning_rate": 2.11584318660083e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1703600,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.412681912681912,
|
||
|
|
"grad_norm": 0.01842692494392395,
|
||
|
|
"learning_rate": 2.0794706216292815e-07,
|
||
|
|
"loss": 0.0613,
|
||
|
|
"num_input_tokens_seen": 1705712,
|
||
|
|
"step": 4245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.417879417879418,
|
||
|
|
"grad_norm": 0.007841149345040321,
|
||
|
|
"learning_rate": 2.043399835473475e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1707696,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.423076923076923,
|
||
|
|
"grad_norm": 0.006627705413848162,
|
||
|
|
"learning_rate": 2.0076313030525845e-07,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"num_input_tokens_seen": 1709744,
|
||
|
|
"step": 4255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.428274428274428,
|
||
|
|
"grad_norm": 0.0027992126997560263,
|
||
|
|
"learning_rate": 1.9721654953062412e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1711792,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4334719334719335,
|
||
|
|
"grad_norm": 0.02479691430926323,
|
||
|
|
"learning_rate": 1.937002879188285e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1713904,
|
||
|
|
"step": 4265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4386694386694385,
|
||
|
|
"grad_norm": 0.011448011733591557,
|
||
|
|
"learning_rate": 1.9021439176606565e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1715824,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.443866943866944,
|
||
|
|
"grad_norm": 0.01309084240347147,
|
||
|
|
"learning_rate": 1.8675890696872838e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1717808,
|
||
|
|
"step": 4275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4490644490644495,
|
||
|
|
"grad_norm": 16.534465789794922,
|
||
|
|
"learning_rate": 1.8333387902280314e-07,
|
||
|
|
"loss": 0.0326,
|
||
|
|
"num_input_tokens_seen": 1719856,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.454261954261955,
|
||
|
|
"grad_norm": 0.006772617343813181,
|
||
|
|
"learning_rate": 1.799393530232729e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1721776,
|
||
|
|
"step": 4285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.45945945945946,
|
||
|
|
"grad_norm": 0.04006092995405197,
|
||
|
|
"learning_rate": 1.765753736635234e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1723632,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.464656964656965,
|
||
|
|
"grad_norm": 0.004494254942983389,
|
||
|
|
"learning_rate": 1.7324198523475111e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1725488,
|
||
|
|
"step": 4295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.46985446985447,
|
||
|
|
"grad_norm": 0.005892970599234104,
|
||
|
|
"learning_rate": 1.6993923162538562e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1727600,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.475051975051975,
|
||
|
|
"grad_norm": 0.016387267038226128,
|
||
|
|
"learning_rate": 1.666671563205069e-07,
|
||
|
|
"loss": 0.0462,
|
||
|
|
"num_input_tokens_seen": 1729712,
|
||
|
|
"step": 4305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.48024948024948,
|
||
|
|
"grad_norm": 0.003626425750553608,
|
||
|
|
"learning_rate": 1.6342580240127582e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1731696,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.485446985446986,
|
||
|
|
"grad_norm": 0.050125960260629654,
|
||
|
|
"learning_rate": 1.6021521254436678e-07,
|
||
|
|
"loss": 0.0169,
|
||
|
|
"num_input_tokens_seen": 1733744,
|
||
|
|
"step": 4315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.490644490644491,
|
||
|
|
"grad_norm": 0.010477319359779358,
|
||
|
|
"learning_rate": 1.5703542902140296e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1735728,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.495841995841996,
|
||
|
|
"grad_norm": 0.18304939568042755,
|
||
|
|
"learning_rate": 1.538864936984036e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1737776,
|
||
|
|
"step": 4325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.501039501039501,
|
||
|
|
"grad_norm": 0.0033877205569297075,
|
||
|
|
"learning_rate": 1.507684480352292e-07,
|
||
|
|
"loss": 0.0313,
|
||
|
|
"num_input_tokens_seen": 1739824,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.506237006237006,
|
||
|
|
"grad_norm": 0.005890438798815012,
|
||
|
|
"learning_rate": 1.476813330850388e-07,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"num_input_tokens_seen": 1741744,
|
||
|
|
"step": 4335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.509355509355509,
|
||
|
|
"eval_loss": 0.43684616684913635,
|
||
|
|
"eval_runtime": 1.0364,
|
||
|
|
"eval_samples_per_second": 825.951,
|
||
|
|
"eval_steps_per_second": 103.244,
|
||
|
|
"num_input_tokens_seen": 1742960,
|
||
|
|
"step": 4338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.511434511434511,
|
||
|
|
"grad_norm": 0.004618450067937374,
|
||
|
|
"learning_rate": 1.4462518949374838e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1743728,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.516632016632016,
|
||
|
|
"grad_norm": 15.636531829833984,
|
||
|
|
"learning_rate": 1.4160005749949328e-07,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"num_input_tokens_seen": 1745904,
|
||
|
|
"step": 4345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.521829521829522,
|
||
|
|
"grad_norm": 0.009265006519854069,
|
||
|
|
"learning_rate": 1.386059769321027e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1747824,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.527027027027027,
|
||
|
|
"grad_norm": 0.14768032729625702,
|
||
|
|
"learning_rate": 1.3564298721257223e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1749872,
|
||
|
|
"step": 4355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.532224532224532,
|
||
|
|
"grad_norm": 0.004393266513943672,
|
||
|
|
"learning_rate": 1.32711127352545e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1751792,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.537422037422037,
|
||
|
|
"grad_norm": 0.0018293843604624271,
|
||
|
|
"learning_rate": 1.2981043595380048e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1753776,
|
||
|
|
"step": 4365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.542619542619542,
|
||
|
|
"grad_norm": 0.010307732038199902,
|
||
|
|
"learning_rate": 1.269409512077427e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1755824,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.547817047817047,
|
||
|
|
"grad_norm": 0.02882198989391327,
|
||
|
|
"learning_rate": 1.241027108949e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1758000,
|
||
|
|
"step": 4375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.553014553014553,
|
||
|
|
"grad_norm": 0.02132793888449669,
|
||
|
|
"learning_rate": 1.2129575238442715e-07,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_input_tokens_seen": 1759984,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.558212058212058,
|
||
|
|
"grad_norm": 0.014733387157320976,
|
||
|
|
"learning_rate": 1.1852011263361218e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1761968,
|
||
|
|
"step": 4385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5634095634095635,
|
||
|
|
"grad_norm": 0.009524204768240452,
|
||
|
|
"learning_rate": 1.1577582818739136e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1764016,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5686070686070686,
|
||
|
|
"grad_norm": 17.770906448364258,
|
||
|
|
"learning_rate": 1.1306293517786615e-07,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"num_input_tokens_seen": 1765936,
|
||
|
|
"step": 4395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.573804573804574,
|
||
|
|
"grad_norm": 0.0056666964665055275,
|
||
|
|
"learning_rate": 1.1038146932383003e-07,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1767984,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.579002079002079,
|
||
|
|
"grad_norm": 0.2879636287689209,
|
||
|
|
"learning_rate": 1.0773146593029637e-07,
|
||
|
|
"loss": 0.0266,
|
||
|
|
"num_input_tokens_seen": 1769904,
|
||
|
|
"step": 4405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.584199584199585,
|
||
|
|
"grad_norm": 0.01593073643743992,
|
||
|
|
"learning_rate": 1.0511295988803293e-07,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1771888,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.58939708939709,
|
||
|
|
"grad_norm": 0.005392360966652632,
|
||
|
|
"learning_rate": 1.0252598567310451e-07,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"num_input_tokens_seen": 1773936,
|
||
|
|
"step": 4415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.594594594594595,
|
||
|
|
"grad_norm": 0.00791078433394432,
|
||
|
|
"learning_rate": 9.997057734641852e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1775984,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5997920997921,
|
||
|
|
"grad_norm": 0.006835015490651131,
|
||
|
|
"learning_rate": 9.744676855327484e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1777840,
|
||
|
|
"step": 4425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.604989604989605,
|
||
|
|
"grad_norm": 11.176219940185547,
|
||
|
|
"learning_rate": 9.495459252292505e-08,
|
||
|
|
"loss": 0.0267,
|
||
|
|
"num_input_tokens_seen": 1779824,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.61018711018711,
|
||
|
|
"grad_norm": 0.013272907584905624,
|
||
|
|
"learning_rate": 9.249408206813332e-08,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"num_input_tokens_seen": 1781872,
|
||
|
|
"step": 4435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.615384615384615,
|
||
|
|
"grad_norm": 0.012476145289838314,
|
||
|
|
"learning_rate": 9.00652695847451e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1783984,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.620582120582121,
|
||
|
|
"grad_norm": 0.02734716795384884,
|
||
|
|
"learning_rate": 8.766818705126134e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1786032,
|
||
|
|
"step": 4445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.625779625779626,
|
||
|
|
"grad_norm": 0.0005791023722849786,
|
||
|
|
"learning_rate": 8.530286602841525e-08,
|
||
|
|
"loss": 0.0058,
|
||
|
|
"num_input_tokens_seen": 1788016,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.630977130977131,
|
||
|
|
"grad_norm": 0.0094565125182271,
|
||
|
|
"learning_rate": 8.296933765875898e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1790064,
|
||
|
|
"step": 4455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.636174636174636,
|
||
|
|
"grad_norm": 0.0023999966215342283,
|
||
|
|
"learning_rate": 8.066763266625283e-08,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1791984,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.641372141372141,
|
||
|
|
"grad_norm": 0.004338675644248724,
|
||
|
|
"learning_rate": 7.839778135586007e-08,
|
||
|
|
"loss": 0.0321,
|
||
|
|
"num_input_tokens_seen": 1793904,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.646569646569646,
|
||
|
|
"grad_norm": 0.7023778557777405,
|
||
|
|
"learning_rate": 7.61598136131489e-08,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1795888,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.651767151767151,
|
||
|
|
"grad_norm": 0.019446399062871933,
|
||
|
|
"learning_rate": 7.3953758903898e-08,
|
||
|
|
"loss": 0.028,
|
||
|
|
"num_input_tokens_seen": 1797872,
|
||
|
|
"step": 4475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.656964656964657,
|
||
|
|
"grad_norm": 0.06941288709640503,
|
||
|
|
"learning_rate": 7.177964627370999e-08,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_input_tokens_seen": 1799920,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.662162162162162,
|
||
|
|
"grad_norm": 0.009321698918938637,
|
||
|
|
"learning_rate": 6.963750434762745e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1801776,
|
||
|
|
"step": 4485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.667359667359667,
|
||
|
|
"grad_norm": 61.291290283203125,
|
||
|
|
"learning_rate": 6.752736132975696e-08,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"num_input_tokens_seen": 1803824,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.672557172557172,
|
||
|
|
"grad_norm": 0.003786651650443673,
|
||
|
|
"learning_rate": 6.544924500289789e-08,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"num_input_tokens_seen": 1805744,
|
||
|
|
"step": 4495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6777546777546775,
|
||
|
|
"grad_norm": 0.01198617834597826,
|
||
|
|
"learning_rate": 6.340318272817476e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1807728,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.682952182952183,
|
||
|
|
"grad_norm": 0.024431385099887848,
|
||
|
|
"learning_rate": 6.138920144468124e-08,
|
||
|
|
"loss": 0.0329,
|
||
|
|
"num_input_tokens_seen": 1809712,
|
||
|
|
"step": 4505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6881496881496885,
|
||
|
|
"grad_norm": 0.011199146509170532,
|
||
|
|
"learning_rate": 5.940732766912011e-08,
|
||
|
|
"loss": 0.1284,
|
||
|
|
"num_input_tokens_seen": 1811632,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6933471933471935,
|
||
|
|
"grad_norm": 0.015243390575051308,
|
||
|
|
"learning_rate": 5.745758749545749e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1813552,
|
||
|
|
"step": 4515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.698544698544699,
|
||
|
|
"grad_norm": 0.02215772680938244,
|
||
|
|
"learning_rate": 5.554000659457881e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1815664,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.703742203742204,
|
||
|
|
"grad_norm": 0.07075236737728119,
|
||
|
|
"learning_rate": 5.365461021395096e-08,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"num_input_tokens_seen": 1817648,
|
||
|
|
"step": 4525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.708939708939709,
|
||
|
|
"grad_norm": 0.6212006211280823,
|
||
|
|
"learning_rate": 5.1801423177288146e-08,
|
||
|
|
"loss": 0.0226,
|
||
|
|
"num_input_tokens_seen": 1819696,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.714137214137214,
|
||
|
|
"grad_norm": 0.021660171449184418,
|
||
|
|
"learning_rate": 4.998046988422767e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1821680,
|
||
|
|
"step": 4535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.71933471933472,
|
||
|
|
"grad_norm": 0.003982728812843561,
|
||
|
|
"learning_rate": 4.8191774310006045e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1823728,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.724532224532225,
|
||
|
|
"grad_norm": 0.005566044710576534,
|
||
|
|
"learning_rate": 4.6435360005145647e-08,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_input_tokens_seen": 1825712,
|
||
|
|
"step": 4545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.72972972972973,
|
||
|
|
"grad_norm": 0.006296331528574228,
|
||
|
|
"learning_rate": 4.471125009514326e-08,
|
||
|
|
"loss": 0.0258,
|
||
|
|
"num_input_tokens_seen": 1827760,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.734927234927235,
|
||
|
|
"grad_norm": 0.007365924771875143,
|
||
|
|
"learning_rate": 4.30194672801662e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1829680,
|
||
|
|
"step": 4555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.74012474012474,
|
||
|
|
"grad_norm": 0.032850153744220734,
|
||
|
|
"learning_rate": 4.136003383475251e-08,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1831728,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.745322245322245,
|
||
|
|
"grad_norm": 0.048938535153865814,
|
||
|
|
"learning_rate": 3.9732971607519264e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1833648,
|
||
|
|
"step": 4565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.75051975051975,
|
||
|
|
"grad_norm": 0.010438877157866955,
|
||
|
|
"learning_rate": 3.813830202087338e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1835696,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.755717255717256,
|
||
|
|
"grad_norm": 0.24429504573345184,
|
||
|
|
"learning_rate": 3.6576046070730676e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1837808,
|
||
|
|
"step": 4575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.75987525987526,
|
||
|
|
"eval_loss": 0.4380520284175873,
|
||
|
|
"eval_runtime": 1.0491,
|
||
|
|
"eval_samples_per_second": 815.908,
|
||
|
|
"eval_steps_per_second": 101.988,
|
||
|
|
"num_input_tokens_seen": 1839344,
|
||
|
|
"step": 4579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.760914760914761,
|
||
|
|
"grad_norm": 0.021253783255815506,
|
||
|
|
"learning_rate": 3.504622432623811e-08,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_input_tokens_seen": 1839728,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.766112266112266,
|
||
|
|
"grad_norm": 0.007059005554765463,
|
||
|
|
"learning_rate": 3.354885692950505e-08,
|
||
|
|
"loss": 0.002,
|
||
|
|
"num_input_tokens_seen": 1841776,
|
||
|
|
"step": 4585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.771309771309771,
|
||
|
|
"grad_norm": 0.0066725509241223335,
|
||
|
|
"learning_rate": 3.208396359533572e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1843696,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.776507276507276,
|
||
|
|
"grad_norm": 0.006126644089818001,
|
||
|
|
"learning_rate": 3.065156361097138e-08,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1845744,
|
||
|
|
"step": 4595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.781704781704782,
|
||
|
|
"grad_norm": 5.130758285522461,
|
||
|
|
"learning_rate": 2.925167583583577e-08,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_input_tokens_seen": 1847792,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.786902286902287,
|
||
|
|
"grad_norm": 0.009116302244365215,
|
||
|
|
"learning_rate": 2.7884318701285883e-08,
|
||
|
|
"loss": 0.0712,
|
||
|
|
"num_input_tokens_seen": 1849776,
|
||
|
|
"step": 4605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.792099792099792,
|
||
|
|
"grad_norm": 0.005589164327830076,
|
||
|
|
"learning_rate": 2.654951021037161e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1852016,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.797297297297297,
|
||
|
|
"grad_norm": 0.0037636614870280027,
|
||
|
|
"learning_rate": 2.524726793759591e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1854064,
|
||
|
|
"step": 4615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.802494802494802,
|
||
|
|
"grad_norm": 0.012889928184449673,
|
||
|
|
"learning_rate": 2.3977609028686123e-08,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1856112,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8076923076923075,
|
||
|
|
"grad_norm": 0.0022313897497951984,
|
||
|
|
"learning_rate": 2.2740550200365528e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1858096,
|
||
|
|
"step": 4625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8128898128898125,
|
||
|
|
"grad_norm": 0.004886255133897066,
|
||
|
|
"learning_rate": 2.153610774013548e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1860272,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8180873180873185,
|
||
|
|
"grad_norm": 0.004527249839156866,
|
||
|
|
"learning_rate": 2.0364297506060005e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1862256,
|
||
|
|
"step": 4635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8232848232848236,
|
||
|
|
"grad_norm": 0.005999819375574589,
|
||
|
|
"learning_rate": 1.922513492655653e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1864304,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.828482328482329,
|
||
|
|
"grad_norm": 0.003096930915489793,
|
||
|
|
"learning_rate": 1.8118635000194395e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1866224,
|
||
|
|
"step": 4645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.833679833679834,
|
||
|
|
"grad_norm": 0.011734005995094776,
|
||
|
|
"learning_rate": 1.704481229549526e-08,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1868336,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.838877338877339,
|
||
|
|
"grad_norm": 0.005439637694507837,
|
||
|
|
"learning_rate": 1.6003680950742728e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1870448,
|
||
|
|
"step": 4655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.844074844074844,
|
||
|
|
"grad_norm": 21.41458511352539,
|
||
|
|
"learning_rate": 1.499525467379581e-08,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"num_input_tokens_seen": 1872368,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.849272349272349,
|
||
|
|
"grad_norm": 0.007195206359028816,
|
||
|
|
"learning_rate": 1.4019546741908252e-08,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1874480,
|
||
|
|
"step": 4665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.854469854469855,
|
||
|
|
"grad_norm": 0.019606366753578186,
|
||
|
|
"learning_rate": 1.3076570001553934e-08,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"num_input_tokens_seen": 1876464,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.85966735966736,
|
||
|
|
"grad_norm": 0.021399203687906265,
|
||
|
|
"learning_rate": 1.216633686825841e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1878448,
|
||
|
|
"step": 4675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.864864864864865,
|
||
|
|
"grad_norm": 0.008349803276360035,
|
||
|
|
"learning_rate": 1.1288859326433477e-08,
|
||
|
|
"loss": 0.0426,
|
||
|
|
"num_input_tokens_seen": 1880432,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.87006237006237,
|
||
|
|
"grad_norm": 0.00672512361779809,
|
||
|
|
"learning_rate": 1.0444148929221466e-08,
|
||
|
|
"loss": 0.0598,
|
||
|
|
"num_input_tokens_seen": 1882544,
|
||
|
|
"step": 4685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.875259875259875,
|
||
|
|
"grad_norm": 0.009454301558434963,
|
||
|
|
"learning_rate": 9.632216798342032e-09,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1884528,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.88045738045738,
|
||
|
|
"grad_norm": 0.001981085864827037,
|
||
|
|
"learning_rate": 8.853073623946163e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1886640,
|
||
|
|
"step": 4695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.885654885654886,
|
||
|
|
"grad_norm": 13.079166412353516,
|
||
|
|
"learning_rate": 8.106729664475178e-09,
|
||
|
|
"loss": 0.0369,
|
||
|
|
"num_input_tokens_seen": 1888688,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.890852390852391,
|
||
|
|
"grad_norm": 0.020751064643263817,
|
||
|
|
"learning_rate": 7.3931947465252786e-09,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1890736,
|
||
|
|
"step": 4705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.896049896049896,
|
||
|
|
"grad_norm": 0.0023189696948975325,
|
||
|
|
"learning_rate": 6.7124782647196015e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1892720,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.901247401247401,
|
||
|
|
"grad_norm": 0.010008195415139198,
|
||
|
|
"learning_rate": 6.064589181582481e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1894704,
|
||
|
|
"step": 4715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.906444906444906,
|
||
|
|
"grad_norm": 0.011259862221777439,
|
||
|
|
"learning_rate": 5.4495360274231526e-09,
|
||
|
|
"loss": 0.0287,
|
||
|
|
"num_input_tokens_seen": 1896624,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.911642411642411,
|
||
|
|
"grad_norm": 0.006877740379422903,
|
||
|
|
"learning_rate": 4.867326900223068e-09,
|
||
|
|
"loss": 0.0307,
|
||
|
|
"num_input_tokens_seen": 1898544,
|
||
|
|
"step": 4725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.916839916839917,
|
||
|
|
"grad_norm": 0.0025101625360548496,
|
||
|
|
"learning_rate": 4.317969465527927e-09,
|
||
|
|
"loss": 0.0353,
|
||
|
|
"num_input_tokens_seen": 1900592,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.922037422037422,
|
||
|
|
"grad_norm": 0.022444335743784904,
|
||
|
|
"learning_rate": 3.801470956348863e-09,
|
||
|
|
"loss": 0.0287,
|
||
|
|
"num_input_tokens_seen": 1902576,
|
||
|
|
"step": 4735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.927234927234927,
|
||
|
|
"grad_norm": 0.0029787139501422644,
|
||
|
|
"learning_rate": 3.3178381730661345e-09,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1904624,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9324324324324325,
|
||
|
|
"grad_norm": 0.0010304702445864677,
|
||
|
|
"learning_rate": 2.8670774833386427e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1906736,
|
||
|
|
"step": 4745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9376299376299375,
|
||
|
|
"grad_norm": 0.0025558616034686565,
|
||
|
|
"learning_rate": 2.449194822022327e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1908592,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.942827442827443,
|
||
|
|
"grad_norm": 0.019764816388487816,
|
||
|
|
"learning_rate": 2.064195691089954e-09,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_input_tokens_seen": 1910576,
|
||
|
|
"step": 4755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.948024948024948,
|
||
|
|
"grad_norm": 0.004834398627281189,
|
||
|
|
"learning_rate": 1.7120851595597842e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1912624,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.953222453222454,
|
||
|
|
"grad_norm": 0.03484058007597923,
|
||
|
|
"learning_rate": 1.3928678634289595e-09,
|
||
|
|
"loss": 0.0283,
|
||
|
|
"num_input_tokens_seen": 1914608,
|
||
|
|
"step": 4765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.958419958419959,
|
||
|
|
"grad_norm": 0.008615722879767418,
|
||
|
|
"learning_rate": 1.1065480056110521e-09,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_input_tokens_seen": 1916592,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.963617463617464,
|
||
|
|
"grad_norm": 0.16332073509693146,
|
||
|
|
"learning_rate": 8.531293558824983e-10,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1918704,
|
||
|
|
"step": 4775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.968814968814969,
|
||
|
|
"grad_norm": 0.0005217403522692621,
|
||
|
|
"learning_rate": 6.326152508320804e-10,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1920624,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.974012474012474,
|
||
|
|
"grad_norm": 1.4390920400619507,
|
||
|
|
"learning_rate": 4.450085938170756e-10,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"num_input_tokens_seen": 1922480,
|
||
|
|
"step": 4785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.979209979209979,
|
||
|
|
"grad_norm": 0.018787242472171783,
|
||
|
|
"learning_rate": 2.903118549252293e-10,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_input_tokens_seen": 1924464,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.984407484407484,
|
||
|
|
"grad_norm": 0.14635036885738373,
|
||
|
|
"learning_rate": 1.6852707094172637e-10,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1926448,
|
||
|
|
"step": 4795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.98960498960499,
|
||
|
|
"grad_norm": 0.04664904624223709,
|
||
|
|
"learning_rate": 7.965584532282356e-11,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_input_tokens_seen": 1928560,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.994802494802495,
|
||
|
|
"grad_norm": 0.024172263219952583,
|
||
|
|
"learning_rate": 2.3699348174754943e-11,
|
||
|
|
"loss": 0.0177,
|
||
|
|
"num_input_tokens_seen": 1930544,
|
||
|
|
"step": 4805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.0018674664897844195,
|
||
|
|
"learning_rate": 6.583162381890162e-13,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_input_tokens_seen": 1932608,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"num_input_tokens_seen": 1932608,
|
||
|
|
"step": 4810,
|
||
|
|
"total_flos": 1.1284259767320576e+16,
|
||
|
|
"train_loss": 0.10950150515592155,
|
||
|
|
"train_runtime": 1431.7139,
|
||
|
|
"train_samples_per_second": 26.873,
|
||
|
|
"train_steps_per_second": 3.36
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 4810,
|
||
|
|
"num_input_tokens_seen": 1932608,
|
||
|
|
"num_train_epochs": 5,
|
||
|
|
"save_steps": 241,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1.1284259767320576e+16,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|