17738 lines
463 KiB
JSON
17738 lines
463 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 1966,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.000508646998982706,
|
|
"grad_norm": 32.11302947998047,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.2961,
|
|
"mean_token_accuracy": 0.7449109554290771,
|
|
"num_tokens": 156200.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.001017293997965412,
|
|
"grad_norm": 31.278301239013672,
|
|
"learning_rate": 5.076142131979696e-08,
|
|
"loss": 1.2669,
|
|
"mean_token_accuracy": 0.7504401206970215,
|
|
"num_tokens": 314183.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.001525940996948118,
|
|
"grad_norm": 30.861223220825195,
|
|
"learning_rate": 1.0152284263959391e-07,
|
|
"loss": 1.3067,
|
|
"mean_token_accuracy": 0.7393251657485962,
|
|
"num_tokens": 470040.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.002034587995930824,
|
|
"grad_norm": 31.689613342285156,
|
|
"learning_rate": 1.5228426395939088e-07,
|
|
"loss": 1.3077,
|
|
"mean_token_accuracy": 0.7398750185966492,
|
|
"num_tokens": 627227.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.00254323499491353,
|
|
"grad_norm": 31.292264938354492,
|
|
"learning_rate": 2.0304568527918783e-07,
|
|
"loss": 1.3225,
|
|
"mean_token_accuracy": 0.7381956577301025,
|
|
"num_tokens": 785176.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.003051881993896236,
|
|
"grad_norm": 32.40222930908203,
|
|
"learning_rate": 2.538071065989848e-07,
|
|
"loss": 1.3197,
|
|
"mean_token_accuracy": 0.7405046224594116,
|
|
"num_tokens": 926658.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.003560528992878942,
|
|
"grad_norm": 31.02337074279785,
|
|
"learning_rate": 3.0456852791878176e-07,
|
|
"loss": 1.3316,
|
|
"mean_token_accuracy": 0.7354226112365723,
|
|
"num_tokens": 1089803.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.004069175991861648,
|
|
"grad_norm": 30.440677642822266,
|
|
"learning_rate": 3.553299492385787e-07,
|
|
"loss": 1.268,
|
|
"mean_token_accuracy": 0.7473543882369995,
|
|
"num_tokens": 1253116.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.004577822990844354,
|
|
"grad_norm": 29.12032699584961,
|
|
"learning_rate": 4.0609137055837566e-07,
|
|
"loss": 1.2602,
|
|
"mean_token_accuracy": 0.7476384043693542,
|
|
"num_tokens": 1419240.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.00508646998982706,
|
|
"grad_norm": 29.42680549621582,
|
|
"learning_rate": 4.568527918781726e-07,
|
|
"loss": 1.2725,
|
|
"mean_token_accuracy": 0.7454349994659424,
|
|
"num_tokens": 1582136.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.005595116988809766,
|
|
"grad_norm": 28.261323928833008,
|
|
"learning_rate": 5.076142131979696e-07,
|
|
"loss": 1.2612,
|
|
"mean_token_accuracy": 0.744758129119873,
|
|
"num_tokens": 1738808.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.006103763987792472,
|
|
"grad_norm": 27.25571632385254,
|
|
"learning_rate": 5.583756345177665e-07,
|
|
"loss": 1.2428,
|
|
"mean_token_accuracy": 0.7447201609611511,
|
|
"num_tokens": 1899325.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.006612410986775178,
|
|
"grad_norm": 26.683231353759766,
|
|
"learning_rate": 6.091370558375635e-07,
|
|
"loss": 1.1995,
|
|
"mean_token_accuracy": 0.7522713541984558,
|
|
"num_tokens": 2067149.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.007121057985757884,
|
|
"grad_norm": 19.833831787109375,
|
|
"learning_rate": 6.598984771573605e-07,
|
|
"loss": 1.2018,
|
|
"mean_token_accuracy": 0.7373824715614319,
|
|
"num_tokens": 2225992.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.0076297049847405905,
|
|
"grad_norm": 20.34471893310547,
|
|
"learning_rate": 7.106598984771574e-07,
|
|
"loss": 1.1661,
|
|
"mean_token_accuracy": 0.7451382875442505,
|
|
"num_tokens": 2391475.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.008138351983723296,
|
|
"grad_norm": 17.260560989379883,
|
|
"learning_rate": 7.614213197969544e-07,
|
|
"loss": 1.133,
|
|
"mean_token_accuracy": 0.747637152671814,
|
|
"num_tokens": 2564876.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.008646998982706003,
|
|
"grad_norm": 16.89604377746582,
|
|
"learning_rate": 8.121827411167513e-07,
|
|
"loss": 1.167,
|
|
"mean_token_accuracy": 0.7356365919113159,
|
|
"num_tokens": 2728912.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.009155645981688708,
|
|
"grad_norm": 17.789722442626953,
|
|
"learning_rate": 8.629441624365482e-07,
|
|
"loss": 1.1113,
|
|
"mean_token_accuracy": 0.752740204334259,
|
|
"num_tokens": 2885094.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.009664292980671414,
|
|
"grad_norm": 9.220502853393555,
|
|
"learning_rate": 9.137055837563452e-07,
|
|
"loss": 1.0571,
|
|
"mean_token_accuracy": 0.7442377805709839,
|
|
"num_tokens": 3043505.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.01017293997965412,
|
|
"grad_norm": 6.030149459838867,
|
|
"learning_rate": 9.644670050761422e-07,
|
|
"loss": 0.9415,
|
|
"mean_token_accuracy": 0.7668419480323792,
|
|
"num_tokens": 3200547.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.010681586978636826,
|
|
"grad_norm": 5.80075216293335,
|
|
"learning_rate": 1.0152284263959392e-06,
|
|
"loss": 1.027,
|
|
"mean_token_accuracy": 0.746906578540802,
|
|
"num_tokens": 3352567.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.011190233977619531,
|
|
"grad_norm": 5.6724772453308105,
|
|
"learning_rate": 1.0659898477157362e-06,
|
|
"loss": 1.0436,
|
|
"mean_token_accuracy": 0.7409395575523376,
|
|
"num_tokens": 3522299.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.011698880976602238,
|
|
"grad_norm": 5.231161594390869,
|
|
"learning_rate": 1.116751269035533e-06,
|
|
"loss": 1.0285,
|
|
"mean_token_accuracy": 0.7423572540283203,
|
|
"num_tokens": 3678029.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.012207527975584944,
|
|
"grad_norm": 4.966853141784668,
|
|
"learning_rate": 1.16751269035533e-06,
|
|
"loss": 0.9824,
|
|
"mean_token_accuracy": 0.7529760599136353,
|
|
"num_tokens": 3843052.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.01271617497456765,
|
|
"grad_norm": 4.760066509246826,
|
|
"learning_rate": 1.218274111675127e-06,
|
|
"loss": 0.979,
|
|
"mean_token_accuracy": 0.7546529769897461,
|
|
"num_tokens": 4003590.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.013224821973550356,
|
|
"grad_norm": 5.28239631652832,
|
|
"learning_rate": 1.2690355329949238e-06,
|
|
"loss": 0.9364,
|
|
"mean_token_accuracy": 0.7599929571151733,
|
|
"num_tokens": 4163923.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.013733468972533061,
|
|
"grad_norm": 6.529881477355957,
|
|
"learning_rate": 1.319796954314721e-06,
|
|
"loss": 0.9794,
|
|
"mean_token_accuracy": 0.7488726377487183,
|
|
"num_tokens": 4329915.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.014242115971515769,
|
|
"grad_norm": 6.9578962326049805,
|
|
"learning_rate": 1.3705583756345178e-06,
|
|
"loss": 0.9386,
|
|
"mean_token_accuracy": 0.7588946223258972,
|
|
"num_tokens": 4501785.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.014750762970498474,
|
|
"grad_norm": 6.696822643280029,
|
|
"learning_rate": 1.4213197969543148e-06,
|
|
"loss": 0.9373,
|
|
"mean_token_accuracy": 0.7576085329055786,
|
|
"num_tokens": 4665203.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.015259409969481181,
|
|
"grad_norm": 6.5830302238464355,
|
|
"learning_rate": 1.4720812182741118e-06,
|
|
"loss": 0.9173,
|
|
"mean_token_accuracy": 0.7623026371002197,
|
|
"num_tokens": 4826248.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.015768056968463885,
|
|
"grad_norm": 5.765131950378418,
|
|
"learning_rate": 1.5228426395939088e-06,
|
|
"loss": 0.9028,
|
|
"mean_token_accuracy": 0.7635811567306519,
|
|
"num_tokens": 5002184.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.01627670396744659,
|
|
"grad_norm": 5.106930255889893,
|
|
"learning_rate": 1.5736040609137056e-06,
|
|
"loss": 0.919,
|
|
"mean_token_accuracy": 0.7574912905693054,
|
|
"num_tokens": 5159429.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.0167853509664293,
|
|
"grad_norm": 4.388010025024414,
|
|
"learning_rate": 1.6243654822335026e-06,
|
|
"loss": 0.9034,
|
|
"mean_token_accuracy": 0.7649917602539062,
|
|
"num_tokens": 5319981.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.017293997965412006,
|
|
"grad_norm": 3.827232837677002,
|
|
"learning_rate": 1.6751269035532996e-06,
|
|
"loss": 0.9191,
|
|
"mean_token_accuracy": 0.7602884769439697,
|
|
"num_tokens": 5470920.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.01780264496439471,
|
|
"grad_norm": 3.347505807876587,
|
|
"learning_rate": 1.7258883248730964e-06,
|
|
"loss": 0.9235,
|
|
"mean_token_accuracy": 0.7586832642555237,
|
|
"num_tokens": 5621279.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.018311291963377416,
|
|
"grad_norm": 2.964426279067993,
|
|
"learning_rate": 1.7766497461928936e-06,
|
|
"loss": 0.868,
|
|
"mean_token_accuracy": 0.7696095705032349,
|
|
"num_tokens": 5783722.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.018819938962360123,
|
|
"grad_norm": 2.932925224304199,
|
|
"learning_rate": 1.8274111675126904e-06,
|
|
"loss": 0.8643,
|
|
"mean_token_accuracy": 0.7698394060134888,
|
|
"num_tokens": 5946247.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.019328585961342827,
|
|
"grad_norm": 2.958730459213257,
|
|
"learning_rate": 1.8781725888324874e-06,
|
|
"loss": 0.8728,
|
|
"mean_token_accuracy": 0.7667010426521301,
|
|
"num_tokens": 6105483.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.019837232960325534,
|
|
"grad_norm": 2.769481658935547,
|
|
"learning_rate": 1.9289340101522844e-06,
|
|
"loss": 0.8852,
|
|
"mean_token_accuracy": 0.7639551758766174,
|
|
"num_tokens": 6267257.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.02034587995930824,
|
|
"grad_norm": 2.108130693435669,
|
|
"learning_rate": 1.9796954314720814e-06,
|
|
"loss": 0.8248,
|
|
"mean_token_accuracy": 0.7764819860458374,
|
|
"num_tokens": 6423575.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.020854526958290945,
|
|
"grad_norm": 1.7889447212219238,
|
|
"learning_rate": 2.0304568527918785e-06,
|
|
"loss": 0.8387,
|
|
"mean_token_accuracy": 0.774937093257904,
|
|
"num_tokens": 6580647.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.021363173957273652,
|
|
"grad_norm": 1.8017579317092896,
|
|
"learning_rate": 2.0812182741116755e-06,
|
|
"loss": 0.8172,
|
|
"mean_token_accuracy": 0.7769420146942139,
|
|
"num_tokens": 6740933.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.02187182095625636,
|
|
"grad_norm": 2.1292924880981445,
|
|
"learning_rate": 2.1319796954314725e-06,
|
|
"loss": 0.8129,
|
|
"mean_token_accuracy": 0.7777750492095947,
|
|
"num_tokens": 6888534.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.022380467955239063,
|
|
"grad_norm": 2.17268967628479,
|
|
"learning_rate": 2.182741116751269e-06,
|
|
"loss": 0.8078,
|
|
"mean_token_accuracy": 0.7786726951599121,
|
|
"num_tokens": 7053795.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.02288911495422177,
|
|
"grad_norm": 1.9331721067428589,
|
|
"learning_rate": 2.233502538071066e-06,
|
|
"loss": 0.8014,
|
|
"mean_token_accuracy": 0.7789409160614014,
|
|
"num_tokens": 7206610.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.023397761953204477,
|
|
"grad_norm": 1.6656444072723389,
|
|
"learning_rate": 2.284263959390863e-06,
|
|
"loss": 0.8457,
|
|
"mean_token_accuracy": 0.7701550722122192,
|
|
"num_tokens": 7375236.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.023906408952187184,
|
|
"grad_norm": 1.4546806812286377,
|
|
"learning_rate": 2.33502538071066e-06,
|
|
"loss": 0.7827,
|
|
"mean_token_accuracy": 0.7828081846237183,
|
|
"num_tokens": 7533110.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.024415055951169887,
|
|
"grad_norm": 1.4266563653945923,
|
|
"learning_rate": 2.385786802030457e-06,
|
|
"loss": 0.8063,
|
|
"mean_token_accuracy": 0.777489185333252,
|
|
"num_tokens": 7693804.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.024923702950152594,
|
|
"grad_norm": 1.5602530241012573,
|
|
"learning_rate": 2.436548223350254e-06,
|
|
"loss": 0.7513,
|
|
"mean_token_accuracy": 0.793053388595581,
|
|
"num_tokens": 7873397.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.0254323499491353,
|
|
"grad_norm": 1.5479062795639038,
|
|
"learning_rate": 2.487309644670051e-06,
|
|
"loss": 0.788,
|
|
"mean_token_accuracy": 0.7837173938751221,
|
|
"num_tokens": 8024175.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.025940996948118005,
|
|
"grad_norm": 1.4507423639297485,
|
|
"learning_rate": 2.5380710659898476e-06,
|
|
"loss": 0.7655,
|
|
"mean_token_accuracy": 0.7892324328422546,
|
|
"num_tokens": 8177438.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.026449643947100712,
|
|
"grad_norm": 1.288428544998169,
|
|
"learning_rate": 2.588832487309645e-06,
|
|
"loss": 0.7669,
|
|
"mean_token_accuracy": 0.7886277437210083,
|
|
"num_tokens": 8351946.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.02695829094608342,
|
|
"grad_norm": 1.4515126943588257,
|
|
"learning_rate": 2.639593908629442e-06,
|
|
"loss": 0.7712,
|
|
"mean_token_accuracy": 0.7832987308502197,
|
|
"num_tokens": 8504280.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.027466937945066123,
|
|
"grad_norm": 1.3418675661087036,
|
|
"learning_rate": 2.6903553299492387e-06,
|
|
"loss": 0.7384,
|
|
"mean_token_accuracy": 0.7952812910079956,
|
|
"num_tokens": 8661017.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.02797558494404883,
|
|
"grad_norm": 1.3066939115524292,
|
|
"learning_rate": 2.7411167512690357e-06,
|
|
"loss": 0.769,
|
|
"mean_token_accuracy": 0.786482036113739,
|
|
"num_tokens": 8819796.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.028484231943031537,
|
|
"grad_norm": 1.7970499992370605,
|
|
"learning_rate": 2.7918781725888327e-06,
|
|
"loss": 0.7347,
|
|
"mean_token_accuracy": 0.7946780323982239,
|
|
"num_tokens": 8975390.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.02899287894201424,
|
|
"grad_norm": 1.2918215990066528,
|
|
"learning_rate": 2.8426395939086297e-06,
|
|
"loss": 0.7557,
|
|
"mean_token_accuracy": 0.7911810278892517,
|
|
"num_tokens": 9128994.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.029501525940996948,
|
|
"grad_norm": 1.3276824951171875,
|
|
"learning_rate": 2.8934010152284262e-06,
|
|
"loss": 0.7342,
|
|
"mean_token_accuracy": 0.7935407161712646,
|
|
"num_tokens": 9280736.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.030010172939979655,
|
|
"grad_norm": 1.2609550952911377,
|
|
"learning_rate": 2.9441624365482237e-06,
|
|
"loss": 0.7755,
|
|
"mean_token_accuracy": 0.7831066846847534,
|
|
"num_tokens": 9431826.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.030518819938962362,
|
|
"grad_norm": 1.2028979063034058,
|
|
"learning_rate": 2.9949238578680207e-06,
|
|
"loss": 0.717,
|
|
"mean_token_accuracy": 0.7988446950912476,
|
|
"num_tokens": 9587259.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.031027466937945065,
|
|
"grad_norm": 1.1432055234909058,
|
|
"learning_rate": 3.0456852791878177e-06,
|
|
"loss": 0.7201,
|
|
"mean_token_accuracy": 0.7961779236793518,
|
|
"num_tokens": 9735856.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.03153611393692777,
|
|
"grad_norm": 1.147587537765503,
|
|
"learning_rate": 3.0964467005076143e-06,
|
|
"loss": 0.7267,
|
|
"mean_token_accuracy": 0.7953379154205322,
|
|
"num_tokens": 9903156.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.03204476093591048,
|
|
"grad_norm": 1.1921346187591553,
|
|
"learning_rate": 3.1472081218274113e-06,
|
|
"loss": 0.7245,
|
|
"mean_token_accuracy": 0.7943313121795654,
|
|
"num_tokens": 10054899.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.03255340793489318,
|
|
"grad_norm": 1.1654026508331299,
|
|
"learning_rate": 3.1979695431472087e-06,
|
|
"loss": 0.6968,
|
|
"mean_token_accuracy": 0.803705096244812,
|
|
"num_tokens": 10214147.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.03306205493387589,
|
|
"grad_norm": 1.1071702241897583,
|
|
"learning_rate": 3.2487309644670053e-06,
|
|
"loss": 0.6762,
|
|
"mean_token_accuracy": 0.8057570457458496,
|
|
"num_tokens": 10376357.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.0335707019328586,
|
|
"grad_norm": 1.199992299079895,
|
|
"learning_rate": 3.2994923857868023e-06,
|
|
"loss": 0.7448,
|
|
"mean_token_accuracy": 0.7892211675643921,
|
|
"num_tokens": 10528001.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.0340793489318413,
|
|
"grad_norm": 1.173454761505127,
|
|
"learning_rate": 3.3502538071065993e-06,
|
|
"loss": 0.742,
|
|
"mean_token_accuracy": 0.7903908491134644,
|
|
"num_tokens": 10697458.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.03458799593082401,
|
|
"grad_norm": 1.2794227600097656,
|
|
"learning_rate": 3.4010152284263963e-06,
|
|
"loss": 0.7566,
|
|
"mean_token_accuracy": 0.7864009141921997,
|
|
"num_tokens": 10852213.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.035096642929806715,
|
|
"grad_norm": 1.1894865036010742,
|
|
"learning_rate": 3.451776649746193e-06,
|
|
"loss": 0.7158,
|
|
"mean_token_accuracy": 0.7968059778213501,
|
|
"num_tokens": 11018742.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.03560528992878942,
|
|
"grad_norm": 1.1944557428359985,
|
|
"learning_rate": 3.5025380710659903e-06,
|
|
"loss": 0.7453,
|
|
"mean_token_accuracy": 0.7894875407218933,
|
|
"num_tokens": 11178241.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.03611393692777213,
|
|
"grad_norm": 1.1321941614151,
|
|
"learning_rate": 3.5532994923857873e-06,
|
|
"loss": 0.7,
|
|
"mean_token_accuracy": 0.8021224737167358,
|
|
"num_tokens": 11339213.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.03662258392675483,
|
|
"grad_norm": 1.185842514038086,
|
|
"learning_rate": 3.6040609137055843e-06,
|
|
"loss": 0.6916,
|
|
"mean_token_accuracy": 0.801837146282196,
|
|
"num_tokens": 11489548.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.037131230925737536,
|
|
"grad_norm": 1.1836117506027222,
|
|
"learning_rate": 3.654822335025381e-06,
|
|
"loss": 0.6982,
|
|
"mean_token_accuracy": 0.8014888763427734,
|
|
"num_tokens": 11656772.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.03763987792472025,
|
|
"grad_norm": 1.1263853311538696,
|
|
"learning_rate": 3.705583756345178e-06,
|
|
"loss": 0.6811,
|
|
"mean_token_accuracy": 0.8046553730964661,
|
|
"num_tokens": 11818390.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.03814852492370295,
|
|
"grad_norm": 1.2295253276824951,
|
|
"learning_rate": 3.756345177664975e-06,
|
|
"loss": 0.7333,
|
|
"mean_token_accuracy": 0.791732907295227,
|
|
"num_tokens": 11965717.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.038657171922685654,
|
|
"grad_norm": 1.1593804359436035,
|
|
"learning_rate": 3.8071065989847715e-06,
|
|
"loss": 0.7127,
|
|
"mean_token_accuracy": 0.7967889904975891,
|
|
"num_tokens": 12131732.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.039165818921668365,
|
|
"grad_norm": 1.1421594619750977,
|
|
"learning_rate": 3.857868020304569e-06,
|
|
"loss": 0.7309,
|
|
"mean_token_accuracy": 0.7926912307739258,
|
|
"num_tokens": 12283927.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.03967446592065107,
|
|
"grad_norm": 1.0763299465179443,
|
|
"learning_rate": 3.9086294416243655e-06,
|
|
"loss": 0.6936,
|
|
"mean_token_accuracy": 0.8024134635925293,
|
|
"num_tokens": 12436906.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.04018311291963377,
|
|
"grad_norm": 1.1294554471969604,
|
|
"learning_rate": 3.959390862944163e-06,
|
|
"loss": 0.7055,
|
|
"mean_token_accuracy": 0.7977326512336731,
|
|
"num_tokens": 12600258.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.04069175991861648,
|
|
"grad_norm": 1.1827266216278076,
|
|
"learning_rate": 4.0101522842639595e-06,
|
|
"loss": 0.7044,
|
|
"mean_token_accuracy": 0.7989583015441895,
|
|
"num_tokens": 12747689.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.041200406917599186,
|
|
"grad_norm": 1.1358517408370972,
|
|
"learning_rate": 4.060913705583757e-06,
|
|
"loss": 0.6928,
|
|
"mean_token_accuracy": 0.802930474281311,
|
|
"num_tokens": 12911145.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.04170905391658189,
|
|
"grad_norm": 1.1222280263900757,
|
|
"learning_rate": 4.1116751269035535e-06,
|
|
"loss": 0.6935,
|
|
"mean_token_accuracy": 0.8010543584823608,
|
|
"num_tokens": 13081387.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.0422177009155646,
|
|
"grad_norm": 1.1250072717666626,
|
|
"learning_rate": 4.162436548223351e-06,
|
|
"loss": 0.7126,
|
|
"mean_token_accuracy": 0.7962907552719116,
|
|
"num_tokens": 13243319.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.042726347914547304,
|
|
"grad_norm": 1.1086766719818115,
|
|
"learning_rate": 4.2131979695431475e-06,
|
|
"loss": 0.694,
|
|
"mean_token_accuracy": 0.8029916882514954,
|
|
"num_tokens": 13408622.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.04323499491353001,
|
|
"grad_norm": 1.2479902505874634,
|
|
"learning_rate": 4.263959390862945e-06,
|
|
"loss": 0.7087,
|
|
"mean_token_accuracy": 0.7968636155128479,
|
|
"num_tokens": 13562201.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.04374364191251272,
|
|
"grad_norm": 1.1710768938064575,
|
|
"learning_rate": 4.3147208121827415e-06,
|
|
"loss": 0.6947,
|
|
"mean_token_accuracy": 0.801063597202301,
|
|
"num_tokens": 13706016.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.04425228891149542,
|
|
"grad_norm": 1.109525203704834,
|
|
"learning_rate": 4.365482233502538e-06,
|
|
"loss": 0.6921,
|
|
"mean_token_accuracy": 0.8021470308303833,
|
|
"num_tokens": 13872736.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.044760935910478125,
|
|
"grad_norm": 1.1604565382003784,
|
|
"learning_rate": 4.4162436548223355e-06,
|
|
"loss": 0.6485,
|
|
"mean_token_accuracy": 0.8127233386039734,
|
|
"num_tokens": 14029427.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.045269582909460836,
|
|
"grad_norm": 1.1403427124023438,
|
|
"learning_rate": 4.467005076142132e-06,
|
|
"loss": 0.6809,
|
|
"mean_token_accuracy": 0.802753746509552,
|
|
"num_tokens": 14192123.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.04577822990844354,
|
|
"grad_norm": 1.080385684967041,
|
|
"learning_rate": 4.5177664974619295e-06,
|
|
"loss": 0.6581,
|
|
"mean_token_accuracy": 0.8092286586761475,
|
|
"num_tokens": 14356828.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.04628687690742624,
|
|
"grad_norm": 1.1096259355545044,
|
|
"learning_rate": 4.568527918781726e-06,
|
|
"loss": 0.6234,
|
|
"mean_token_accuracy": 0.8174171447753906,
|
|
"num_tokens": 14522417.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.04679552390640895,
|
|
"grad_norm": 1.1488056182861328,
|
|
"learning_rate": 4.6192893401015235e-06,
|
|
"loss": 0.7016,
|
|
"mean_token_accuracy": 0.7984861135482788,
|
|
"num_tokens": 14675485.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.04730417090539166,
|
|
"grad_norm": 1.1531343460083008,
|
|
"learning_rate": 4.67005076142132e-06,
|
|
"loss": 0.6636,
|
|
"mean_token_accuracy": 0.8089845180511475,
|
|
"num_tokens": 14831686.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.04781281790437437,
|
|
"grad_norm": 1.1957042217254639,
|
|
"learning_rate": 4.7208121827411175e-06,
|
|
"loss": 0.6937,
|
|
"mean_token_accuracy": 0.8009766936302185,
|
|
"num_tokens": 14983974.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.04832146490335707,
|
|
"grad_norm": 1.1750566959381104,
|
|
"learning_rate": 4.771573604060914e-06,
|
|
"loss": 0.7169,
|
|
"mean_token_accuracy": 0.7926790118217468,
|
|
"num_tokens": 15135323.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.048830111902339775,
|
|
"grad_norm": 1.2351971864700317,
|
|
"learning_rate": 4.822335025380711e-06,
|
|
"loss": 0.6883,
|
|
"mean_token_accuracy": 0.8023048639297485,
|
|
"num_tokens": 15289033.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.049338758901322485,
|
|
"grad_norm": 1.113542079925537,
|
|
"learning_rate": 4.873096446700508e-06,
|
|
"loss": 0.6929,
|
|
"mean_token_accuracy": 0.7991148829460144,
|
|
"num_tokens": 15439451.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.04984740590030519,
|
|
"grad_norm": 1.1323645114898682,
|
|
"learning_rate": 4.923857868020305e-06,
|
|
"loss": 0.6741,
|
|
"mean_token_accuracy": 0.8045241832733154,
|
|
"num_tokens": 15594811.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.05035605289928789,
|
|
"grad_norm": 1.2108473777770996,
|
|
"learning_rate": 4.974619289340102e-06,
|
|
"loss": 0.6838,
|
|
"mean_token_accuracy": 0.8022953271865845,
|
|
"num_tokens": 15758944.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.0508646998982706,
|
|
"grad_norm": 1.1598291397094727,
|
|
"learning_rate": 5.025380710659899e-06,
|
|
"loss": 0.6273,
|
|
"mean_token_accuracy": 0.8167519569396973,
|
|
"num_tokens": 15922340.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.05137334689725331,
|
|
"grad_norm": 1.1451945304870605,
|
|
"learning_rate": 5.076142131979695e-06,
|
|
"loss": 0.6472,
|
|
"mean_token_accuracy": 0.8123450875282288,
|
|
"num_tokens": 16074162.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.05188199389623601,
|
|
"grad_norm": 1.183913230895996,
|
|
"learning_rate": 5.126903553299493e-06,
|
|
"loss": 0.6701,
|
|
"mean_token_accuracy": 0.8053520321846008,
|
|
"num_tokens": 16238658.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.05239064089521872,
|
|
"grad_norm": 1.09932279586792,
|
|
"learning_rate": 5.17766497461929e-06,
|
|
"loss": 0.6449,
|
|
"mean_token_accuracy": 0.8124725818634033,
|
|
"num_tokens": 16396393.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.052899287894201424,
|
|
"grad_norm": 1.1981287002563477,
|
|
"learning_rate": 5.228426395939087e-06,
|
|
"loss": 0.6698,
|
|
"mean_token_accuracy": 0.8068869113922119,
|
|
"num_tokens": 16546360.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.05340793489318413,
|
|
"grad_norm": 1.1747201681137085,
|
|
"learning_rate": 5.279187817258884e-06,
|
|
"loss": 0.65,
|
|
"mean_token_accuracy": 0.8108594417572021,
|
|
"num_tokens": 16704979.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.05391658189216684,
|
|
"grad_norm": 1.0800034999847412,
|
|
"learning_rate": 5.329949238578681e-06,
|
|
"loss": 0.6449,
|
|
"mean_token_accuracy": 0.8102988004684448,
|
|
"num_tokens": 16860358.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.05442522889114954,
|
|
"grad_norm": 1.1816664934158325,
|
|
"learning_rate": 5.380710659898477e-06,
|
|
"loss": 0.6662,
|
|
"mean_token_accuracy": 0.8073922991752625,
|
|
"num_tokens": 17034067.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.054933875890132246,
|
|
"grad_norm": 1.2378530502319336,
|
|
"learning_rate": 5.431472081218274e-06,
|
|
"loss": 0.696,
|
|
"mean_token_accuracy": 0.7979105710983276,
|
|
"num_tokens": 17181452.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.055442522889114956,
|
|
"grad_norm": 1.102335810661316,
|
|
"learning_rate": 5.482233502538071e-06,
|
|
"loss": 0.6308,
|
|
"mean_token_accuracy": 0.8160994052886963,
|
|
"num_tokens": 17338963.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.05595116988809766,
|
|
"grad_norm": 1.1627287864685059,
|
|
"learning_rate": 5.532994923857869e-06,
|
|
"loss": 0.6639,
|
|
"mean_token_accuracy": 0.8066244125366211,
|
|
"num_tokens": 17493379.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.056459816887080364,
|
|
"grad_norm": 1.1666368246078491,
|
|
"learning_rate": 5.583756345177665e-06,
|
|
"loss": 0.6299,
|
|
"mean_token_accuracy": 0.8159629106521606,
|
|
"num_tokens": 17651741.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.056968463886063074,
|
|
"grad_norm": 1.1785426139831543,
|
|
"learning_rate": 5.634517766497463e-06,
|
|
"loss": 0.6499,
|
|
"mean_token_accuracy": 0.8092576861381531,
|
|
"num_tokens": 17810924.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.05747711088504578,
|
|
"grad_norm": 1.1432498693466187,
|
|
"learning_rate": 5.685279187817259e-06,
|
|
"loss": 0.6509,
|
|
"mean_token_accuracy": 0.8116598129272461,
|
|
"num_tokens": 17976758.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.05798575788402848,
|
|
"grad_norm": 1.162224292755127,
|
|
"learning_rate": 5.736040609137057e-06,
|
|
"loss": 0.6921,
|
|
"mean_token_accuracy": 0.8001761436462402,
|
|
"num_tokens": 18134320.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.05849440488301119,
|
|
"grad_norm": 1.2580808401107788,
|
|
"learning_rate": 5.7868020304568525e-06,
|
|
"loss": 0.682,
|
|
"mean_token_accuracy": 0.8021042346954346,
|
|
"num_tokens": 18276678.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.059003051881993895,
|
|
"grad_norm": 1.1780056953430176,
|
|
"learning_rate": 5.83756345177665e-06,
|
|
"loss": 0.6398,
|
|
"mean_token_accuracy": 0.8138629198074341,
|
|
"num_tokens": 18426725.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.0595116988809766,
|
|
"grad_norm": 1.1504524946212769,
|
|
"learning_rate": 5.888324873096447e-06,
|
|
"loss": 0.6748,
|
|
"mean_token_accuracy": 0.8074554204940796,
|
|
"num_tokens": 18587522.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.06002034587995931,
|
|
"grad_norm": 1.1362825632095337,
|
|
"learning_rate": 5.939086294416244e-06,
|
|
"loss": 0.6635,
|
|
"mean_token_accuracy": 0.8064254522323608,
|
|
"num_tokens": 18749574.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.06052899287894201,
|
|
"grad_norm": 1.161048412322998,
|
|
"learning_rate": 5.989847715736041e-06,
|
|
"loss": 0.6445,
|
|
"mean_token_accuracy": 0.8129458427429199,
|
|
"num_tokens": 18906800.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.061037639877924724,
|
|
"grad_norm": 1.2150564193725586,
|
|
"learning_rate": 6.040609137055839e-06,
|
|
"loss": 0.6979,
|
|
"mean_token_accuracy": 0.7975142002105713,
|
|
"num_tokens": 19068637.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.06154628687690743,
|
|
"grad_norm": 1.2829769849777222,
|
|
"learning_rate": 6.091370558375635e-06,
|
|
"loss": 0.657,
|
|
"mean_token_accuracy": 0.8076584339141846,
|
|
"num_tokens": 19219209.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.06205493387589013,
|
|
"grad_norm": 1.1912235021591187,
|
|
"learning_rate": 6.142131979695432e-06,
|
|
"loss": 0.6365,
|
|
"mean_token_accuracy": 0.8118331432342529,
|
|
"num_tokens": 19367232.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.06256358087487283,
|
|
"grad_norm": 1.1799649000167847,
|
|
"learning_rate": 6.1928934010152285e-06,
|
|
"loss": 0.6319,
|
|
"mean_token_accuracy": 0.8160076141357422,
|
|
"num_tokens": 19525992.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.06307222787385554,
|
|
"grad_norm": 1.133954405784607,
|
|
"learning_rate": 6.243654822335026e-06,
|
|
"loss": 0.6529,
|
|
"mean_token_accuracy": 0.8099106550216675,
|
|
"num_tokens": 19693698.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.06358087487283826,
|
|
"grad_norm": 1.2654860019683838,
|
|
"learning_rate": 6.2944162436548225e-06,
|
|
"loss": 0.6374,
|
|
"mean_token_accuracy": 0.8136156797409058,
|
|
"num_tokens": 19841203.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.06408952187182096,
|
|
"grad_norm": 1.2145419120788574,
|
|
"learning_rate": 6.34517766497462e-06,
|
|
"loss": 0.6777,
|
|
"mean_token_accuracy": 0.8022431135177612,
|
|
"num_tokens": 20001680.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.06459816887080366,
|
|
"grad_norm": 1.3376954793930054,
|
|
"learning_rate": 6.395939086294417e-06,
|
|
"loss": 0.6395,
|
|
"mean_token_accuracy": 0.8126106262207031,
|
|
"num_tokens": 20150787.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.06510681586978637,
|
|
"grad_norm": 1.1641050577163696,
|
|
"learning_rate": 6.446700507614214e-06,
|
|
"loss": 0.6497,
|
|
"mean_token_accuracy": 0.8102061152458191,
|
|
"num_tokens": 20326096.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.06561546286876907,
|
|
"grad_norm": 1.2625352144241333,
|
|
"learning_rate": 6.4974619289340105e-06,
|
|
"loss": 0.6885,
|
|
"mean_token_accuracy": 0.8006555438041687,
|
|
"num_tokens": 20490433.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.06612410986775177,
|
|
"grad_norm": 1.2983770370483398,
|
|
"learning_rate": 6.548223350253807e-06,
|
|
"loss": 0.6793,
|
|
"mean_token_accuracy": 0.8020418882369995,
|
|
"num_tokens": 20657596.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.06663275686673449,
|
|
"grad_norm": 1.1498662233352661,
|
|
"learning_rate": 6.5989847715736045e-06,
|
|
"loss": 0.6307,
|
|
"mean_token_accuracy": 0.8145186305046082,
|
|
"num_tokens": 20827354.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.0671414038657172,
|
|
"grad_norm": 1.192496657371521,
|
|
"learning_rate": 6.649746192893401e-06,
|
|
"loss": 0.6602,
|
|
"mean_token_accuracy": 0.8084685802459717,
|
|
"num_tokens": 20978319.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.0676500508646999,
|
|
"grad_norm": 1.2856016159057617,
|
|
"learning_rate": 6.7005076142131985e-06,
|
|
"loss": 0.637,
|
|
"mean_token_accuracy": 0.814816951751709,
|
|
"num_tokens": 21124807.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.0681586978636826,
|
|
"grad_norm": 1.1739333868026733,
|
|
"learning_rate": 6.751269035532996e-06,
|
|
"loss": 0.6241,
|
|
"mean_token_accuracy": 0.8179885149002075,
|
|
"num_tokens": 21283777.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.0686673448626653,
|
|
"grad_norm": 1.2382988929748535,
|
|
"learning_rate": 6.8020304568527926e-06,
|
|
"loss": 0.6391,
|
|
"mean_token_accuracy": 0.8126323223114014,
|
|
"num_tokens": 21436297.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.06917599186164802,
|
|
"grad_norm": 1.2716588973999023,
|
|
"learning_rate": 6.852791878172589e-06,
|
|
"loss": 0.6571,
|
|
"mean_token_accuracy": 0.807772159576416,
|
|
"num_tokens": 21590885.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.06968463886063073,
|
|
"grad_norm": 1.269378423690796,
|
|
"learning_rate": 6.903553299492386e-06,
|
|
"loss": 0.6855,
|
|
"mean_token_accuracy": 0.7997962236404419,
|
|
"num_tokens": 21746247.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.07019328585961343,
|
|
"grad_norm": 1.189234733581543,
|
|
"learning_rate": 6.954314720812183e-06,
|
|
"loss": 0.6343,
|
|
"mean_token_accuracy": 0.814178466796875,
|
|
"num_tokens": 21898426.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.07070193285859613,
|
|
"grad_norm": 1.3098485469818115,
|
|
"learning_rate": 7.0050761421319806e-06,
|
|
"loss": 0.7037,
|
|
"mean_token_accuracy": 0.7971333265304565,
|
|
"num_tokens": 22054155.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.07121057985757884,
|
|
"grad_norm": 1.3493456840515137,
|
|
"learning_rate": 7.055837563451777e-06,
|
|
"loss": 0.6692,
|
|
"mean_token_accuracy": 0.8043407201766968,
|
|
"num_tokens": 22215818.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.07171922685656154,
|
|
"grad_norm": 1.2597171068191528,
|
|
"learning_rate": 7.106598984771575e-06,
|
|
"loss": 0.6511,
|
|
"mean_token_accuracy": 0.8078402280807495,
|
|
"num_tokens": 22367981.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.07222787385554426,
|
|
"grad_norm": 1.209790587425232,
|
|
"learning_rate": 7.157360406091371e-06,
|
|
"loss": 0.6494,
|
|
"mean_token_accuracy": 0.8108629584312439,
|
|
"num_tokens": 22537203.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.07273652085452696,
|
|
"grad_norm": 1.2577440738677979,
|
|
"learning_rate": 7.208121827411169e-06,
|
|
"loss": 0.6483,
|
|
"mean_token_accuracy": 0.8100079298019409,
|
|
"num_tokens": 22696776.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.07324516785350967,
|
|
"grad_norm": 1.2360994815826416,
|
|
"learning_rate": 7.258883248730964e-06,
|
|
"loss": 0.6236,
|
|
"mean_token_accuracy": 0.8167132139205933,
|
|
"num_tokens": 22850630.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.07375381485249237,
|
|
"grad_norm": 1.2556370496749878,
|
|
"learning_rate": 7.309644670050762e-06,
|
|
"loss": 0.6782,
|
|
"mean_token_accuracy": 0.8049169778823853,
|
|
"num_tokens": 23002043.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.07426246185147507,
|
|
"grad_norm": 1.280638337135315,
|
|
"learning_rate": 7.360406091370559e-06,
|
|
"loss": 0.6085,
|
|
"mean_token_accuracy": 0.8195856809616089,
|
|
"num_tokens": 23159234.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.07477110885045778,
|
|
"grad_norm": 1.2262206077575684,
|
|
"learning_rate": 7.411167512690356e-06,
|
|
"loss": 0.6348,
|
|
"mean_token_accuracy": 0.8120899200439453,
|
|
"num_tokens": 23310355.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.0752797558494405,
|
|
"grad_norm": 1.2795320749282837,
|
|
"learning_rate": 7.461928934010153e-06,
|
|
"loss": 0.6253,
|
|
"mean_token_accuracy": 0.8177874684333801,
|
|
"num_tokens": 23473092.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.0757884028484232,
|
|
"grad_norm": 1.2047311067581177,
|
|
"learning_rate": 7.51269035532995e-06,
|
|
"loss": 0.6419,
|
|
"mean_token_accuracy": 0.8110628724098206,
|
|
"num_tokens": 23627766.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.0762970498474059,
|
|
"grad_norm": 1.2633532285690308,
|
|
"learning_rate": 7.563451776649747e-06,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.8181522488594055,
|
|
"num_tokens": 23777307.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.0768056968463886,
|
|
"grad_norm": 1.2570198774337769,
|
|
"learning_rate": 7.614213197969543e-06,
|
|
"loss": 0.6534,
|
|
"mean_token_accuracy": 0.8082944750785828,
|
|
"num_tokens": 23923715.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.07731434384537131,
|
|
"grad_norm": 1.190126895904541,
|
|
"learning_rate": 7.664974619289341e-06,
|
|
"loss": 0.6394,
|
|
"mean_token_accuracy": 0.8122843503952026,
|
|
"num_tokens": 24087522.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.07782299084435401,
|
|
"grad_norm": 1.2950968742370605,
|
|
"learning_rate": 7.715736040609138e-06,
|
|
"loss": 0.6162,
|
|
"mean_token_accuracy": 0.8173761367797852,
|
|
"num_tokens": 24231067.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.07833163784333673,
|
|
"grad_norm": 1.2913380861282349,
|
|
"learning_rate": 7.766497461928934e-06,
|
|
"loss": 0.6601,
|
|
"mean_token_accuracy": 0.8078927993774414,
|
|
"num_tokens": 24394212.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.07884028484231943,
|
|
"grad_norm": 1.3249062299728394,
|
|
"learning_rate": 7.817258883248731e-06,
|
|
"loss": 0.628,
|
|
"mean_token_accuracy": 0.8148497343063354,
|
|
"num_tokens": 24539665.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.07934893184130214,
|
|
"grad_norm": 1.1700199842453003,
|
|
"learning_rate": 7.86802030456853e-06,
|
|
"loss": 0.6624,
|
|
"mean_token_accuracy": 0.8045659065246582,
|
|
"num_tokens": 24703099.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.07985757884028484,
|
|
"grad_norm": 1.2378876209259033,
|
|
"learning_rate": 7.918781725888326e-06,
|
|
"loss": 0.6401,
|
|
"mean_token_accuracy": 0.8125580549240112,
|
|
"num_tokens": 24851707.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.08036622583926754,
|
|
"grad_norm": 1.1588832139968872,
|
|
"learning_rate": 7.969543147208122e-06,
|
|
"loss": 0.6225,
|
|
"mean_token_accuracy": 0.8164503574371338,
|
|
"num_tokens": 25018302.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.08087487283825025,
|
|
"grad_norm": 1.1309963464736938,
|
|
"learning_rate": 8.020304568527919e-06,
|
|
"loss": 0.6277,
|
|
"mean_token_accuracy": 0.8146978616714478,
|
|
"num_tokens": 25177030.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.08138351983723296,
|
|
"grad_norm": 1.3874728679656982,
|
|
"learning_rate": 8.071065989847716e-06,
|
|
"loss": 0.6509,
|
|
"mean_token_accuracy": 0.8102186918258667,
|
|
"num_tokens": 25345695.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.08189216683621567,
|
|
"grad_norm": 1.1526621580123901,
|
|
"learning_rate": 8.121827411167514e-06,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.8226553201675415,
|
|
"num_tokens": 25513624.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.08240081383519837,
|
|
"grad_norm": 1.2950594425201416,
|
|
"learning_rate": 8.17258883248731e-06,
|
|
"loss": 0.644,
|
|
"mean_token_accuracy": 0.8108351230621338,
|
|
"num_tokens": 25674463.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.08290946083418108,
|
|
"grad_norm": 1.2725694179534912,
|
|
"learning_rate": 8.223350253807107e-06,
|
|
"loss": 0.6204,
|
|
"mean_token_accuracy": 0.8160519003868103,
|
|
"num_tokens": 25834424.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.08341810783316378,
|
|
"grad_norm": 1.3356080055236816,
|
|
"learning_rate": 8.274111675126905e-06,
|
|
"loss": 0.6618,
|
|
"mean_token_accuracy": 0.8052741289138794,
|
|
"num_tokens": 25991547.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.0839267548321465,
|
|
"grad_norm": 1.3054618835449219,
|
|
"learning_rate": 8.324873096446702e-06,
|
|
"loss": 0.6412,
|
|
"mean_token_accuracy": 0.8113834857940674,
|
|
"num_tokens": 26151034.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.0844354018311292,
|
|
"grad_norm": 1.2440849542617798,
|
|
"learning_rate": 8.375634517766498e-06,
|
|
"loss": 0.6125,
|
|
"mean_token_accuracy": 0.8189442753791809,
|
|
"num_tokens": 26310637.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.0849440488301119,
|
|
"grad_norm": 1.4464467763900757,
|
|
"learning_rate": 8.426395939086295e-06,
|
|
"loss": 0.6112,
|
|
"mean_token_accuracy": 0.8193594813346863,
|
|
"num_tokens": 26465678.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.08545269582909461,
|
|
"grad_norm": 1.2905434370040894,
|
|
"learning_rate": 8.477157360406092e-06,
|
|
"loss": 0.6422,
|
|
"mean_token_accuracy": 0.8109447360038757,
|
|
"num_tokens": 26632464.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.08596134282807731,
|
|
"grad_norm": 1.292102575302124,
|
|
"learning_rate": 8.52791878172589e-06,
|
|
"loss": 0.5979,
|
|
"mean_token_accuracy": 0.8221216201782227,
|
|
"num_tokens": 26795346.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.08646998982706001,
|
|
"grad_norm": 1.452039361000061,
|
|
"learning_rate": 8.578680203045686e-06,
|
|
"loss": 0.6576,
|
|
"mean_token_accuracy": 0.8046762943267822,
|
|
"num_tokens": 26951965.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.08697863682604273,
|
|
"grad_norm": 1.2484265565872192,
|
|
"learning_rate": 8.629441624365483e-06,
|
|
"loss": 0.6032,
|
|
"mean_token_accuracy": 0.8209149837493896,
|
|
"num_tokens": 27113922.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.08748728382502544,
|
|
"grad_norm": 1.2269099950790405,
|
|
"learning_rate": 8.68020304568528e-06,
|
|
"loss": 0.6135,
|
|
"mean_token_accuracy": 0.8181469440460205,
|
|
"num_tokens": 27279172.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.08799593082400814,
|
|
"grad_norm": 1.2918200492858887,
|
|
"learning_rate": 8.730964467005076e-06,
|
|
"loss": 0.6379,
|
|
"mean_token_accuracy": 0.8122804164886475,
|
|
"num_tokens": 27449975.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.08850457782299084,
|
|
"grad_norm": 1.2979905605316162,
|
|
"learning_rate": 8.781725888324873e-06,
|
|
"loss": 0.647,
|
|
"mean_token_accuracy": 0.8093007802963257,
|
|
"num_tokens": 27595872.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.08901322482197355,
|
|
"grad_norm": 1.252805471420288,
|
|
"learning_rate": 8.832487309644671e-06,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.8170379400253296,
|
|
"num_tokens": 27760835.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.08952187182095625,
|
|
"grad_norm": 1.3609118461608887,
|
|
"learning_rate": 8.883248730964468e-06,
|
|
"loss": 0.6172,
|
|
"mean_token_accuracy": 0.817752480506897,
|
|
"num_tokens": 27923943.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.09003051881993897,
|
|
"grad_norm": 1.2665863037109375,
|
|
"learning_rate": 8.934010152284264e-06,
|
|
"loss": 0.6405,
|
|
"mean_token_accuracy": 0.8115326166152954,
|
|
"num_tokens": 28085891.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.09053916581892167,
|
|
"grad_norm": 1.2021713256835938,
|
|
"learning_rate": 8.984771573604062e-06,
|
|
"loss": 0.5881,
|
|
"mean_token_accuracy": 0.8245677351951599,
|
|
"num_tokens": 28243518.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.09104781281790437,
|
|
"grad_norm": 1.40549898147583,
|
|
"learning_rate": 9.035532994923859e-06,
|
|
"loss": 0.616,
|
|
"mean_token_accuracy": 0.8165683150291443,
|
|
"num_tokens": 28405483.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.09155645981688708,
|
|
"grad_norm": 1.2603745460510254,
|
|
"learning_rate": 9.086294416243656e-06,
|
|
"loss": 0.6431,
|
|
"mean_token_accuracy": 0.8083624839782715,
|
|
"num_tokens": 28558441.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.09206510681586978,
|
|
"grad_norm": 1.2929681539535522,
|
|
"learning_rate": 9.137055837563452e-06,
|
|
"loss": 0.5976,
|
|
"mean_token_accuracy": 0.8222934007644653,
|
|
"num_tokens": 28707646.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.09257375381485249,
|
|
"grad_norm": 1.1667423248291016,
|
|
"learning_rate": 9.187817258883249e-06,
|
|
"loss": 0.6265,
|
|
"mean_token_accuracy": 0.8158867359161377,
|
|
"num_tokens": 28877502.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.0930824008138352,
|
|
"grad_norm": 1.3487673997879028,
|
|
"learning_rate": 9.238578680203047e-06,
|
|
"loss": 0.6694,
|
|
"mean_token_accuracy": 0.8048607110977173,
|
|
"num_tokens": 29040815.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.0935910478128179,
|
|
"grad_norm": 1.283803105354309,
|
|
"learning_rate": 9.289340101522844e-06,
|
|
"loss": 0.6047,
|
|
"mean_token_accuracy": 0.8185036778450012,
|
|
"num_tokens": 29189342.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.09409969481180061,
|
|
"grad_norm": 1.2010540962219238,
|
|
"learning_rate": 9.34010152284264e-06,
|
|
"loss": 0.6276,
|
|
"mean_token_accuracy": 0.8162474632263184,
|
|
"num_tokens": 29361787.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.09460834181078331,
|
|
"grad_norm": 1.2891279458999634,
|
|
"learning_rate": 9.390862944162438e-06,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.834886908531189,
|
|
"num_tokens": 29523929.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.09511698880976602,
|
|
"grad_norm": 1.2837107181549072,
|
|
"learning_rate": 9.441624365482235e-06,
|
|
"loss": 0.6484,
|
|
"mean_token_accuracy": 0.810014009475708,
|
|
"num_tokens": 29682593.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.09562563580874874,
|
|
"grad_norm": 1.3777011632919312,
|
|
"learning_rate": 9.492385786802032e-06,
|
|
"loss": 0.5963,
|
|
"mean_token_accuracy": 0.8240934014320374,
|
|
"num_tokens": 29839653.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.09613428280773144,
|
|
"grad_norm": 1.269422173500061,
|
|
"learning_rate": 9.543147208121828e-06,
|
|
"loss": 0.6367,
|
|
"mean_token_accuracy": 0.8108811974525452,
|
|
"num_tokens": 30001594.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.09664292980671414,
|
|
"grad_norm": 1.2095832824707031,
|
|
"learning_rate": 9.593908629441625e-06,
|
|
"loss": 0.625,
|
|
"mean_token_accuracy": 0.815091073513031,
|
|
"num_tokens": 30163501.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.09715157680569685,
|
|
"grad_norm": 1.3392397165298462,
|
|
"learning_rate": 9.644670050761421e-06,
|
|
"loss": 0.609,
|
|
"mean_token_accuracy": 0.8181994557380676,
|
|
"num_tokens": 30324797.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.09766022380467955,
|
|
"grad_norm": 1.3514515161514282,
|
|
"learning_rate": 9.69543147208122e-06,
|
|
"loss": 0.6237,
|
|
"mean_token_accuracy": 0.8160215020179749,
|
|
"num_tokens": 30492907.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.09816887080366225,
|
|
"grad_norm": 1.2122633457183838,
|
|
"learning_rate": 9.746192893401016e-06,
|
|
"loss": 0.5802,
|
|
"mean_token_accuracy": 0.8255295753479004,
|
|
"num_tokens": 30667482.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.09867751780264497,
|
|
"grad_norm": 1.3341679573059082,
|
|
"learning_rate": 9.796954314720813e-06,
|
|
"loss": 0.6094,
|
|
"mean_token_accuracy": 0.8188539743423462,
|
|
"num_tokens": 30830951.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.09918616480162767,
|
|
"grad_norm": 1.4670215845108032,
|
|
"learning_rate": 9.84771573604061e-06,
|
|
"loss": 0.6097,
|
|
"mean_token_accuracy": 0.8192304968833923,
|
|
"num_tokens": 30992078.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.09969481180061038,
|
|
"grad_norm": 1.3874598741531372,
|
|
"learning_rate": 9.898477157360406e-06,
|
|
"loss": 0.6024,
|
|
"mean_token_accuracy": 0.8210733532905579,
|
|
"num_tokens": 31153089.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.10020345879959308,
|
|
"grad_norm": 1.337073564529419,
|
|
"learning_rate": 9.949238578680204e-06,
|
|
"loss": 0.606,
|
|
"mean_token_accuracy": 0.8213313817977905,
|
|
"num_tokens": 31297334.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.10071210579857579,
|
|
"grad_norm": 1.4408767223358154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6499,
|
|
"mean_token_accuracy": 0.809918224811554,
|
|
"num_tokens": 31462790.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.10122075279755849,
|
|
"grad_norm": 1.4449899196624756,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6345,
|
|
"mean_token_accuracy": 0.8106706142425537,
|
|
"num_tokens": 31614333.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.1017293997965412,
|
|
"grad_norm": 1.3223861455917358,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.817018985748291,
|
|
"num_tokens": 31757983.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.10223804679552391,
|
|
"grad_norm": 1.4164437055587769,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6305,
|
|
"mean_token_accuracy": 0.8113193511962891,
|
|
"num_tokens": 31902198.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.10274669379450661,
|
|
"grad_norm": 1.3873207569122314,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6092,
|
|
"mean_token_accuracy": 0.8206071257591248,
|
|
"num_tokens": 32072015.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.10325534079348932,
|
|
"grad_norm": 1.3321030139923096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6263,
|
|
"mean_token_accuracy": 0.8133052587509155,
|
|
"num_tokens": 32231536.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.10376398779247202,
|
|
"grad_norm": 1.3445665836334229,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5779,
|
|
"mean_token_accuracy": 0.8258579969406128,
|
|
"num_tokens": 32401421.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.10427263479145472,
|
|
"grad_norm": 7.624189853668213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6111,
|
|
"mean_token_accuracy": 0.8195592164993286,
|
|
"num_tokens": 32559615.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.10478128179043744,
|
|
"grad_norm": 1.406477689743042,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6131,
|
|
"mean_token_accuracy": 0.8170050382614136,
|
|
"num_tokens": 32710194.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.10528992878942015,
|
|
"grad_norm": 1.3041821718215942,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6172,
|
|
"mean_token_accuracy": 0.8164232969284058,
|
|
"num_tokens": 32878068.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.10579857578840285,
|
|
"grad_norm": 1.2968215942382812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6454,
|
|
"mean_token_accuracy": 0.8079843521118164,
|
|
"num_tokens": 33048273.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.10630722278738555,
|
|
"grad_norm": 1.3858803510665894,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6165,
|
|
"mean_token_accuracy": 0.8175449967384338,
|
|
"num_tokens": 33209096.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.10681586978636826,
|
|
"grad_norm": 1.4570480585098267,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5919,
|
|
"mean_token_accuracy": 0.8247489929199219,
|
|
"num_tokens": 33375803.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.10732451678535096,
|
|
"grad_norm": 1.386375904083252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6323,
|
|
"mean_token_accuracy": 0.8123111128807068,
|
|
"num_tokens": 33519566.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.10783316378433368,
|
|
"grad_norm": 1.2808319330215454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6419,
|
|
"mean_token_accuracy": 0.809684157371521,
|
|
"num_tokens": 33675958.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.10834181078331638,
|
|
"grad_norm": 1.3665415048599243,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6165,
|
|
"mean_token_accuracy": 0.8161715865135193,
|
|
"num_tokens": 33848664.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.10885045778229908,
|
|
"grad_norm": 1.195825219154358,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6179,
|
|
"mean_token_accuracy": 0.8146218061447144,
|
|
"num_tokens": 34013911.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.10935910478128179,
|
|
"grad_norm": 1.2427469491958618,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5986,
|
|
"mean_token_accuracy": 0.8210475444793701,
|
|
"num_tokens": 34175183.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.10986775178026449,
|
|
"grad_norm": 1.2118704319000244,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5914,
|
|
"mean_token_accuracy": 0.8235177397727966,
|
|
"num_tokens": 34337717.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.11037639877924721,
|
|
"grad_norm": 1.332649827003479,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6262,
|
|
"mean_token_accuracy": 0.8143296241760254,
|
|
"num_tokens": 34491024.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.11088504577822991,
|
|
"grad_norm": 1.221186876296997,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5921,
|
|
"mean_token_accuracy": 0.8230754137039185,
|
|
"num_tokens": 34668424.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.11139369277721262,
|
|
"grad_norm": 1.2910295724868774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6295,
|
|
"mean_token_accuracy": 0.8133854269981384,
|
|
"num_tokens": 34823916.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.11190233977619532,
|
|
"grad_norm": 1.4481086730957031,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6034,
|
|
"mean_token_accuracy": 0.8207244873046875,
|
|
"num_tokens": 34986133.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.11241098677517802,
|
|
"grad_norm": 1.1247764825820923,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.581,
|
|
"mean_token_accuracy": 0.8286846876144409,
|
|
"num_tokens": 35146182.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.11291963377416073,
|
|
"grad_norm": 1.25968337059021,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6255,
|
|
"mean_token_accuracy": 0.8136856555938721,
|
|
"num_tokens": 35301888.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.11342828077314344,
|
|
"grad_norm": 1.1560875177383423,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8247735500335693,
|
|
"num_tokens": 35461669.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.11393692777212615,
|
|
"grad_norm": 1.282809853553772,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6039,
|
|
"mean_token_accuracy": 0.8183197975158691,
|
|
"num_tokens": 35621625.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.11444557477110885,
|
|
"grad_norm": 1.4031862020492554,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5908,
|
|
"mean_token_accuracy": 0.8242624998092651,
|
|
"num_tokens": 35773479.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.11495422177009156,
|
|
"grad_norm": 1.1410008668899536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8251166939735413,
|
|
"num_tokens": 35936577.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.11546286876907426,
|
|
"grad_norm": 1.26992928981781,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6301,
|
|
"mean_token_accuracy": 0.8128260374069214,
|
|
"num_tokens": 36094734.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.11597151576805696,
|
|
"grad_norm": 1.1656041145324707,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5859,
|
|
"mean_token_accuracy": 0.8240753412246704,
|
|
"num_tokens": 36258492.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.11648016276703968,
|
|
"grad_norm": 1.1628189086914062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.8214801549911499,
|
|
"num_tokens": 36423218.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.11698880976602238,
|
|
"grad_norm": 1.28502357006073,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5744,
|
|
"mean_token_accuracy": 0.8258035182952881,
|
|
"num_tokens": 36569958.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.11749745676500509,
|
|
"grad_norm": 1.224998116493225,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6284,
|
|
"mean_token_accuracy": 0.813615620136261,
|
|
"num_tokens": 36735839.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.11800610376398779,
|
|
"grad_norm": 1.228937029838562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6106,
|
|
"mean_token_accuracy": 0.8186049461364746,
|
|
"num_tokens": 36886881.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.1185147507629705,
|
|
"grad_norm": 1.2199290990829468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.606,
|
|
"mean_token_accuracy": 0.8183395862579346,
|
|
"num_tokens": 37055602.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.1190233977619532,
|
|
"grad_norm": 1.2404224872589111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5845,
|
|
"mean_token_accuracy": 0.8273381590843201,
|
|
"num_tokens": 37215889.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.11953204476093592,
|
|
"grad_norm": 1.2935690879821777,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8240891695022583,
|
|
"num_tokens": 37368358.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.12004069175991862,
|
|
"grad_norm": 1.1917532682418823,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.599,
|
|
"mean_token_accuracy": 0.8223183751106262,
|
|
"num_tokens": 37528382.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.12054933875890132,
|
|
"grad_norm": 1.3194689750671387,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6134,
|
|
"mean_token_accuracy": 0.8174057006835938,
|
|
"num_tokens": 37685103.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.12105798575788403,
|
|
"grad_norm": 1.0998708009719849,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6249,
|
|
"mean_token_accuracy": 0.8146561980247498,
|
|
"num_tokens": 37848901.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.12156663275686673,
|
|
"grad_norm": 1.173926591873169,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.62,
|
|
"mean_token_accuracy": 0.8145238161087036,
|
|
"num_tokens": 38014704.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.12207527975584945,
|
|
"grad_norm": 1.19385826587677,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6203,
|
|
"mean_token_accuracy": 0.8140416145324707,
|
|
"num_tokens": 38166048.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.12258392675483215,
|
|
"grad_norm": 1.2071212530136108,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5703,
|
|
"mean_token_accuracy": 0.829688310623169,
|
|
"num_tokens": 38326365.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.12309257375381485,
|
|
"grad_norm": 1.1595906019210815,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5918,
|
|
"mean_token_accuracy": 0.8218910694122314,
|
|
"num_tokens": 38487350.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.12360122075279756,
|
|
"grad_norm": 1.1498054265975952,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6357,
|
|
"mean_token_accuracy": 0.8128587007522583,
|
|
"num_tokens": 38653367.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.12410986775178026,
|
|
"grad_norm": 1.1582930088043213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6156,
|
|
"mean_token_accuracy": 0.8154253959655762,
|
|
"num_tokens": 38803903.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.12461851475076297,
|
|
"grad_norm": 1.2007235288619995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.821514904499054,
|
|
"num_tokens": 38956439.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.12512716174974567,
|
|
"grad_norm": 1.2579951286315918,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5807,
|
|
"mean_token_accuracy": 0.8249865174293518,
|
|
"num_tokens": 39109262.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.12563580874872837,
|
|
"grad_norm": 1.0463939905166626,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5836,
|
|
"mean_token_accuracy": 0.8253095149993896,
|
|
"num_tokens": 39271741.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.12614445574771108,
|
|
"grad_norm": 1.2439155578613281,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6119,
|
|
"mean_token_accuracy": 0.8164927363395691,
|
|
"num_tokens": 39438988.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.1266531027466938,
|
|
"grad_norm": 1.1544227600097656,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6216,
|
|
"mean_token_accuracy": 0.8164765238761902,
|
|
"num_tokens": 39606007.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.1271617497456765,
|
|
"grad_norm": 1.2033929824829102,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6558,
|
|
"mean_token_accuracy": 0.8053734302520752,
|
|
"num_tokens": 39780179.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.12767039674465921,
|
|
"grad_norm": 1.1998658180236816,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6138,
|
|
"mean_token_accuracy": 0.8166667819023132,
|
|
"num_tokens": 39939761.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.12817904374364192,
|
|
"grad_norm": 1.2700233459472656,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6356,
|
|
"mean_token_accuracy": 0.8122010231018066,
|
|
"num_tokens": 40103824.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.12868769074262462,
|
|
"grad_norm": 1.2417421340942383,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6141,
|
|
"mean_token_accuracy": 0.8166791796684265,
|
|
"num_tokens": 40275322.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.12919633774160733,
|
|
"grad_norm": 1.2107737064361572,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5892,
|
|
"mean_token_accuracy": 0.8222872018814087,
|
|
"num_tokens": 40433373.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.12970498474059003,
|
|
"grad_norm": 1.2591029405593872,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6189,
|
|
"mean_token_accuracy": 0.8156533241271973,
|
|
"num_tokens": 40580246.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.13021363173957273,
|
|
"grad_norm": 1.3103513717651367,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5903,
|
|
"mean_token_accuracy": 0.823656439781189,
|
|
"num_tokens": 40743349.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.13072227873855544,
|
|
"grad_norm": 1.1803346872329712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5796,
|
|
"mean_token_accuracy": 0.8239772319793701,
|
|
"num_tokens": 40909931.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.13123092573753814,
|
|
"grad_norm": 1.326135516166687,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5905,
|
|
"mean_token_accuracy": 0.8228780031204224,
|
|
"num_tokens": 41064135.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.13173957273652084,
|
|
"grad_norm": 1.2487269639968872,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5809,
|
|
"mean_token_accuracy": 0.8256618976593018,
|
|
"num_tokens": 41215121.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.13224821973550355,
|
|
"grad_norm": 1.339449167251587,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5944,
|
|
"mean_token_accuracy": 0.8231635093688965,
|
|
"num_tokens": 41373300.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.13275686673448628,
|
|
"grad_norm": 1.3216067552566528,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5995,
|
|
"mean_token_accuracy": 0.820053219795227,
|
|
"num_tokens": 41538822.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.13326551373346898,
|
|
"grad_norm": 1.2868990898132324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.642,
|
|
"mean_token_accuracy": 0.8083620667457581,
|
|
"num_tokens": 41699217.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.13377416073245169,
|
|
"grad_norm": 1.265202522277832,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5991,
|
|
"mean_token_accuracy": 0.8189837336540222,
|
|
"num_tokens": 41870291.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.1342828077314344,
|
|
"grad_norm": 1.1498230695724487,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6326,
|
|
"mean_token_accuracy": 0.8124827146530151,
|
|
"num_tokens": 42029303.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.1347914547304171,
|
|
"grad_norm": 1.3529750108718872,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6208,
|
|
"mean_token_accuracy": 0.8141924738883972,
|
|
"num_tokens": 42197688.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.1353001017293998,
|
|
"grad_norm": 1.2339895963668823,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6022,
|
|
"mean_token_accuracy": 0.8191970586776733,
|
|
"num_tokens": 42349174.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.1358087487283825,
|
|
"grad_norm": 1.3051806688308716,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8249984979629517,
|
|
"num_tokens": 42506878.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.1363173957273652,
|
|
"grad_norm": 1.1635160446166992,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6192,
|
|
"mean_token_accuracy": 0.8148203492164612,
|
|
"num_tokens": 42676569.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.1368260427263479,
|
|
"grad_norm": 1.296911358833313,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6124,
|
|
"mean_token_accuracy": 0.8165633678436279,
|
|
"num_tokens": 42831149.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.1373346897253306,
|
|
"grad_norm": 1.1763876676559448,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6096,
|
|
"mean_token_accuracy": 0.8170506358146667,
|
|
"num_tokens": 42992073.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.13784333672431331,
|
|
"grad_norm": 1.1894811391830444,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6216,
|
|
"mean_token_accuracy": 0.8166886568069458,
|
|
"num_tokens": 43157608.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.13835198372329605,
|
|
"grad_norm": 1.3227925300598145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8258155584335327,
|
|
"num_tokens": 43319905.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.13886063072227875,
|
|
"grad_norm": 1.2134519815444946,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6191,
|
|
"mean_token_accuracy": 0.8159205317497253,
|
|
"num_tokens": 43483878.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.13936927772126145,
|
|
"grad_norm": 1.390149712562561,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.826744556427002,
|
|
"num_tokens": 43642021.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.13987792472024416,
|
|
"grad_norm": 1.249283790588379,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5884,
|
|
"mean_token_accuracy": 0.8216973543167114,
|
|
"num_tokens": 43788563.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.14038657171922686,
|
|
"grad_norm": 1.194814920425415,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6187,
|
|
"mean_token_accuracy": 0.8158164024353027,
|
|
"num_tokens": 43948725.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.14089521871820956,
|
|
"grad_norm": 1.199250340461731,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8249163031578064,
|
|
"num_tokens": 44107222.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.14140386571719227,
|
|
"grad_norm": 1.2112387418746948,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6018,
|
|
"mean_token_accuracy": 0.8194043636322021,
|
|
"num_tokens": 44265793.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.14191251271617497,
|
|
"grad_norm": 1.2595670223236084,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6115,
|
|
"mean_token_accuracy": 0.8172046542167664,
|
|
"num_tokens": 44416663.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.14242115971515767,
|
|
"grad_norm": 1.3530234098434448,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6033,
|
|
"mean_token_accuracy": 0.8205140829086304,
|
|
"num_tokens": 44585677.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.14292980671414038,
|
|
"grad_norm": 1.3794883489608765,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6051,
|
|
"mean_token_accuracy": 0.8198160529136658,
|
|
"num_tokens": 44733031.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.14343845371312308,
|
|
"grad_norm": 1.2748477458953857,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6258,
|
|
"mean_token_accuracy": 0.8135629892349243,
|
|
"num_tokens": 44896422.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.14394710071210579,
|
|
"grad_norm": 1.3169347047805786,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.8225511312484741,
|
|
"num_tokens": 45063428.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.14445574771108852,
|
|
"grad_norm": 1.1416714191436768,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5782,
|
|
"mean_token_accuracy": 0.8251551985740662,
|
|
"num_tokens": 45222963.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.14496439471007122,
|
|
"grad_norm": 1.3052245378494263,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5924,
|
|
"mean_token_accuracy": 0.822461724281311,
|
|
"num_tokens": 45369420.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.14547304170905392,
|
|
"grad_norm": 1.2727510929107666,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5644,
|
|
"mean_token_accuracy": 0.8323483467102051,
|
|
"num_tokens": 45525565.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.14598168870803663,
|
|
"grad_norm": 1.238297700881958,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6061,
|
|
"mean_token_accuracy": 0.8184632062911987,
|
|
"num_tokens": 45688704.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.14649033570701933,
|
|
"grad_norm": 1.3253711462020874,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6018,
|
|
"mean_token_accuracy": 0.8195759057998657,
|
|
"num_tokens": 45852191.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.14699898270600203,
|
|
"grad_norm": 1.255407452583313,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.8195772171020508,
|
|
"num_tokens": 46009944.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.14750762970498474,
|
|
"grad_norm": 1.3443149328231812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6452,
|
|
"mean_token_accuracy": 0.8103073835372925,
|
|
"num_tokens": 46163464.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.14801627670396744,
|
|
"grad_norm": 1.3617409467697144,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6009,
|
|
"mean_token_accuracy": 0.8228298425674438,
|
|
"num_tokens": 46315469.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.14852492370295015,
|
|
"grad_norm": 1.320160150527954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5897,
|
|
"mean_token_accuracy": 0.8236374855041504,
|
|
"num_tokens": 46475098.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.14903357070193285,
|
|
"grad_norm": 1.1730265617370605,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6014,
|
|
"mean_token_accuracy": 0.8199717998504639,
|
|
"num_tokens": 46645510.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.14954221770091555,
|
|
"grad_norm": 1.2772011756896973,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5856,
|
|
"mean_token_accuracy": 0.8237594366073608,
|
|
"num_tokens": 46799760.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.15005086469989828,
|
|
"grad_norm": 1.3035606145858765,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.82267165184021,
|
|
"num_tokens": 46965987.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.150559511698881,
|
|
"grad_norm": 1.2326747179031372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5695,
|
|
"mean_token_accuracy": 0.8280316591262817,
|
|
"num_tokens": 47128771.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.1510681586978637,
|
|
"grad_norm": 1.2811074256896973,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6225,
|
|
"mean_token_accuracy": 0.8160161972045898,
|
|
"num_tokens": 47288316.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.1515768056968464,
|
|
"grad_norm": 1.2438085079193115,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.821655809879303,
|
|
"num_tokens": 47455167.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.1520854526958291,
|
|
"grad_norm": 1.30879807472229,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6081,
|
|
"mean_token_accuracy": 0.8191161155700684,
|
|
"num_tokens": 47600859.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.1525940996948118,
|
|
"grad_norm": 1.1298332214355469,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5841,
|
|
"mean_token_accuracy": 0.8238013386726379,
|
|
"num_tokens": 47768491.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.1531027466937945,
|
|
"grad_norm": 1.302790880203247,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5917,
|
|
"mean_token_accuracy": 0.8203186392784119,
|
|
"num_tokens": 47918622.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.1536113936927772,
|
|
"grad_norm": 1.094289779663086,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5417,
|
|
"mean_token_accuracy": 0.8354231715202332,
|
|
"num_tokens": 48076479.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.1541200406917599,
|
|
"grad_norm": 1.191530466079712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5879,
|
|
"mean_token_accuracy": 0.8236433267593384,
|
|
"num_tokens": 48226249.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.15462868769074262,
|
|
"grad_norm": 1.3071236610412598,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5461,
|
|
"mean_token_accuracy": 0.8359087109565735,
|
|
"num_tokens": 48390048.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.15513733468972532,
|
|
"grad_norm": 1.0845615863800049,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5688,
|
|
"mean_token_accuracy": 0.8274515867233276,
|
|
"num_tokens": 48558970.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.15564598168870802,
|
|
"grad_norm": 1.2192639112472534,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.8271118402481079,
|
|
"num_tokens": 48712282.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.15615462868769076,
|
|
"grad_norm": 1.2620494365692139,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5981,
|
|
"mean_token_accuracy": 0.8221869468688965,
|
|
"num_tokens": 48874280.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.15666327568667346,
|
|
"grad_norm": 1.1076061725616455,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6341,
|
|
"mean_token_accuracy": 0.8119561076164246,
|
|
"num_tokens": 49032570.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.15717192268565616,
|
|
"grad_norm": 1.1432855129241943,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.577,
|
|
"mean_token_accuracy": 0.826465904712677,
|
|
"num_tokens": 49206160.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.15768056968463887,
|
|
"grad_norm": 1.282848596572876,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6243,
|
|
"mean_token_accuracy": 0.812804102897644,
|
|
"num_tokens": 49357658.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.15818921668362157,
|
|
"grad_norm": 1.22379469871521,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6015,
|
|
"mean_token_accuracy": 0.819198727607727,
|
|
"num_tokens": 49513574.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.15869786368260427,
|
|
"grad_norm": 1.2921515703201294,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.831973671913147,
|
|
"num_tokens": 49678022.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.15920651068158698,
|
|
"grad_norm": 1.274613380432129,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6113,
|
|
"mean_token_accuracy": 0.8184370994567871,
|
|
"num_tokens": 49844519.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.15971515768056968,
|
|
"grad_norm": 1.2928526401519775,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.8329215049743652,
|
|
"num_tokens": 50003658.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.16022380467955238,
|
|
"grad_norm": 1.3217474222183228,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.8197334408760071,
|
|
"num_tokens": 50169430.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.1607324516785351,
|
|
"grad_norm": 1.3033621311187744,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5757,
|
|
"mean_token_accuracy": 0.82660973072052,
|
|
"num_tokens": 50337188.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.1612410986775178,
|
|
"grad_norm": 1.292182445526123,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6152,
|
|
"mean_token_accuracy": 0.8158968687057495,
|
|
"num_tokens": 50488304.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.1617497456765005,
|
|
"grad_norm": 1.2796605825424194,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6127,
|
|
"mean_token_accuracy": 0.8183920383453369,
|
|
"num_tokens": 50649690.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.16225839267548323,
|
|
"grad_norm": 1.2305800914764404,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.827362060546875,
|
|
"num_tokens": 50809934.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.16276703967446593,
|
|
"grad_norm": 1.260392665863037,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6205,
|
|
"mean_token_accuracy": 0.8147035241127014,
|
|
"num_tokens": 50973651.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.16327568667344863,
|
|
"grad_norm": 1.2155548334121704,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6425,
|
|
"mean_token_accuracy": 0.8077457547187805,
|
|
"num_tokens": 51152576.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.16378433367243134,
|
|
"grad_norm": 1.1556423902511597,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.830562949180603,
|
|
"num_tokens": 51302446.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.16429298067141404,
|
|
"grad_norm": 1.4814339876174927,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5983,
|
|
"mean_token_accuracy": 0.8216321468353271,
|
|
"num_tokens": 51443754.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.16480162767039674,
|
|
"grad_norm": 1.1830353736877441,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5914,
|
|
"mean_token_accuracy": 0.823473334312439,
|
|
"num_tokens": 51613127.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.16531027466937945,
|
|
"grad_norm": 1.1541862487792969,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.8245047926902771,
|
|
"num_tokens": 51778162.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.16581892166836215,
|
|
"grad_norm": 1.1448123455047607,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5774,
|
|
"mean_token_accuracy": 0.8273317813873291,
|
|
"num_tokens": 51935798.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.16632756866734486,
|
|
"grad_norm": 1.1235575675964355,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5998,
|
|
"mean_token_accuracy": 0.8195568323135376,
|
|
"num_tokens": 52089803.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.16683621566632756,
|
|
"grad_norm": 1.1180049180984497,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5831,
|
|
"mean_token_accuracy": 0.8214372396469116,
|
|
"num_tokens": 52241386.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.16734486266531026,
|
|
"grad_norm": 1.107740044593811,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6006,
|
|
"mean_token_accuracy": 0.8191947937011719,
|
|
"num_tokens": 52397569.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.167853509664293,
|
|
"grad_norm": 1.2240755558013916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.8211661577224731,
|
|
"num_tokens": 52550468.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.1683621566632757,
|
|
"grad_norm": 1.105646014213562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5789,
|
|
"mean_token_accuracy": 0.8263506293296814,
|
|
"num_tokens": 52705147.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.1688708036622584,
|
|
"grad_norm": 1.18425714969635,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6246,
|
|
"mean_token_accuracy": 0.8139541149139404,
|
|
"num_tokens": 52870603.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.1693794506612411,
|
|
"grad_norm": 1.1311122179031372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6008,
|
|
"mean_token_accuracy": 0.8203775882720947,
|
|
"num_tokens": 53027492.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.1698880976602238,
|
|
"grad_norm": 1.1832103729248047,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6,
|
|
"mean_token_accuracy": 0.8204811811447144,
|
|
"num_tokens": 53186596.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.1703967446592065,
|
|
"grad_norm": 1.1455259323120117,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5843,
|
|
"mean_token_accuracy": 0.8235903978347778,
|
|
"num_tokens": 53341286.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.17090539165818922,
|
|
"grad_norm": 1.187774658203125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6061,
|
|
"mean_token_accuracy": 0.8166907429695129,
|
|
"num_tokens": 53511107.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.17141403865717192,
|
|
"grad_norm": 1.1563060283660889,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8305507302284241,
|
|
"num_tokens": 53673861.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.17192268565615462,
|
|
"grad_norm": 1.2529067993164062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5939,
|
|
"mean_token_accuracy": 0.8233036994934082,
|
|
"num_tokens": 53829950.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.17243133265513733,
|
|
"grad_norm": 1.1666814088821411,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6117,
|
|
"mean_token_accuracy": 0.818543553352356,
|
|
"num_tokens": 53992561.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.17293997965412003,
|
|
"grad_norm": 1.267090082168579,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6046,
|
|
"mean_token_accuracy": 0.8193832635879517,
|
|
"num_tokens": 54156414.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.17344862665310273,
|
|
"grad_norm": 1.122776985168457,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5948,
|
|
"mean_token_accuracy": 0.8207167983055115,
|
|
"num_tokens": 54315479.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.17395727365208546,
|
|
"grad_norm": 1.184189796447754,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.589,
|
|
"mean_token_accuracy": 0.821286678314209,
|
|
"num_tokens": 54487854.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.17446592065106817,
|
|
"grad_norm": 1.2882232666015625,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.584,
|
|
"mean_token_accuracy": 0.8235088586807251,
|
|
"num_tokens": 54637940.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.17497456765005087,
|
|
"grad_norm": 1.1329272985458374,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5877,
|
|
"mean_token_accuracy": 0.8235629796981812,
|
|
"num_tokens": 54799217.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.17548321464903358,
|
|
"grad_norm": 1.1960710287094116,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5744,
|
|
"mean_token_accuracy": 0.8274551033973694,
|
|
"num_tokens": 54952384.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.17599186164801628,
|
|
"grad_norm": 1.1488741636276245,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5993,
|
|
"mean_token_accuracy": 0.8195698261260986,
|
|
"num_tokens": 55126417.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.17650050864699898,
|
|
"grad_norm": 1.2244809865951538,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6311,
|
|
"mean_token_accuracy": 0.8094319701194763,
|
|
"num_tokens": 55296549.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.1770091556459817,
|
|
"grad_norm": 1.1994503736495972,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5694,
|
|
"mean_token_accuracy": 0.8292071223258972,
|
|
"num_tokens": 55457557.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.1775178026449644,
|
|
"grad_norm": 1.4169642925262451,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5897,
|
|
"mean_token_accuracy": 0.821076512336731,
|
|
"num_tokens": 55615322.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.1780264496439471,
|
|
"grad_norm": 1.2098528146743774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6046,
|
|
"mean_token_accuracy": 0.8202278017997742,
|
|
"num_tokens": 55780975.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.1785350966429298,
|
|
"grad_norm": 1.3667292594909668,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6017,
|
|
"mean_token_accuracy": 0.820119321346283,
|
|
"num_tokens": 55931561.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.1790437436419125,
|
|
"grad_norm": 1.2060900926589966,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6278,
|
|
"mean_token_accuracy": 0.8119494915008545,
|
|
"num_tokens": 56097949.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.17955239064089523,
|
|
"grad_norm": 1.2072443962097168,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6356,
|
|
"mean_token_accuracy": 0.809796929359436,
|
|
"num_tokens": 56256104.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.18006103763987794,
|
|
"grad_norm": 1.2915446758270264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.617,
|
|
"mean_token_accuracy": 0.816338062286377,
|
|
"num_tokens": 56427795.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.18056968463886064,
|
|
"grad_norm": 1.329746961593628,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5885,
|
|
"mean_token_accuracy": 0.8227415084838867,
|
|
"num_tokens": 56580755.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.18107833163784334,
|
|
"grad_norm": 1.2750502824783325,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5959,
|
|
"mean_token_accuracy": 0.8213445544242859,
|
|
"num_tokens": 56725887.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.18158697863682605,
|
|
"grad_norm": 1.1861790418624878,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5852,
|
|
"mean_token_accuracy": 0.8219949007034302,
|
|
"num_tokens": 56893128.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.18209562563580875,
|
|
"grad_norm": 1.1133451461791992,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5786,
|
|
"mean_token_accuracy": 0.8256653547286987,
|
|
"num_tokens": 57066741.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.18260427263479145,
|
|
"grad_norm": 1.2725830078125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5759,
|
|
"mean_token_accuracy": 0.8266841173171997,
|
|
"num_tokens": 57215386.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.18311291963377416,
|
|
"grad_norm": 1.1565479040145874,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5981,
|
|
"mean_token_accuracy": 0.8205220699310303,
|
|
"num_tokens": 57382943.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.18362156663275686,
|
|
"grad_norm": 1.2219996452331543,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5773,
|
|
"mean_token_accuracy": 0.8252044320106506,
|
|
"num_tokens": 57533799.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.18413021363173956,
|
|
"grad_norm": 1.3008339405059814,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5909,
|
|
"mean_token_accuracy": 0.8215640783309937,
|
|
"num_tokens": 57697387.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.18463886063072227,
|
|
"grad_norm": 1.0892812013626099,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5967,
|
|
"mean_token_accuracy": 0.821318507194519,
|
|
"num_tokens": 57855776.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.18514750762970497,
|
|
"grad_norm": 1.2349947690963745,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5748,
|
|
"mean_token_accuracy": 0.8269065618515015,
|
|
"num_tokens": 58024333.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.1856561546286877,
|
|
"grad_norm": 1.2553539276123047,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5765,
|
|
"mean_token_accuracy": 0.8262948393821716,
|
|
"num_tokens": 58184182.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.1861648016276704,
|
|
"grad_norm": 1.0978028774261475,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5976,
|
|
"mean_token_accuracy": 0.8190889358520508,
|
|
"num_tokens": 58338761.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.1866734486266531,
|
|
"grad_norm": 1.4213918447494507,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5963,
|
|
"mean_token_accuracy": 0.8194625973701477,
|
|
"num_tokens": 58485678.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.1871820956256358,
|
|
"grad_norm": 1.233219861984253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6218,
|
|
"mean_token_accuracy": 0.8158354759216309,
|
|
"num_tokens": 58643138.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.18769074262461852,
|
|
"grad_norm": 1.2403314113616943,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5756,
|
|
"mean_token_accuracy": 0.826107382774353,
|
|
"num_tokens": 58804789.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.18819938962360122,
|
|
"grad_norm": 1.2945451736450195,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5785,
|
|
"mean_token_accuracy": 0.8255239725112915,
|
|
"num_tokens": 58978624.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.18870803662258392,
|
|
"grad_norm": 1.0491999387741089,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5689,
|
|
"mean_token_accuracy": 0.8316059112548828,
|
|
"num_tokens": 59150156.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.18921668362156663,
|
|
"grad_norm": 1.2332353591918945,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5409,
|
|
"mean_token_accuracy": 0.8354936838150024,
|
|
"num_tokens": 59307831.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.18972533062054933,
|
|
"grad_norm": 1.3186850547790527,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6007,
|
|
"mean_token_accuracy": 0.8197685480117798,
|
|
"num_tokens": 59458464.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.19023397761953204,
|
|
"grad_norm": 1.1561511754989624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.579,
|
|
"mean_token_accuracy": 0.8258455395698547,
|
|
"num_tokens": 59612951.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.19074262461851474,
|
|
"grad_norm": 1.2569273710250854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5515,
|
|
"mean_token_accuracy": 0.8312716484069824,
|
|
"num_tokens": 59774372.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.19125127161749747,
|
|
"grad_norm": 1.122464656829834,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8312299251556396,
|
|
"num_tokens": 59940444.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.19175991861648017,
|
|
"grad_norm": 1.2286045551300049,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5973,
|
|
"mean_token_accuracy": 0.8190140724182129,
|
|
"num_tokens": 60097403.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.19226856561546288,
|
|
"grad_norm": 1.1971975564956665,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5743,
|
|
"mean_token_accuracy": 0.8260021805763245,
|
|
"num_tokens": 60270518.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.19277721261444558,
|
|
"grad_norm": 1.228528618812561,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.8281633257865906,
|
|
"num_tokens": 60434705.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.19328585961342828,
|
|
"grad_norm": 1.1806086301803589,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.818634569644928,
|
|
"num_tokens": 60587162.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.193794506612411,
|
|
"grad_norm": 1.1534594297409058,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5755,
|
|
"mean_token_accuracy": 0.8261923789978027,
|
|
"num_tokens": 60749142.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.1943031536113937,
|
|
"grad_norm": 1.1578582525253296,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5654,
|
|
"mean_token_accuracy": 0.8286859393119812,
|
|
"num_tokens": 60917294.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.1948118006103764,
|
|
"grad_norm": 1.1654207706451416,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5679,
|
|
"mean_token_accuracy": 0.8284820318222046,
|
|
"num_tokens": 61084543.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.1953204476093591,
|
|
"grad_norm": 1.355996012687683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6042,
|
|
"mean_token_accuracy": 0.8220304846763611,
|
|
"num_tokens": 61248579.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.1958290946083418,
|
|
"grad_norm": 1.3036282062530518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8220359086990356,
|
|
"num_tokens": 61403557.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.1963377416073245,
|
|
"grad_norm": 2.102263927459717,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.8245697617530823,
|
|
"num_tokens": 61568050.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.1968463886063072,
|
|
"grad_norm": 1.3433443307876587,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6061,
|
|
"mean_token_accuracy": 0.8184012174606323,
|
|
"num_tokens": 61735941.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.19735503560528994,
|
|
"grad_norm": 1.1168278455734253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5418,
|
|
"mean_token_accuracy": 0.8362807035446167,
|
|
"num_tokens": 61895500.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.19786368260427264,
|
|
"grad_norm": 1.1756104230880737,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.823564887046814,
|
|
"num_tokens": 62062619.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.19837232960325535,
|
|
"grad_norm": 1.446220874786377,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8246034979820251,
|
|
"num_tokens": 62247115.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.19888097660223805,
|
|
"grad_norm": 1.2506608963012695,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5826,
|
|
"mean_token_accuracy": 0.8243875503540039,
|
|
"num_tokens": 62404051.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.19938962360122076,
|
|
"grad_norm": 1.0673178434371948,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5957,
|
|
"mean_token_accuracy": 0.8206828832626343,
|
|
"num_tokens": 62559441.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.19989827060020346,
|
|
"grad_norm": 1.2300033569335938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6234,
|
|
"mean_token_accuracy": 0.8144210577011108,
|
|
"num_tokens": 62719791.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.20040691759918616,
|
|
"grad_norm": 1.2013486623764038,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.8223665952682495,
|
|
"num_tokens": 62863338.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.20091556459816887,
|
|
"grad_norm": 1.1348063945770264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5724,
|
|
"mean_token_accuracy": 0.8273745775222778,
|
|
"num_tokens": 63024566.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.20142421159715157,
|
|
"grad_norm": 1.0478370189666748,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.835296094417572,
|
|
"num_tokens": 63189778.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.20193285859613427,
|
|
"grad_norm": 1.1494406461715698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5979,
|
|
"mean_token_accuracy": 0.8189268112182617,
|
|
"num_tokens": 63347541.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.20244150559511698,
|
|
"grad_norm": 1.1450999975204468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6077,
|
|
"mean_token_accuracy": 0.8168392181396484,
|
|
"num_tokens": 63513614.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.2029501525940997,
|
|
"grad_norm": 1.1753997802734375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5813,
|
|
"mean_token_accuracy": 0.8233904838562012,
|
|
"num_tokens": 63657699.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.2034587995930824,
|
|
"grad_norm": 1.0630018711090088,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8258477449417114,
|
|
"num_tokens": 63810862.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.20396744659206512,
|
|
"grad_norm": 1.2419931888580322,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8186101317405701,
|
|
"num_tokens": 63976337.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.20447609359104782,
|
|
"grad_norm": 1.1836490631103516,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5862,
|
|
"mean_token_accuracy": 0.8243967294692993,
|
|
"num_tokens": 64131809.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.20498474059003052,
|
|
"grad_norm": 1.1719073057174683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6026,
|
|
"mean_token_accuracy": 0.8206160068511963,
|
|
"num_tokens": 64301531.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.20549338758901323,
|
|
"grad_norm": 1.1444793939590454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5803,
|
|
"mean_token_accuracy": 0.8233711123466492,
|
|
"num_tokens": 64453734.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.20600203458799593,
|
|
"grad_norm": 1.15047025680542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6118,
|
|
"mean_token_accuracy": 0.8195493817329407,
|
|
"num_tokens": 64614638.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.20651068158697863,
|
|
"grad_norm": 1.1457444429397583,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5568,
|
|
"mean_token_accuracy": 0.8303655385971069,
|
|
"num_tokens": 64783877.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.20701932858596134,
|
|
"grad_norm": 1.1359026432037354,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8228204250335693,
|
|
"num_tokens": 64946232.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.20752797558494404,
|
|
"grad_norm": 1.1045160293579102,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8265408277511597,
|
|
"num_tokens": 65105547.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.20803662258392674,
|
|
"grad_norm": 1.1697115898132324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5974,
|
|
"mean_token_accuracy": 0.8219442963600159,
|
|
"num_tokens": 65262260.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.20854526958290945,
|
|
"grad_norm": 1.172763466835022,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5835,
|
|
"mean_token_accuracy": 0.8254345655441284,
|
|
"num_tokens": 65425596.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.20905391658189218,
|
|
"grad_norm": 1.169799566268921,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5881,
|
|
"mean_token_accuracy": 0.8227865695953369,
|
|
"num_tokens": 65582379.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.20956256358087488,
|
|
"grad_norm": 1.13645601272583,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.606,
|
|
"mean_token_accuracy": 0.8167718052864075,
|
|
"num_tokens": 65753543.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.2100712105798576,
|
|
"grad_norm": 1.1254587173461914,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.827894926071167,
|
|
"num_tokens": 65920741.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.2105798575788403,
|
|
"grad_norm": 1.0942331552505493,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.554,
|
|
"mean_token_accuracy": 0.8323182463645935,
|
|
"num_tokens": 66079261.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.211088504577823,
|
|
"grad_norm": 1.1396540403366089,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5848,
|
|
"mean_token_accuracy": 0.8227716684341431,
|
|
"num_tokens": 66249455.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.2115971515768057,
|
|
"grad_norm": 1.1977486610412598,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6108,
|
|
"mean_token_accuracy": 0.816601037979126,
|
|
"num_tokens": 66410765.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.2121057985757884,
|
|
"grad_norm": 1.100052833557129,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5754,
|
|
"mean_token_accuracy": 0.8266069889068604,
|
|
"num_tokens": 66564476.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.2126144455747711,
|
|
"grad_norm": 1.5787156820297241,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6063,
|
|
"mean_token_accuracy": 0.8198506832122803,
|
|
"num_tokens": 66737804.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.2131230925737538,
|
|
"grad_norm": 1.1844232082366943,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6005,
|
|
"mean_token_accuracy": 0.8195633292198181,
|
|
"num_tokens": 66895240.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.2136317395727365,
|
|
"grad_norm": 1.130846619606018,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.581,
|
|
"mean_token_accuracy": 0.8251656293869019,
|
|
"num_tokens": 67056288.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.21414038657171922,
|
|
"grad_norm": 1.147178053855896,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5915,
|
|
"mean_token_accuracy": 0.8218474984169006,
|
|
"num_tokens": 67206207.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.21464903357070192,
|
|
"grad_norm": 1.1447371244430542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8317303657531738,
|
|
"num_tokens": 67369756.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.21515768056968465,
|
|
"grad_norm": 1.1209125518798828,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5624,
|
|
"mean_token_accuracy": 0.8276084661483765,
|
|
"num_tokens": 67527516.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.21566632756866735,
|
|
"grad_norm": 1.0791105031967163,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8268887996673584,
|
|
"num_tokens": 67688042.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.21617497456765006,
|
|
"grad_norm": 1.2122269868850708,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5764,
|
|
"mean_token_accuracy": 0.8255937099456787,
|
|
"num_tokens": 67854191.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.21668362156663276,
|
|
"grad_norm": 1.1728301048278809,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5644,
|
|
"mean_token_accuracy": 0.8275389671325684,
|
|
"num_tokens": 68008533.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.21719226856561547,
|
|
"grad_norm": 1.1808736324310303,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5863,
|
|
"mean_token_accuracy": 0.8220161199569702,
|
|
"num_tokens": 68162894.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.21770091556459817,
|
|
"grad_norm": 1.132716178894043,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8360246419906616,
|
|
"num_tokens": 68332401.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.21820956256358087,
|
|
"grad_norm": 1.187079906463623,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8317699432373047,
|
|
"num_tokens": 68489170.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.21871820956256358,
|
|
"grad_norm": 1.159859538078308,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5951,
|
|
"mean_token_accuracy": 0.8213475942611694,
|
|
"num_tokens": 68643444.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.21922685656154628,
|
|
"grad_norm": 1.2300002574920654,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5801,
|
|
"mean_token_accuracy": 0.82547527551651,
|
|
"num_tokens": 68812499.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.21973550356052898,
|
|
"grad_norm": 1.209106206893921,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5688,
|
|
"mean_token_accuracy": 0.8265661001205444,
|
|
"num_tokens": 68976455.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.2202441505595117,
|
|
"grad_norm": 1.1866743564605713,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6141,
|
|
"mean_token_accuracy": 0.8169054388999939,
|
|
"num_tokens": 69142925.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.22075279755849442,
|
|
"grad_norm": 1.195504069328308,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.603,
|
|
"mean_token_accuracy": 0.8177850246429443,
|
|
"num_tokens": 69307832.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.22126144455747712,
|
|
"grad_norm": 1.1744379997253418,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8307361602783203,
|
|
"num_tokens": 69466104.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.22177009155645983,
|
|
"grad_norm": 1.1425153017044067,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5404,
|
|
"mean_token_accuracy": 0.8364987373352051,
|
|
"num_tokens": 69619859.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.22227873855544253,
|
|
"grad_norm": 1.135064721107483,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5463,
|
|
"mean_token_accuracy": 0.8337409496307373,
|
|
"num_tokens": 69774687.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.22278738555442523,
|
|
"grad_norm": 1.2530827522277832,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6011,
|
|
"mean_token_accuracy": 0.8196946978569031,
|
|
"num_tokens": 69924843.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.22329603255340794,
|
|
"grad_norm": 1.1789072751998901,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.585,
|
|
"mean_token_accuracy": 0.8250002861022949,
|
|
"num_tokens": 70088441.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.22380467955239064,
|
|
"grad_norm": 1.1316715478897095,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.8247166275978088,
|
|
"num_tokens": 70244433.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.22431332655137334,
|
|
"grad_norm": 1.2392256259918213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5904,
|
|
"mean_token_accuracy": 0.8200168609619141,
|
|
"num_tokens": 70412934.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.22482197355035605,
|
|
"grad_norm": 1.2182955741882324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5784,
|
|
"mean_token_accuracy": 0.8252276182174683,
|
|
"num_tokens": 70568120.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.22533062054933875,
|
|
"grad_norm": 1.1843059062957764,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5939,
|
|
"mean_token_accuracy": 0.8191959261894226,
|
|
"num_tokens": 70725251.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.22583926754832145,
|
|
"grad_norm": 1.1350986957550049,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5896,
|
|
"mean_token_accuracy": 0.821168839931488,
|
|
"num_tokens": 70881374.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.22634791454730416,
|
|
"grad_norm": 1.2637605667114258,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.8221958875656128,
|
|
"num_tokens": 71029051.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.2268565615462869,
|
|
"grad_norm": 1.313187837600708,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8275098204612732,
|
|
"num_tokens": 71192310.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.2273652085452696,
|
|
"grad_norm": 1.166354775428772,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6006,
|
|
"mean_token_accuracy": 0.8188575506210327,
|
|
"num_tokens": 71355710.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.2278738555442523,
|
|
"grad_norm": 1.203367829322815,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6033,
|
|
"mean_token_accuracy": 0.819640040397644,
|
|
"num_tokens": 71509580.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.228382502543235,
|
|
"grad_norm": 1.1382862329483032,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8330165147781372,
|
|
"num_tokens": 71666995.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.2288911495422177,
|
|
"grad_norm": 1.3082048892974854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5732,
|
|
"mean_token_accuracy": 0.8270379304885864,
|
|
"num_tokens": 71825201.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.2293997965412004,
|
|
"grad_norm": 1.1791752576828003,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5586,
|
|
"mean_token_accuracy": 0.8284664154052734,
|
|
"num_tokens": 71984961.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.2299084435401831,
|
|
"grad_norm": 1.2788444757461548,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5792,
|
|
"mean_token_accuracy": 0.8253182172775269,
|
|
"num_tokens": 72158603.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.23041709053916581,
|
|
"grad_norm": 1.2335439920425415,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5577,
|
|
"mean_token_accuracy": 0.832517147064209,
|
|
"num_tokens": 72313855.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.23092573753814852,
|
|
"grad_norm": 1.103445053100586,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5746,
|
|
"mean_token_accuracy": 0.825374186038971,
|
|
"num_tokens": 72476307.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.23143438453713122,
|
|
"grad_norm": 1.1981607675552368,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.8257123231887817,
|
|
"num_tokens": 72630768.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.23194303153611392,
|
|
"grad_norm": 1.118881106376648,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5706,
|
|
"mean_token_accuracy": 0.8260403275489807,
|
|
"num_tokens": 72805014.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.23245167853509666,
|
|
"grad_norm": 1.1847957372665405,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.838158369064331,
|
|
"num_tokens": 72965078.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.23296032553407936,
|
|
"grad_norm": 1.1698648929595947,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8327252864837646,
|
|
"num_tokens": 73123811.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.23346897253306206,
|
|
"grad_norm": 1.185947060585022,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8305568695068359,
|
|
"num_tokens": 73286856.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.23397761953204477,
|
|
"grad_norm": 1.122018814086914,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8231082558631897,
|
|
"num_tokens": 73432474.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.23448626653102747,
|
|
"grad_norm": 1.1647952795028687,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5715,
|
|
"mean_token_accuracy": 0.8255927562713623,
|
|
"num_tokens": 73589719.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.23499491353001017,
|
|
"grad_norm": 1.113539695739746,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8253976106643677,
|
|
"num_tokens": 73759617.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.23550356052899288,
|
|
"grad_norm": 1.204064130783081,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5947,
|
|
"mean_token_accuracy": 0.8204372525215149,
|
|
"num_tokens": 73911743.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.23601220752797558,
|
|
"grad_norm": 1.2407692670822144,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5819,
|
|
"mean_token_accuracy": 0.8239152431488037,
|
|
"num_tokens": 74082923.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.23652085452695829,
|
|
"grad_norm": 1.24295175075531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5675,
|
|
"mean_token_accuracy": 0.8276124000549316,
|
|
"num_tokens": 74243087.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.237029501525941,
|
|
"grad_norm": 1.0703984498977661,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.829281747341156,
|
|
"num_tokens": 74410401.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.2375381485249237,
|
|
"grad_norm": 1.217100739479065,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.571,
|
|
"mean_token_accuracy": 0.8262215256690979,
|
|
"num_tokens": 74568603.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.2380467955239064,
|
|
"grad_norm": 1.1968027353286743,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8254478573799133,
|
|
"num_tokens": 74727709.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.23855544252288913,
|
|
"grad_norm": 1.1246533393859863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5604,
|
|
"mean_token_accuracy": 0.8291188478469849,
|
|
"num_tokens": 74890980.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.23906408952187183,
|
|
"grad_norm": 1.2297966480255127,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5292,
|
|
"mean_token_accuracy": 0.8388729691505432,
|
|
"num_tokens": 75056158.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.23957273652085453,
|
|
"grad_norm": 1.157701849937439,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.567,
|
|
"mean_token_accuracy": 0.8297462463378906,
|
|
"num_tokens": 75217454.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.24008138351983724,
|
|
"grad_norm": 1.1809066534042358,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.827551007270813,
|
|
"num_tokens": 75380129.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.24059003051881994,
|
|
"grad_norm": 1.1710585355758667,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.8329430818557739,
|
|
"num_tokens": 75538207.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.24109867751780265,
|
|
"grad_norm": 1.2090671062469482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5412,
|
|
"mean_token_accuracy": 0.8352799415588379,
|
|
"num_tokens": 75692158.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.24160732451678535,
|
|
"grad_norm": 1.1606247425079346,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5525,
|
|
"mean_token_accuracy": 0.8310859203338623,
|
|
"num_tokens": 75850325.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.24211597151576805,
|
|
"grad_norm": 1.1594878435134888,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.8359923958778381,
|
|
"num_tokens": 76015367.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.24262461851475076,
|
|
"grad_norm": 1.145020842552185,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8321256041526794,
|
|
"num_tokens": 76165209.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.24313326551373346,
|
|
"grad_norm": 1.1626603603363037,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5882,
|
|
"mean_token_accuracy": 0.8215240240097046,
|
|
"num_tokens": 76309618.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.24364191251271616,
|
|
"grad_norm": 1.2585151195526123,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8327075242996216,
|
|
"num_tokens": 76468138.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.2441505595116989,
|
|
"grad_norm": 1.1981087923049927,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5643,
|
|
"mean_token_accuracy": 0.8272363543510437,
|
|
"num_tokens": 76638347.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.2446592065106816,
|
|
"grad_norm": 1.1107510328292847,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5826,
|
|
"mean_token_accuracy": 0.8227603435516357,
|
|
"num_tokens": 76807912.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.2451678535096643,
|
|
"grad_norm": 1.204334020614624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8287724256515503,
|
|
"num_tokens": 76960377.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.245676500508647,
|
|
"grad_norm": 1.0959969758987427,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6225,
|
|
"mean_token_accuracy": 0.8141569495201111,
|
|
"num_tokens": 77135219.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.2461851475076297,
|
|
"grad_norm": 1.1571955680847168,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5869,
|
|
"mean_token_accuracy": 0.823060929775238,
|
|
"num_tokens": 77284277.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.2466937945066124,
|
|
"grad_norm": 1.116564393043518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5758,
|
|
"mean_token_accuracy": 0.8271507024765015,
|
|
"num_tokens": 77463576.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.24720244150559512,
|
|
"grad_norm": 1.211808443069458,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5533,
|
|
"mean_token_accuracy": 0.8313890695571899,
|
|
"num_tokens": 77624014.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.24771108850457782,
|
|
"grad_norm": 1.2386025190353394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8249900937080383,
|
|
"num_tokens": 77786805.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.24821973550356052,
|
|
"grad_norm": 1.2798279523849487,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5859,
|
|
"mean_token_accuracy": 0.8219289779663086,
|
|
"num_tokens": 77955105.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.24872838250254323,
|
|
"grad_norm": 1.3217813968658447,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8318721652030945,
|
|
"num_tokens": 78103025.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.24923702950152593,
|
|
"grad_norm": 1.253214716911316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.830656886100769,
|
|
"num_tokens": 78259934.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.24974567650050863,
|
|
"grad_norm": 1.133701205253601,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.8326247930526733,
|
|
"num_tokens": 78420450.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.25025432349949134,
|
|
"grad_norm": 1.265200138092041,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5736,
|
|
"mean_token_accuracy": 0.8258102536201477,
|
|
"num_tokens": 78575928.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.25076297049847407,
|
|
"grad_norm": 1.128560185432434,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8270517587661743,
|
|
"num_tokens": 78723238.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.25127161749745675,
|
|
"grad_norm": 1.2510902881622314,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5781,
|
|
"mean_token_accuracy": 0.8246864080429077,
|
|
"num_tokens": 78897539.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.2517802644964395,
|
|
"grad_norm": 1.1227195262908936,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5312,
|
|
"mean_token_accuracy": 0.8382051587104797,
|
|
"num_tokens": 79059834.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.25228891149542215,
|
|
"grad_norm": 1.2865270376205444,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5752,
|
|
"mean_token_accuracy": 0.8254743218421936,
|
|
"num_tokens": 79206412.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.2527975584944049,
|
|
"grad_norm": 1.1764838695526123,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5741,
|
|
"mean_token_accuracy": 0.8257397413253784,
|
|
"num_tokens": 79379375.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.2533062054933876,
|
|
"grad_norm": 1.19319748878479,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5585,
|
|
"mean_token_accuracy": 0.8302797675132751,
|
|
"num_tokens": 79550225.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.2538148524923703,
|
|
"grad_norm": 1.2153621912002563,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8351129293441772,
|
|
"num_tokens": 79705365.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.254323499491353,
|
|
"grad_norm": 1.177880883216858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8263179063796997,
|
|
"num_tokens": 79861100.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.2548321464903357,
|
|
"grad_norm": 1.2405526638031006,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5707,
|
|
"mean_token_accuracy": 0.8265781402587891,
|
|
"num_tokens": 80017875.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.25534079348931843,
|
|
"grad_norm": 1.1290812492370605,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5704,
|
|
"mean_token_accuracy": 0.8263634443283081,
|
|
"num_tokens": 80184953.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.2558494404883011,
|
|
"grad_norm": 1.1392712593078613,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.828170120716095,
|
|
"num_tokens": 80342864.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.25635808748728384,
|
|
"grad_norm": 1.1106423139572144,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8346061706542969,
|
|
"num_tokens": 80497644.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.2568667344862665,
|
|
"grad_norm": 1.235333800315857,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6029,
|
|
"mean_token_accuracy": 0.817632794380188,
|
|
"num_tokens": 80652132.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.25737538148524924,
|
|
"grad_norm": 1.1881519556045532,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8317403793334961,
|
|
"num_tokens": 80815501.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.2578840284842319,
|
|
"grad_norm": 1.3391587734222412,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5836,
|
|
"mean_token_accuracy": 0.8228754997253418,
|
|
"num_tokens": 80982538.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.25839267548321465,
|
|
"grad_norm": 1.2976542711257935,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5823,
|
|
"mean_token_accuracy": 0.8225487470626831,
|
|
"num_tokens": 81137163.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.2589013224821974,
|
|
"grad_norm": 1.196365475654602,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5924,
|
|
"mean_token_accuracy": 0.8200660943984985,
|
|
"num_tokens": 81304752.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.25940996948118006,
|
|
"grad_norm": 1.3634917736053467,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5546,
|
|
"mean_token_accuracy": 0.829670786857605,
|
|
"num_tokens": 81448192.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.2599186164801628,
|
|
"grad_norm": 1.3710311651229858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5916,
|
|
"mean_token_accuracy": 0.8205992579460144,
|
|
"num_tokens": 81608372.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.26042726347914547,
|
|
"grad_norm": 1.280065894126892,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8289808034896851,
|
|
"num_tokens": 81757788.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.2609359104781282,
|
|
"grad_norm": 1.1710976362228394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5536,
|
|
"mean_token_accuracy": 0.8313885927200317,
|
|
"num_tokens": 81917301.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.2614445574771109,
|
|
"grad_norm": 1.1584504842758179,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.8271463513374329,
|
|
"num_tokens": 82085431.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.2619532044760936,
|
|
"grad_norm": 1.0697828531265259,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5555,
|
|
"mean_token_accuracy": 0.8318459391593933,
|
|
"num_tokens": 82238284.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.2624618514750763,
|
|
"grad_norm": 1.1559085845947266,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5851,
|
|
"mean_token_accuracy": 0.8227416276931763,
|
|
"num_tokens": 82404647.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.262970498474059,
|
|
"grad_norm": 1.1305633783340454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5602,
|
|
"mean_token_accuracy": 0.8290926814079285,
|
|
"num_tokens": 82572960.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.2634791454730417,
|
|
"grad_norm": 1.0789847373962402,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5676,
|
|
"mean_token_accuracy": 0.8283363580703735,
|
|
"num_tokens": 82738083.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.2639877924720244,
|
|
"grad_norm": 1.2178400754928589,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8298940658569336,
|
|
"num_tokens": 82887087.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.2644964394710071,
|
|
"grad_norm": 1.152214527130127,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5696,
|
|
"mean_token_accuracy": 0.826300323009491,
|
|
"num_tokens": 83033584.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.2650050864699898,
|
|
"grad_norm": 1.2611678838729858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5777,
|
|
"mean_token_accuracy": 0.8233135342597961,
|
|
"num_tokens": 83195499.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.26551373346897256,
|
|
"grad_norm": 1.1738433837890625,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.8280128240585327,
|
|
"num_tokens": 83363928.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.26602238046795523,
|
|
"grad_norm": 1.1233623027801514,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.828639805316925,
|
|
"num_tokens": 83520177.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.26653102746693796,
|
|
"grad_norm": 1.09394371509552,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5443,
|
|
"mean_token_accuracy": 0.8328458666801453,
|
|
"num_tokens": 83680255.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.26703967446592064,
|
|
"grad_norm": 1.126006841659546,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5638,
|
|
"mean_token_accuracy": 0.8271574974060059,
|
|
"num_tokens": 83852400.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.26754832146490337,
|
|
"grad_norm": 1.186543583869934,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5908,
|
|
"mean_token_accuracy": 0.8196697235107422,
|
|
"num_tokens": 84014162.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.26805696846388605,
|
|
"grad_norm": 1.0805085897445679,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.8328375220298767,
|
|
"num_tokens": 84170100.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.2685656154628688,
|
|
"grad_norm": 1.1757475137710571,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5305,
|
|
"mean_token_accuracy": 0.8382119536399841,
|
|
"num_tokens": 84329460.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.26907426246185145,
|
|
"grad_norm": 1.1372390985488892,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5253,
|
|
"mean_token_accuracy": 0.8392993211746216,
|
|
"num_tokens": 84486375.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.2695829094608342,
|
|
"grad_norm": 1.1432785987854004,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5336,
|
|
"mean_token_accuracy": 0.837626576423645,
|
|
"num_tokens": 84643661.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.27009155645981686,
|
|
"grad_norm": 1.1244240999221802,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5488,
|
|
"mean_token_accuracy": 0.83336341381073,
|
|
"num_tokens": 84812504.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.2706002034587996,
|
|
"grad_norm": 1.108756184577942,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5488,
|
|
"mean_token_accuracy": 0.8323627710342407,
|
|
"num_tokens": 84968874.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.2711088504577823,
|
|
"grad_norm": 1.1216325759887695,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5543,
|
|
"mean_token_accuracy": 0.8296369314193726,
|
|
"num_tokens": 85129918.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.271617497456765,
|
|
"grad_norm": 1.092794418334961,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8335623741149902,
|
|
"num_tokens": 85292100.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.27212614445574773,
|
|
"grad_norm": 1.1919782161712646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5328,
|
|
"mean_token_accuracy": 0.839339017868042,
|
|
"num_tokens": 85445236.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.2726347914547304,
|
|
"grad_norm": 1.1656126976013184,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5675,
|
|
"mean_token_accuracy": 0.827284574508667,
|
|
"num_tokens": 85605945.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.27314343845371314,
|
|
"grad_norm": 1.3499900102615356,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5479,
|
|
"mean_token_accuracy": 0.8318436145782471,
|
|
"num_tokens": 85760177.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.2736520854526958,
|
|
"grad_norm": 1.0947843790054321,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5664,
|
|
"mean_token_accuracy": 0.8289875984191895,
|
|
"num_tokens": 85924331.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.27416073245167855,
|
|
"grad_norm": 1.0087196826934814,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5853,
|
|
"mean_token_accuracy": 0.8228893280029297,
|
|
"num_tokens": 86087686.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.2746693794506612,
|
|
"grad_norm": 1.1464523077011108,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5818,
|
|
"mean_token_accuracy": 0.8230845928192139,
|
|
"num_tokens": 86231923.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.27517802644964395,
|
|
"grad_norm": 1.132155179977417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5538,
|
|
"mean_token_accuracy": 0.8299921751022339,
|
|
"num_tokens": 86393599.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 0.27568667344862663,
|
|
"grad_norm": 1.1671158075332642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5237,
|
|
"mean_token_accuracy": 0.8375282287597656,
|
|
"num_tokens": 86546912.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 0.27619532044760936,
|
|
"grad_norm": 1.2495208978652954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6201,
|
|
"mean_token_accuracy": 0.814518392086029,
|
|
"num_tokens": 86710627.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 0.2767039674465921,
|
|
"grad_norm": 1.2193448543548584,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8245118856430054,
|
|
"num_tokens": 86872441.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 0.27721261444557477,
|
|
"grad_norm": 1.4755446910858154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.58,
|
|
"mean_token_accuracy": 0.8226820230484009,
|
|
"num_tokens": 87040426.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.2777212614445575,
|
|
"grad_norm": 1.0825291872024536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5649,
|
|
"mean_token_accuracy": 0.8284255862236023,
|
|
"num_tokens": 87204457.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 0.2782299084435402,
|
|
"grad_norm": 1.168476939201355,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5733,
|
|
"mean_token_accuracy": 0.8267923593521118,
|
|
"num_tokens": 87350790.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 0.2787385554425229,
|
|
"grad_norm": 1.2543644905090332,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5804,
|
|
"mean_token_accuracy": 0.8234599232673645,
|
|
"num_tokens": 87501470.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 0.2792472024415056,
|
|
"grad_norm": 1.14836847782135,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5712,
|
|
"mean_token_accuracy": 0.8264409303665161,
|
|
"num_tokens": 87654280.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 0.2797558494404883,
|
|
"grad_norm": 1.1316653490066528,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5536,
|
|
"mean_token_accuracy": 0.8308054208755493,
|
|
"num_tokens": 87816423.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.280264496439471,
|
|
"grad_norm": 1.271012783050537,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5891,
|
|
"mean_token_accuracy": 0.8203872442245483,
|
|
"num_tokens": 87960156.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 0.2807731434384537,
|
|
"grad_norm": 1.1925300359725952,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5343,
|
|
"mean_token_accuracy": 0.8370486497879028,
|
|
"num_tokens": 88105701.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 0.2812817904374364,
|
|
"grad_norm": 1.173293113708496,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5629,
|
|
"mean_token_accuracy": 0.8282473087310791,
|
|
"num_tokens": 88261189.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 0.28179043743641913,
|
|
"grad_norm": 1.1886355876922607,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8262102603912354,
|
|
"num_tokens": 88429993.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 0.2822990844354018,
|
|
"grad_norm": 1.221113681793213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8296666145324707,
|
|
"num_tokens": 88598116.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.28280773143438453,
|
|
"grad_norm": 1.2348664999008179,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5854,
|
|
"mean_token_accuracy": 0.8205877542495728,
|
|
"num_tokens": 88754109.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 0.28331637843336727,
|
|
"grad_norm": 1.3055741786956787,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5708,
|
|
"mean_token_accuracy": 0.8285917043685913,
|
|
"num_tokens": 88911407.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 0.28382502543234994,
|
|
"grad_norm": 1.2409954071044922,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6024,
|
|
"mean_token_accuracy": 0.8154451847076416,
|
|
"num_tokens": 89075556.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 0.2843336724313327,
|
|
"grad_norm": 1.163521647453308,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8276627659797668,
|
|
"num_tokens": 89234090.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 0.28484231943031535,
|
|
"grad_norm": 1.3338189125061035,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5765,
|
|
"mean_token_accuracy": 0.8253119587898254,
|
|
"num_tokens": 89394941.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.2853509664292981,
|
|
"grad_norm": 1.2435412406921387,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5745,
|
|
"mean_token_accuracy": 0.8244850039482117,
|
|
"num_tokens": 89560585.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 0.28585961342828076,
|
|
"grad_norm": 1.0883127450942993,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5723,
|
|
"mean_token_accuracy": 0.8268938660621643,
|
|
"num_tokens": 89726320.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 0.2863682604272635,
|
|
"grad_norm": 1.1344267129898071,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5375,
|
|
"mean_token_accuracy": 0.8374463319778442,
|
|
"num_tokens": 89880236.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 0.28687690742624616,
|
|
"grad_norm": 1.095139741897583,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5982,
|
|
"mean_token_accuracy": 0.8192014694213867,
|
|
"num_tokens": 90046963.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 0.2873855544252289,
|
|
"grad_norm": 1.0343623161315918,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8351205587387085,
|
|
"num_tokens": 90208654.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.28789420142421157,
|
|
"grad_norm": 1.2066987752914429,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5724,
|
|
"mean_token_accuracy": 0.8254907131195068,
|
|
"num_tokens": 90360145.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 0.2884028484231943,
|
|
"grad_norm": 1.2492485046386719,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5638,
|
|
"mean_token_accuracy": 0.8292480707168579,
|
|
"num_tokens": 90517341.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 0.28891149542217703,
|
|
"grad_norm": 1.1790997982025146,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5624,
|
|
"mean_token_accuracy": 0.8293505907058716,
|
|
"num_tokens": 90666248.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 0.2894201424211597,
|
|
"grad_norm": 1.1388493776321411,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8336046934127808,
|
|
"num_tokens": 90824390.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 0.28992878942014244,
|
|
"grad_norm": 1.0945671796798706,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5772,
|
|
"mean_token_accuracy": 0.8247801661491394,
|
|
"num_tokens": 90990168.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.2904374364191251,
|
|
"grad_norm": 1.185009241104126,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8298337459564209,
|
|
"num_tokens": 91130945.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 0.29094608341810785,
|
|
"grad_norm": 1.0907163619995117,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5513,
|
|
"mean_token_accuracy": 0.8324260711669922,
|
|
"num_tokens": 91287120.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 0.2914547304170905,
|
|
"grad_norm": 1.1829215288162231,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5664,
|
|
"mean_token_accuracy": 0.8273265957832336,
|
|
"num_tokens": 91445888.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 0.29196337741607326,
|
|
"grad_norm": 1.0761456489562988,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5296,
|
|
"mean_token_accuracy": 0.837720513343811,
|
|
"num_tokens": 91605888.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 0.29247202441505593,
|
|
"grad_norm": 1.1502619981765747,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6016,
|
|
"mean_token_accuracy": 0.8188234567642212,
|
|
"num_tokens": 91766382.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.29298067141403866,
|
|
"grad_norm": 1.134983777999878,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8294249773025513,
|
|
"num_tokens": 91923111.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 0.29348931841302134,
|
|
"grad_norm": 1.0459659099578857,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5822,
|
|
"mean_token_accuracy": 0.8240058422088623,
|
|
"num_tokens": 92076217.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 0.29399796541200407,
|
|
"grad_norm": 1.205342411994934,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5151,
|
|
"mean_token_accuracy": 0.8412837982177734,
|
|
"num_tokens": 92233100.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 0.2945066124109868,
|
|
"grad_norm": 1.2601711750030518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6028,
|
|
"mean_token_accuracy": 0.8175768852233887,
|
|
"num_tokens": 92384392.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 0.2950152594099695,
|
|
"grad_norm": 1.1117498874664307,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8412041664123535,
|
|
"num_tokens": 92538563.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.2955239064089522,
|
|
"grad_norm": 1.2051867246627808,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8337169885635376,
|
|
"num_tokens": 92702514.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 0.2960325534079349,
|
|
"grad_norm": 1.1142427921295166,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.509,
|
|
"mean_token_accuracy": 0.8424208760261536,
|
|
"num_tokens": 92861897.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 0.2965412004069176,
|
|
"grad_norm": 1.2217726707458496,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8390108346939087,
|
|
"num_tokens": 93021485.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 0.2970498474059003,
|
|
"grad_norm": 1.1346980333328247,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4927,
|
|
"mean_token_accuracy": 0.8458535075187683,
|
|
"num_tokens": 93169971.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 0.297558494404883,
|
|
"grad_norm": 1.2165158987045288,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.549,
|
|
"mean_token_accuracy": 0.8321750164031982,
|
|
"num_tokens": 93321364.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.2980671414038657,
|
|
"grad_norm": 1.2306874990463257,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5744,
|
|
"mean_token_accuracy": 0.825247049331665,
|
|
"num_tokens": 93476129.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 0.29857578840284843,
|
|
"grad_norm": 1.1397591829299927,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5624,
|
|
"mean_token_accuracy": 0.8268832564353943,
|
|
"num_tokens": 93637944.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 0.2990844354018311,
|
|
"grad_norm": 1.1084394454956055,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8345076441764832,
|
|
"num_tokens": 93795891.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 0.29959308240081384,
|
|
"grad_norm": 1.1393333673477173,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5966,
|
|
"mean_token_accuracy": 0.8186751008033752,
|
|
"num_tokens": 93949753.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 0.30010172939979657,
|
|
"grad_norm": 1.122271180152893,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5179,
|
|
"mean_token_accuracy": 0.8422044515609741,
|
|
"num_tokens": 94099353.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.30061037639877924,
|
|
"grad_norm": 1.3281346559524536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6231,
|
|
"mean_token_accuracy": 0.8126146793365479,
|
|
"num_tokens": 94262648.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 0.301119023397762,
|
|
"grad_norm": 1.1627881526947021,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5631,
|
|
"mean_token_accuracy": 0.8286106586456299,
|
|
"num_tokens": 94420660.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 0.30162767039674465,
|
|
"grad_norm": 1.3525162935256958,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5532,
|
|
"mean_token_accuracy": 0.8333991169929504,
|
|
"num_tokens": 94580578.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 0.3021363173957274,
|
|
"grad_norm": 1.2686326503753662,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5606,
|
|
"mean_token_accuracy": 0.8290703296661377,
|
|
"num_tokens": 94745810.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 0.30264496439471006,
|
|
"grad_norm": 1.2042104005813599,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5721,
|
|
"mean_token_accuracy": 0.8256835341453552,
|
|
"num_tokens": 94904762.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.3031536113936928,
|
|
"grad_norm": 1.2832545042037964,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5525,
|
|
"mean_token_accuracy": 0.8298637866973877,
|
|
"num_tokens": 95053065.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 0.30366225839267547,
|
|
"grad_norm": 1.1183339357376099,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5705,
|
|
"mean_token_accuracy": 0.8255320191383362,
|
|
"num_tokens": 95207324.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 0.3041709053916582,
|
|
"grad_norm": 1.2548408508300781,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5414,
|
|
"mean_token_accuracy": 0.8333613872528076,
|
|
"num_tokens": 95365564.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 0.3046795523906409,
|
|
"grad_norm": 1.0983479022979736,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8273598551750183,
|
|
"num_tokens": 95522523.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 0.3051881993896236,
|
|
"grad_norm": 1.1975111961364746,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5299,
|
|
"mean_token_accuracy": 0.8379498720169067,
|
|
"num_tokens": 95677617.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.3056968463886063,
|
|
"grad_norm": 1.0161926746368408,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.8283061981201172,
|
|
"num_tokens": 95855477.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 0.306205493387589,
|
|
"grad_norm": 1.1558902263641357,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5963,
|
|
"mean_token_accuracy": 0.8191391229629517,
|
|
"num_tokens": 96017692.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 0.30671414038657174,
|
|
"grad_norm": 1.0920082330703735,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5534,
|
|
"mean_token_accuracy": 0.8309683799743652,
|
|
"num_tokens": 96175479.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 0.3072227873855544,
|
|
"grad_norm": 1.1383585929870605,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5682,
|
|
"mean_token_accuracy": 0.8261626958847046,
|
|
"num_tokens": 96337729.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 0.30773143438453715,
|
|
"grad_norm": 1.082250952720642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8316413760185242,
|
|
"num_tokens": 96492659.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.3082400813835198,
|
|
"grad_norm": 1.1070678234100342,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5639,
|
|
"mean_token_accuracy": 0.8267689943313599,
|
|
"num_tokens": 96644965.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 0.30874872838250256,
|
|
"grad_norm": 1.134475588798523,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5262,
|
|
"mean_token_accuracy": 0.8369567394256592,
|
|
"num_tokens": 96801670.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 0.30925737538148523,
|
|
"grad_norm": 1.5056709051132202,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5581,
|
|
"mean_token_accuracy": 0.8286861181259155,
|
|
"num_tokens": 96966380.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 0.30976602238046796,
|
|
"grad_norm": 1.1796578168869019,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.559,
|
|
"mean_token_accuracy": 0.8290995955467224,
|
|
"num_tokens": 97116653.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 0.31027466937945064,
|
|
"grad_norm": 1.1612471342086792,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5519,
|
|
"mean_token_accuracy": 0.8312628269195557,
|
|
"num_tokens": 97279451.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.31078331637843337,
|
|
"grad_norm": 1.0776804685592651,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5899,
|
|
"mean_token_accuracy": 0.8217321634292603,
|
|
"num_tokens": 97439569.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 0.31129196337741605,
|
|
"grad_norm": 1.046061396598816,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8333207368850708,
|
|
"num_tokens": 97595648.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 0.3118006103763988,
|
|
"grad_norm": 1.1500645875930786,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5834,
|
|
"mean_token_accuracy": 0.8228082656860352,
|
|
"num_tokens": 97754844.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 0.3123092573753815,
|
|
"grad_norm": 1.0216745138168335,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8335670232772827,
|
|
"num_tokens": 97916479.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 0.3128179043743642,
|
|
"grad_norm": 1.2384233474731445,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5799,
|
|
"mean_token_accuracy": 0.8242952823638916,
|
|
"num_tokens": 98083229.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.3133265513733469,
|
|
"grad_norm": 1.1097160577774048,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5567,
|
|
"mean_token_accuracy": 0.8286508321762085,
|
|
"num_tokens": 98234766.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 0.3138351983723296,
|
|
"grad_norm": 1.1675152778625488,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8325939774513245,
|
|
"num_tokens": 98401537.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 0.3143438453713123,
|
|
"grad_norm": 1.1450773477554321,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5373,
|
|
"mean_token_accuracy": 0.835297703742981,
|
|
"num_tokens": 98567715.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 0.314852492370295,
|
|
"grad_norm": 1.0319682359695435,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8285013437271118,
|
|
"num_tokens": 98735978.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 0.31536113936927773,
|
|
"grad_norm": 1.2614619731903076,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5846,
|
|
"mean_token_accuracy": 0.8239743113517761,
|
|
"num_tokens": 98906165.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.3158697863682604,
|
|
"grad_norm": 1.1205494403839111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5651,
|
|
"mean_token_accuracy": 0.8274800777435303,
|
|
"num_tokens": 99056653.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 0.31637843336724314,
|
|
"grad_norm": 1.4943937063217163,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5201,
|
|
"mean_token_accuracy": 0.8389872312545776,
|
|
"num_tokens": 99211579.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 0.3168870803662258,
|
|
"grad_norm": 1.1277450323104858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.569,
|
|
"mean_token_accuracy": 0.8268486857414246,
|
|
"num_tokens": 99361670.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 0.31739572736520855,
|
|
"grad_norm": 1.0670257806777954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.585,
|
|
"mean_token_accuracy": 0.8215529322624207,
|
|
"num_tokens": 99530349.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 0.3179043743641913,
|
|
"grad_norm": 1.1288607120513916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8349924683570862,
|
|
"num_tokens": 99693143.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.31841302136317395,
|
|
"grad_norm": 1.061599612236023,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6094,
|
|
"mean_token_accuracy": 0.8155608177185059,
|
|
"num_tokens": 99867681.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 0.3189216683621567,
|
|
"grad_norm": 1.1407296657562256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8296022415161133,
|
|
"num_tokens": 100026848.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 0.31943031536113936,
|
|
"grad_norm": 1.1488444805145264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8349856734275818,
|
|
"num_tokens": 100198041.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 0.3199389623601221,
|
|
"grad_norm": 1.0721195936203003,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5241,
|
|
"mean_token_accuracy": 0.8384602069854736,
|
|
"num_tokens": 100352919.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 0.32044760935910477,
|
|
"grad_norm": 1.0653283596038818,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5277,
|
|
"mean_token_accuracy": 0.838874101638794,
|
|
"num_tokens": 100509537.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.3209562563580875,
|
|
"grad_norm": 1.0388782024383545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5296,
|
|
"mean_token_accuracy": 0.8368690013885498,
|
|
"num_tokens": 100663059.0,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 0.3214649033570702,
|
|
"grad_norm": 1.297975778579712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5831,
|
|
"mean_token_accuracy": 0.8221128582954407,
|
|
"num_tokens": 100831452.0,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 0.3219735503560529,
|
|
"grad_norm": 1.128312349319458,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.832910418510437,
|
|
"num_tokens": 100984332.0,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 0.3224821973550356,
|
|
"grad_norm": 1.1248503923416138,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5693,
|
|
"mean_token_accuracy": 0.825761079788208,
|
|
"num_tokens": 101131963.0,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 0.3229908443540183,
|
|
"grad_norm": 1.0807161331176758,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5484,
|
|
"mean_token_accuracy": 0.8321346640586853,
|
|
"num_tokens": 101289236.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.323499491353001,
|
|
"grad_norm": 1.1452345848083496,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8327028751373291,
|
|
"num_tokens": 101448301.0,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 0.3240081383519837,
|
|
"grad_norm": 1.174797534942627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5562,
|
|
"mean_token_accuracy": 0.8287744522094727,
|
|
"num_tokens": 101601568.0,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 0.32451678535096645,
|
|
"grad_norm": 1.191448450088501,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5549,
|
|
"mean_token_accuracy": 0.8289895057678223,
|
|
"num_tokens": 101750506.0,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 0.32502543234994913,
|
|
"grad_norm": 1.159472107887268,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8275212645530701,
|
|
"num_tokens": 101915028.0,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 0.32553407934893186,
|
|
"grad_norm": 1.1111050844192505,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5638,
|
|
"mean_token_accuracy": 0.8270004391670227,
|
|
"num_tokens": 102079659.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.32604272634791454,
|
|
"grad_norm": 1.036197304725647,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.532,
|
|
"mean_token_accuracy": 0.8362898230552673,
|
|
"num_tokens": 102247764.0,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 0.32655137334689727,
|
|
"grad_norm": 1.1457844972610474,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5326,
|
|
"mean_token_accuracy": 0.8351918458938599,
|
|
"num_tokens": 102405734.0,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 0.32706002034587994,
|
|
"grad_norm": 1.09752357006073,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.8294593095779419,
|
|
"num_tokens": 102566525.0,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 0.3275686673448627,
|
|
"grad_norm": 1.045760989189148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5984,
|
|
"mean_token_accuracy": 0.8189845085144043,
|
|
"num_tokens": 102739870.0,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 0.32807731434384535,
|
|
"grad_norm": 1.205141305923462,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5725,
|
|
"mean_token_accuracy": 0.8254177570343018,
|
|
"num_tokens": 102895661.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.3285859613428281,
|
|
"grad_norm": 1.1817325353622437,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8346107006072998,
|
|
"num_tokens": 103055954.0,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 0.32909460834181076,
|
|
"grad_norm": 1.075392246246338,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8339201211929321,
|
|
"num_tokens": 103214101.0,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 0.3296032553407935,
|
|
"grad_norm": 1.1824406385421753,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5771,
|
|
"mean_token_accuracy": 0.8253869414329529,
|
|
"num_tokens": 103373341.0,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 0.3301119023397762,
|
|
"grad_norm": 1.0850316286087036,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5402,
|
|
"mean_token_accuracy": 0.833788275718689,
|
|
"num_tokens": 103546489.0,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 0.3306205493387589,
|
|
"grad_norm": 1.1251380443572998,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5415,
|
|
"mean_token_accuracy": 0.8325050473213196,
|
|
"num_tokens": 103709672.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.3311291963377416,
|
|
"grad_norm": 1.1506757736206055,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8368813991546631,
|
|
"num_tokens": 103875427.0,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 0.3316378433367243,
|
|
"grad_norm": 1.1591253280639648,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5508,
|
|
"mean_token_accuracy": 0.83094322681427,
|
|
"num_tokens": 104027661.0,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 0.33214649033570703,
|
|
"grad_norm": 1.2312512397766113,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5871,
|
|
"mean_token_accuracy": 0.8218927979469299,
|
|
"num_tokens": 104180735.0,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 0.3326551373346897,
|
|
"grad_norm": 1.1636476516723633,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.83375084400177,
|
|
"num_tokens": 104353014.0,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 0.33316378433367244,
|
|
"grad_norm": 1.1845154762268066,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8296653032302856,
|
|
"num_tokens": 104517143.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.3336724313326551,
|
|
"grad_norm": 1.076217532157898,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.8188375234603882,
|
|
"num_tokens": 104669203.0,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 0.33418107833163785,
|
|
"grad_norm": 1.3136963844299316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5769,
|
|
"mean_token_accuracy": 0.8237686157226562,
|
|
"num_tokens": 104810466.0,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 0.3346897253306205,
|
|
"grad_norm": 1.0738552808761597,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5778,
|
|
"mean_token_accuracy": 0.8232786059379578,
|
|
"num_tokens": 104980367.0,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 0.33519837232960326,
|
|
"grad_norm": 1.3627864122390747,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5358,
|
|
"mean_token_accuracy": 0.8357287645339966,
|
|
"num_tokens": 105131160.0,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 0.335707019328586,
|
|
"grad_norm": 1.1652209758758545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5907,
|
|
"mean_token_accuracy": 0.8218494653701782,
|
|
"num_tokens": 105298008.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.33621566632756866,
|
|
"grad_norm": 1.2509148120880127,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5953,
|
|
"mean_token_accuracy": 0.8190317153930664,
|
|
"num_tokens": 105452271.0,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 0.3367243133265514,
|
|
"grad_norm": 1.1237406730651855,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5307,
|
|
"mean_token_accuracy": 0.8366305232048035,
|
|
"num_tokens": 105619978.0,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 0.33723296032553407,
|
|
"grad_norm": 1.064732313156128,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.8332208395004272,
|
|
"num_tokens": 105786355.0,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 0.3377416073245168,
|
|
"grad_norm": 1.2354017496109009,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5806,
|
|
"mean_token_accuracy": 0.8233789801597595,
|
|
"num_tokens": 105942503.0,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 0.3382502543234995,
|
|
"grad_norm": 1.099792242050171,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5569,
|
|
"mean_token_accuracy": 0.8300788998603821,
|
|
"num_tokens": 106088220.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.3387589013224822,
|
|
"grad_norm": 1.207027792930603,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5179,
|
|
"mean_token_accuracy": 0.8397770524024963,
|
|
"num_tokens": 106246812.0,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 0.3392675483214649,
|
|
"grad_norm": 1.0667011737823486,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5166,
|
|
"mean_token_accuracy": 0.8391870260238647,
|
|
"num_tokens": 106405912.0,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 0.3397761953204476,
|
|
"grad_norm": 1.1162816286087036,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6033,
|
|
"mean_token_accuracy": 0.816855788230896,
|
|
"num_tokens": 106556412.0,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 0.3402848423194303,
|
|
"grad_norm": 1.1591635942459106,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.8236411809921265,
|
|
"num_tokens": 106712640.0,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 0.340793489318413,
|
|
"grad_norm": 1.0984569787979126,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8283123970031738,
|
|
"num_tokens": 106869710.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.34130213631739575,
|
|
"grad_norm": 1.0950126647949219,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5342,
|
|
"mean_token_accuracy": 0.8369549512863159,
|
|
"num_tokens": 107034402.0,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 0.34181078331637843,
|
|
"grad_norm": 1.1208062171936035,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5633,
|
|
"mean_token_accuracy": 0.8275705575942993,
|
|
"num_tokens": 107197492.0,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 0.34231943031536116,
|
|
"grad_norm": 1.1074978113174438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8320577144622803,
|
|
"num_tokens": 107355096.0,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 0.34282807731434384,
|
|
"grad_norm": 1.1080939769744873,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.8340401649475098,
|
|
"num_tokens": 107520176.0,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 0.34333672431332657,
|
|
"grad_norm": 1.0805479288101196,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8356107473373413,
|
|
"num_tokens": 107676895.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.34384537131230924,
|
|
"grad_norm": 1.1147992610931396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.831654965877533,
|
|
"num_tokens": 107829848.0,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 0.344354018311292,
|
|
"grad_norm": 1.0968836545944214,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5843,
|
|
"mean_token_accuracy": 0.8204114437103271,
|
|
"num_tokens": 108003143.0,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 0.34486266531027465,
|
|
"grad_norm": 1.1353141069412231,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8269537687301636,
|
|
"num_tokens": 108169162.0,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 0.3453713123092574,
|
|
"grad_norm": 1.1155294179916382,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5328,
|
|
"mean_token_accuracy": 0.8368821144104004,
|
|
"num_tokens": 108326285.0,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 0.34587995930824006,
|
|
"grad_norm": 1.108866810798645,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.8361458778381348,
|
|
"num_tokens": 108481647.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.3463886063072228,
|
|
"grad_norm": 1.1015127897262573,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8353789448738098,
|
|
"num_tokens": 108632219.0,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 0.34689725330620547,
|
|
"grad_norm": 1.1202666759490967,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8291305303573608,
|
|
"num_tokens": 108795594.0,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 0.3474059003051882,
|
|
"grad_norm": 1.0737191438674927,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8332039713859558,
|
|
"num_tokens": 108947589.0,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 0.34791454730417093,
|
|
"grad_norm": 1.0999579429626465,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.8297255635261536,
|
|
"num_tokens": 109100848.0,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 0.3484231943031536,
|
|
"grad_norm": 1.2430332899093628,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5895,
|
|
"mean_token_accuracy": 0.8223748803138733,
|
|
"num_tokens": 109268616.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.34893184130213634,
|
|
"grad_norm": 0.9805262088775635,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5276,
|
|
"mean_token_accuracy": 0.8367460370063782,
|
|
"num_tokens": 109425497.0,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 0.349440488301119,
|
|
"grad_norm": 1.2571215629577637,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5481,
|
|
"mean_token_accuracy": 0.8310940265655518,
|
|
"num_tokens": 109596743.0,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 0.34994913530010174,
|
|
"grad_norm": 1.0542333126068115,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5245,
|
|
"mean_token_accuracy": 0.838855504989624,
|
|
"num_tokens": 109752861.0,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 0.3504577822990844,
|
|
"grad_norm": 1.0893378257751465,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5275,
|
|
"mean_token_accuracy": 0.8377349972724915,
|
|
"num_tokens": 109912156.0,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 0.35096642929806715,
|
|
"grad_norm": 1.232289433479309,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5344,
|
|
"mean_token_accuracy": 0.8353825807571411,
|
|
"num_tokens": 110075837.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.3514750762970498,
|
|
"grad_norm": 1.110674262046814,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5713,
|
|
"mean_token_accuracy": 0.8261686563491821,
|
|
"num_tokens": 110230103.0,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 0.35198372329603256,
|
|
"grad_norm": 1.09687340259552,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8328587412834167,
|
|
"num_tokens": 110388178.0,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 0.35249237029501523,
|
|
"grad_norm": 1.2102097272872925,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.828652560710907,
|
|
"num_tokens": 110556832.0,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 0.35300101729399797,
|
|
"grad_norm": 1.1342049837112427,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.565,
|
|
"mean_token_accuracy": 0.8272509574890137,
|
|
"num_tokens": 110712545.0,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 0.3535096642929807,
|
|
"grad_norm": 1.196113109588623,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5294,
|
|
"mean_token_accuracy": 0.8361930847167969,
|
|
"num_tokens": 110855683.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.3540183112919634,
|
|
"grad_norm": 1.3626251220703125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8290280103683472,
|
|
"num_tokens": 111014148.0,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 0.3545269582909461,
|
|
"grad_norm": 1.1502712965011597,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5533,
|
|
"mean_token_accuracy": 0.8318491578102112,
|
|
"num_tokens": 111168794.0,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 0.3550356052899288,
|
|
"grad_norm": 1.1832911968231201,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8285268545150757,
|
|
"num_tokens": 111318802.0,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 0.3555442522889115,
|
|
"grad_norm": 1.1197277307510376,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5411,
|
|
"mean_token_accuracy": 0.8349237442016602,
|
|
"num_tokens": 111482438.0,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 0.3560528992878942,
|
|
"grad_norm": 1.0746982097625732,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8285970091819763,
|
|
"num_tokens": 111631108.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.3565615462868769,
|
|
"grad_norm": 1.0557246208190918,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5652,
|
|
"mean_token_accuracy": 0.827860951423645,
|
|
"num_tokens": 111792659.0,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 0.3570701932858596,
|
|
"grad_norm": 1.2453454732894897,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5597,
|
|
"mean_token_accuracy": 0.8288803100585938,
|
|
"num_tokens": 111956492.0,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 0.3575788402848423,
|
|
"grad_norm": 1.0866833925247192,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8305703401565552,
|
|
"num_tokens": 112111925.0,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 0.358087487283825,
|
|
"grad_norm": 1.1169756650924683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8297122120857239,
|
|
"num_tokens": 112271716.0,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 0.35859613428280773,
|
|
"grad_norm": 1.1024497747421265,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5255,
|
|
"mean_token_accuracy": 0.8370829820632935,
|
|
"num_tokens": 112433508.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.35910478128179046,
|
|
"grad_norm": 1.0677741765975952,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8398569226264954,
|
|
"num_tokens": 112595775.0,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 0.35961342828077314,
|
|
"grad_norm": 1.2112849950790405,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5341,
|
|
"mean_token_accuracy": 0.8345004916191101,
|
|
"num_tokens": 112747723.0,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 0.36012207527975587,
|
|
"grad_norm": 1.1175358295440674,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5488,
|
|
"mean_token_accuracy": 0.8320543766021729,
|
|
"num_tokens": 112906425.0,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 0.36063072227873855,
|
|
"grad_norm": 1.7222286462783813,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5376,
|
|
"mean_token_accuracy": 0.8360615372657776,
|
|
"num_tokens": 113062355.0,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 0.3611393692777213,
|
|
"grad_norm": 1.2156181335449219,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8246958255767822,
|
|
"num_tokens": 113237249.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.36164801627670395,
|
|
"grad_norm": 1.1710059642791748,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5676,
|
|
"mean_token_accuracy": 0.8252333402633667,
|
|
"num_tokens": 113399443.0,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 0.3621566632756867,
|
|
"grad_norm": 1.1496976613998413,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5254,
|
|
"mean_token_accuracy": 0.8372830152511597,
|
|
"num_tokens": 113555518.0,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 0.36266531027466936,
|
|
"grad_norm": 1.176712989807129,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8295558094978333,
|
|
"num_tokens": 113698492.0,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 0.3631739572736521,
|
|
"grad_norm": 1.0876966714859009,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6004,
|
|
"mean_token_accuracy": 0.8183210492134094,
|
|
"num_tokens": 113869795.0,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 0.36368260427263477,
|
|
"grad_norm": 1.1487334966659546,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5297,
|
|
"mean_token_accuracy": 0.8361709117889404,
|
|
"num_tokens": 114035400.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.3641912512716175,
|
|
"grad_norm": 1.0755226612091064,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8265610933303833,
|
|
"num_tokens": 114180922.0,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 0.3646998982706002,
|
|
"grad_norm": 1.1360986232757568,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8341394066810608,
|
|
"num_tokens": 114351814.0,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 0.3652085452695829,
|
|
"grad_norm": 1.1471298933029175,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8365101218223572,
|
|
"num_tokens": 114503125.0,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 0.36571719226856564,
|
|
"grad_norm": 1.0503714084625244,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5671,
|
|
"mean_token_accuracy": 0.8277009725570679,
|
|
"num_tokens": 114658004.0,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 0.3662258392675483,
|
|
"grad_norm": 1.220484972000122,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5817,
|
|
"mean_token_accuracy": 0.8230093717575073,
|
|
"num_tokens": 114829230.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.36673448626653105,
|
|
"grad_norm": 1.265468716621399,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8305054903030396,
|
|
"num_tokens": 114987847.0,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 0.3672431332655137,
|
|
"grad_norm": 1.0840364694595337,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.538,
|
|
"mean_token_accuracy": 0.8362969756126404,
|
|
"num_tokens": 115154558.0,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 0.36775178026449645,
|
|
"grad_norm": 1.1251972913742065,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8324218988418579,
|
|
"num_tokens": 115315702.0,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 0.36826042726347913,
|
|
"grad_norm": 1.1791112422943115,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8332595825195312,
|
|
"num_tokens": 115471150.0,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 0.36876907426246186,
|
|
"grad_norm": 1.0629161596298218,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5626,
|
|
"mean_token_accuracy": 0.8278679251670837,
|
|
"num_tokens": 115630359.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.36927772126144454,
|
|
"grad_norm": 1.0933114290237427,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8298508524894714,
|
|
"num_tokens": 115802068.0,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 0.36978636826042727,
|
|
"grad_norm": 1.0984097719192505,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8353390693664551,
|
|
"num_tokens": 115954597.0,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 0.37029501525940994,
|
|
"grad_norm": 1.1613320112228394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5752,
|
|
"mean_token_accuracy": 0.824324369430542,
|
|
"num_tokens": 116121996.0,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 0.3708036622583927,
|
|
"grad_norm": 1.0691723823547363,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5554,
|
|
"mean_token_accuracy": 0.8294112086296082,
|
|
"num_tokens": 116286650.0,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 0.3713123092573754,
|
|
"grad_norm": 1.1356724500656128,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.8361466526985168,
|
|
"num_tokens": 116456648.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.3718209562563581,
|
|
"grad_norm": 1.0564593076705933,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5391,
|
|
"mean_token_accuracy": 0.8352751135826111,
|
|
"num_tokens": 116621931.0,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 0.3723296032553408,
|
|
"grad_norm": 1.1024948358535767,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5992,
|
|
"mean_token_accuracy": 0.8183151483535767,
|
|
"num_tokens": 116787232.0,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 0.3728382502543235,
|
|
"grad_norm": 1.2190114259719849,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6021,
|
|
"mean_token_accuracy": 0.8169412016868591,
|
|
"num_tokens": 116949019.0,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 0.3733468972533062,
|
|
"grad_norm": 1.0475753545761108,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5763,
|
|
"mean_token_accuracy": 0.8254858255386353,
|
|
"num_tokens": 117122455.0,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 0.3738555442522889,
|
|
"grad_norm": 1.2131965160369873,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5491,
|
|
"mean_token_accuracy": 0.8324887752532959,
|
|
"num_tokens": 117267738.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.3743641912512716,
|
|
"grad_norm": 1.0916552543640137,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5512,
|
|
"mean_token_accuracy": 0.8287943601608276,
|
|
"num_tokens": 117430802.0,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 0.3748728382502543,
|
|
"grad_norm": 1.0967727899551392,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5625,
|
|
"mean_token_accuracy": 0.826190710067749,
|
|
"num_tokens": 117587032.0,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 0.37538148524923703,
|
|
"grad_norm": 1.0747716426849365,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8252145051956177,
|
|
"num_tokens": 117747701.0,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 0.3758901322482197,
|
|
"grad_norm": 1.0945491790771484,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5474,
|
|
"mean_token_accuracy": 0.8329147100448608,
|
|
"num_tokens": 117912178.0,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 0.37639877924720244,
|
|
"grad_norm": 1.1760427951812744,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5195,
|
|
"mean_token_accuracy": 0.8384370803833008,
|
|
"num_tokens": 118069631.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.3769074262461852,
|
|
"grad_norm": 1.1599196195602417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5332,
|
|
"mean_token_accuracy": 0.8354936838150024,
|
|
"num_tokens": 118216192.0,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 0.37741607324516785,
|
|
"grad_norm": 1.2120177745819092,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5334,
|
|
"mean_token_accuracy": 0.8361068964004517,
|
|
"num_tokens": 118364107.0,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 0.3779247202441506,
|
|
"grad_norm": 1.112884521484375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5787,
|
|
"mean_token_accuracy": 0.8213391900062561,
|
|
"num_tokens": 118532657.0,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 0.37843336724313326,
|
|
"grad_norm": 1.1208746433258057,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5405,
|
|
"mean_token_accuracy": 0.8333334922790527,
|
|
"num_tokens": 118689942.0,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 0.378942014242116,
|
|
"grad_norm": 1.233646035194397,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5404,
|
|
"mean_token_accuracy": 0.8324503898620605,
|
|
"num_tokens": 118861717.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.37945066124109866,
|
|
"grad_norm": 1.1507185697555542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8297289609909058,
|
|
"num_tokens": 119027905.0,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 0.3799593082400814,
|
|
"grad_norm": 1.1578809022903442,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4967,
|
|
"mean_token_accuracy": 0.8448212742805481,
|
|
"num_tokens": 119173593.0,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 0.38046795523906407,
|
|
"grad_norm": 1.2863305807113647,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5918,
|
|
"mean_token_accuracy": 0.8200984001159668,
|
|
"num_tokens": 119333549.0,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 0.3809766022380468,
|
|
"grad_norm": 1.2527709007263184,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5377,
|
|
"mean_token_accuracy": 0.8346661925315857,
|
|
"num_tokens": 119485196.0,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 0.3814852492370295,
|
|
"grad_norm": 1.1452044248580933,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8352641463279724,
|
|
"num_tokens": 119635566.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.3819938962360122,
|
|
"grad_norm": 1.3274515867233276,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.8326550722122192,
|
|
"num_tokens": 119796534.0,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 0.38250254323499494,
|
|
"grad_norm": 1.1830132007598877,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5864,
|
|
"mean_token_accuracy": 0.8197687864303589,
|
|
"num_tokens": 119965071.0,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 0.3830111902339776,
|
|
"grad_norm": 1.2463164329528809,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5297,
|
|
"mean_token_accuracy": 0.8368417024612427,
|
|
"num_tokens": 120116111.0,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 0.38351983723296035,
|
|
"grad_norm": 1.2696422338485718,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8251786231994629,
|
|
"num_tokens": 120275408.0,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 0.384028484231943,
|
|
"grad_norm": 1.212146520614624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5463,
|
|
"mean_token_accuracy": 0.8328184485435486,
|
|
"num_tokens": 120432203.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.38453713123092575,
|
|
"grad_norm": 1.281558632850647,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8236889839172363,
|
|
"num_tokens": 120600556.0,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 0.38504577822990843,
|
|
"grad_norm": 1.1393814086914062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8314993977546692,
|
|
"num_tokens": 120758883.0,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 0.38555442522889116,
|
|
"grad_norm": 1.2054526805877686,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.534,
|
|
"mean_token_accuracy": 0.8376193046569824,
|
|
"num_tokens": 120922100.0,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 0.38606307222787384,
|
|
"grad_norm": 1.3072320222854614,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.8303576111793518,
|
|
"num_tokens": 121075897.0,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 0.38657171922685657,
|
|
"grad_norm": 1.3093321323394775,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5366,
|
|
"mean_token_accuracy": 0.8357855677604675,
|
|
"num_tokens": 121235661.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.38708036622583925,
|
|
"grad_norm": 1.2017589807510376,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5646,
|
|
"mean_token_accuracy": 0.8272026777267456,
|
|
"num_tokens": 121393640.0,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 0.387589013224822,
|
|
"grad_norm": 1.2778068780899048,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5603,
|
|
"mean_token_accuracy": 0.8284906148910522,
|
|
"num_tokens": 121539889.0,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 0.38809766022380465,
|
|
"grad_norm": 1.272511601448059,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.832564115524292,
|
|
"num_tokens": 121699739.0,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 0.3886063072227874,
|
|
"grad_norm": 1.189244031906128,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5028,
|
|
"mean_token_accuracy": 0.8430757522583008,
|
|
"num_tokens": 121844594.0,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 0.3891149542217701,
|
|
"grad_norm": 1.1704158782958984,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8299344778060913,
|
|
"num_tokens": 122018838.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.3896236012207528,
|
|
"grad_norm": 1.1334116458892822,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5787,
|
|
"mean_token_accuracy": 0.8251534104347229,
|
|
"num_tokens": 122181626.0,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 0.3901322482197355,
|
|
"grad_norm": 1.182862401008606,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8328073024749756,
|
|
"num_tokens": 122336319.0,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 0.3906408952187182,
|
|
"grad_norm": 1.194340705871582,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.8423342704772949,
|
|
"num_tokens": 122486556.0,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 0.39114954221770093,
|
|
"grad_norm": 1.1797587871551514,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.556,
|
|
"mean_token_accuracy": 0.8285393714904785,
|
|
"num_tokens": 122643871.0,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 0.3916581892166836,
|
|
"grad_norm": 1.1087149381637573,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8313184976577759,
|
|
"num_tokens": 122806517.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.39216683621566634,
|
|
"grad_norm": 1.30690598487854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5464,
|
|
"mean_token_accuracy": 0.8317302465438843,
|
|
"num_tokens": 122963330.0,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 0.392675483214649,
|
|
"grad_norm": 1.1016590595245361,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5561,
|
|
"mean_token_accuracy": 0.828952431678772,
|
|
"num_tokens": 123130884.0,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 0.39318413021363174,
|
|
"grad_norm": 1.0075249671936035,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5394,
|
|
"mean_token_accuracy": 0.8364933729171753,
|
|
"num_tokens": 123291806.0,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 0.3936927772126144,
|
|
"grad_norm": 1.066081166267395,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4964,
|
|
"mean_token_accuracy": 0.8464153409004211,
|
|
"num_tokens": 123442859.0,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 0.39420142421159715,
|
|
"grad_norm": 1.0588123798370361,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.54,
|
|
"mean_token_accuracy": 0.8324978947639465,
|
|
"num_tokens": 123592681.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.3947100712105799,
|
|
"grad_norm": 1.2031190395355225,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.8368514180183411,
|
|
"num_tokens": 123764752.0,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 0.39521871820956256,
|
|
"grad_norm": 1.0871250629425049,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5575,
|
|
"mean_token_accuracy": 0.8293015360832214,
|
|
"num_tokens": 123922538.0,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 0.3957273652085453,
|
|
"grad_norm": 1.3108958005905151,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5285,
|
|
"mean_token_accuracy": 0.8365644812583923,
|
|
"num_tokens": 124083040.0,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 0.39623601220752797,
|
|
"grad_norm": 1.2524911165237427,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5476,
|
|
"mean_token_accuracy": 0.8321986198425293,
|
|
"num_tokens": 124239220.0,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 0.3967446592065107,
|
|
"grad_norm": 1.0792248249053955,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.545,
|
|
"mean_token_accuracy": 0.8336870670318604,
|
|
"num_tokens": 124398859.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.3972533062054934,
|
|
"grad_norm": 1.1411080360412598,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5672,
|
|
"mean_token_accuracy": 0.8271961212158203,
|
|
"num_tokens": 124549761.0,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 0.3977619532044761,
|
|
"grad_norm": 1.1141024827957153,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5719,
|
|
"mean_token_accuracy": 0.8256720304489136,
|
|
"num_tokens": 124705300.0,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 0.3982706002034588,
|
|
"grad_norm": 1.1549406051635742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.830722451210022,
|
|
"num_tokens": 124863824.0,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 0.3987792472024415,
|
|
"grad_norm": 1.238250494003296,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5769,
|
|
"mean_token_accuracy": 0.8239268660545349,
|
|
"num_tokens": 125017794.0,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 0.3992878942014242,
|
|
"grad_norm": 1.1919786930084229,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5661,
|
|
"mean_token_accuracy": 0.829487144947052,
|
|
"num_tokens": 125178609.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.3997965412004069,
|
|
"grad_norm": 1.2083204984664917,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5622,
|
|
"mean_token_accuracy": 0.8262380957603455,
|
|
"num_tokens": 125331610.0,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 0.40030518819938965,
|
|
"grad_norm": 1.1720713376998901,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.519,
|
|
"mean_token_accuracy": 0.8383011817932129,
|
|
"num_tokens": 125481250.0,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 0.4008138351983723,
|
|
"grad_norm": 1.2033262252807617,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5629,
|
|
"mean_token_accuracy": 0.8281571865081787,
|
|
"num_tokens": 125650464.0,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 0.40132248219735506,
|
|
"grad_norm": 1.2404285669326782,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8318977355957031,
|
|
"num_tokens": 125817057.0,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 0.40183112919633773,
|
|
"grad_norm": 1.0183967351913452,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8368712663650513,
|
|
"num_tokens": 125964354.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.40233977619532046,
|
|
"grad_norm": 1.081229567527771,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8378585577011108,
|
|
"num_tokens": 126119274.0,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 0.40284842319430314,
|
|
"grad_norm": 1.2605633735656738,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5467,
|
|
"mean_token_accuracy": 0.8325355052947998,
|
|
"num_tokens": 126266332.0,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 0.40335707019328587,
|
|
"grad_norm": 1.0738158226013184,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5535,
|
|
"mean_token_accuracy": 0.8316068053245544,
|
|
"num_tokens": 126416815.0,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 0.40386571719226855,
|
|
"grad_norm": 1.109156847000122,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5332,
|
|
"mean_token_accuracy": 0.8381474018096924,
|
|
"num_tokens": 126582617.0,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 0.4043743641912513,
|
|
"grad_norm": 1.0866683721542358,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5448,
|
|
"mean_token_accuracy": 0.8342831134796143,
|
|
"num_tokens": 126734456.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.40488301119023395,
|
|
"grad_norm": 1.0128190517425537,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.835590124130249,
|
|
"num_tokens": 126894951.0,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 0.4053916581892167,
|
|
"grad_norm": 1.161100149154663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8282392024993896,
|
|
"num_tokens": 127040903.0,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 0.4059003051881994,
|
|
"grad_norm": 1.1611078977584839,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5763,
|
|
"mean_token_accuracy": 0.8230469226837158,
|
|
"num_tokens": 127193010.0,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 0.4064089521871821,
|
|
"grad_norm": 1.0060153007507324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5567,
|
|
"mean_token_accuracy": 0.8288756608963013,
|
|
"num_tokens": 127356124.0,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 0.4069175991861648,
|
|
"grad_norm": 1.014762043952942,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5359,
|
|
"mean_token_accuracy": 0.8361398577690125,
|
|
"num_tokens": 127530354.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.4074262461851475,
|
|
"grad_norm": 1.1564446687698364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8339039087295532,
|
|
"num_tokens": 127674617.0,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 0.40793489318413023,
|
|
"grad_norm": 1.0868537425994873,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5377,
|
|
"mean_token_accuracy": 0.8345374464988708,
|
|
"num_tokens": 127833308.0,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 0.4084435401831129,
|
|
"grad_norm": 1.187218427658081,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5677,
|
|
"mean_token_accuracy": 0.826422393321991,
|
|
"num_tokens": 127994496.0,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 0.40895218718209564,
|
|
"grad_norm": 1.1443551778793335,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5443,
|
|
"mean_token_accuracy": 0.833172082901001,
|
|
"num_tokens": 128161302.0,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 0.4094608341810783,
|
|
"grad_norm": 1.140722632408142,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.555,
|
|
"mean_token_accuracy": 0.8290700316429138,
|
|
"num_tokens": 128318321.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.40996948118006105,
|
|
"grad_norm": 1.1972770690917969,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8306229114532471,
|
|
"num_tokens": 128485010.0,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 0.4104781281790437,
|
|
"grad_norm": 1.1250247955322266,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.8245084285736084,
|
|
"num_tokens": 128651307.0,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 0.41098677517802645,
|
|
"grad_norm": 1.1499453783035278,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5326,
|
|
"mean_token_accuracy": 0.8350856304168701,
|
|
"num_tokens": 128794807.0,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 0.41149542217700913,
|
|
"grad_norm": 1.1552046537399292,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5393,
|
|
"mean_token_accuracy": 0.8341234922409058,
|
|
"num_tokens": 128954027.0,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 0.41200406917599186,
|
|
"grad_norm": 1.1102688312530518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5383,
|
|
"mean_token_accuracy": 0.83314049243927,
|
|
"num_tokens": 129117445.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.4125127161749746,
|
|
"grad_norm": 1.0696158409118652,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5389,
|
|
"mean_token_accuracy": 0.8332682847976685,
|
|
"num_tokens": 129285951.0,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 0.41302136317395727,
|
|
"grad_norm": 1.0423203706741333,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5748,
|
|
"mean_token_accuracy": 0.8261610269546509,
|
|
"num_tokens": 129465561.0,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 0.41353001017294,
|
|
"grad_norm": 1.1869776248931885,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4957,
|
|
"mean_token_accuracy": 0.8446314334869385,
|
|
"num_tokens": 129614099.0,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 0.4140386571719227,
|
|
"grad_norm": 1.0922024250030518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.8350465297698975,
|
|
"num_tokens": 129789431.0,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 0.4145473041709054,
|
|
"grad_norm": 0.968750536441803,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4969,
|
|
"mean_token_accuracy": 0.8457138538360596,
|
|
"num_tokens": 129947839.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.4150559511698881,
|
|
"grad_norm": 1.3184736967086792,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5797,
|
|
"mean_token_accuracy": 0.8236253261566162,
|
|
"num_tokens": 130099262.0,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 0.4155645981688708,
|
|
"grad_norm": 1.151092529296875,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5413,
|
|
"mean_token_accuracy": 0.8329277038574219,
|
|
"num_tokens": 130262007.0,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 0.4160732451678535,
|
|
"grad_norm": 1.134224534034729,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5709,
|
|
"mean_token_accuracy": 0.82569420337677,
|
|
"num_tokens": 130412634.0,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 0.4165818921668362,
|
|
"grad_norm": 1.132689356803894,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.554,
|
|
"mean_token_accuracy": 0.8313882350921631,
|
|
"num_tokens": 130576459.0,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 0.4170905391658189,
|
|
"grad_norm": 1.1339707374572754,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5519,
|
|
"mean_token_accuracy": 0.8296515345573425,
|
|
"num_tokens": 130738545.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.41759918616480163,
|
|
"grad_norm": 0.9829337000846863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.8278769254684448,
|
|
"num_tokens": 130899383.0,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 0.41810783316378436,
|
|
"grad_norm": 1.1367756128311157,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5361,
|
|
"mean_token_accuracy": 0.8330759406089783,
|
|
"num_tokens": 131069889.0,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 0.41861648016276704,
|
|
"grad_norm": 1.0659213066101074,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8324937224388123,
|
|
"num_tokens": 131230218.0,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 0.41912512716174977,
|
|
"grad_norm": 1.2729610204696655,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5525,
|
|
"mean_token_accuracy": 0.8298782706260681,
|
|
"num_tokens": 131374265.0,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 0.41963377416073244,
|
|
"grad_norm": 1.1545099020004272,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5354,
|
|
"mean_token_accuracy": 0.8355467319488525,
|
|
"num_tokens": 131543646.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.4201424211597152,
|
|
"grad_norm": 1.100499153137207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5781,
|
|
"mean_token_accuracy": 0.8228617310523987,
|
|
"num_tokens": 131702768.0,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 0.42065106815869785,
|
|
"grad_norm": 1.2159390449523926,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.8297737836837769,
|
|
"num_tokens": 131860643.0,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 0.4211597151576806,
|
|
"grad_norm": 1.0870367288589478,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8382449746131897,
|
|
"num_tokens": 132005394.0,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 0.42166836215666326,
|
|
"grad_norm": 1.1786516904830933,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5291,
|
|
"mean_token_accuracy": 0.835383415222168,
|
|
"num_tokens": 132162321.0,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 0.422177009155646,
|
|
"grad_norm": 1.0985463857650757,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5205,
|
|
"mean_token_accuracy": 0.8396868109703064,
|
|
"num_tokens": 132318929.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.42268565615462866,
|
|
"grad_norm": 1.0966908931732178,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.836506724357605,
|
|
"num_tokens": 132479785.0,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 0.4231943031536114,
|
|
"grad_norm": 1.1082007884979248,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5069,
|
|
"mean_token_accuracy": 0.8426880836486816,
|
|
"num_tokens": 132639135.0,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 0.4237029501525941,
|
|
"grad_norm": 1.3057457208633423,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5942,
|
|
"mean_token_accuracy": 0.817604660987854,
|
|
"num_tokens": 132798835.0,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 0.4242115971515768,
|
|
"grad_norm": 1.0456715822219849,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5276,
|
|
"mean_token_accuracy": 0.8367403745651245,
|
|
"num_tokens": 132955884.0,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 0.42472024415055953,
|
|
"grad_norm": 1.2031054496765137,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.525,
|
|
"mean_token_accuracy": 0.837424635887146,
|
|
"num_tokens": 133118521.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.4252288911495422,
|
|
"grad_norm": 1.1078040599822998,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5021,
|
|
"mean_token_accuracy": 0.8442510366439819,
|
|
"num_tokens": 133281242.0,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 0.42573753814852494,
|
|
"grad_norm": 1.1191807985305786,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8353614211082458,
|
|
"num_tokens": 133450654.0,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 0.4262461851475076,
|
|
"grad_norm": 1.1722224950790405,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.828338623046875,
|
|
"num_tokens": 133620766.0,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 0.42675483214649035,
|
|
"grad_norm": 1.0733799934387207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8271428346633911,
|
|
"num_tokens": 133790293.0,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 0.427263479145473,
|
|
"grad_norm": 1.1412605047225952,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8385905027389526,
|
|
"num_tokens": 133962681.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.42777212614445576,
|
|
"grad_norm": 1.1903674602508545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8283593654632568,
|
|
"num_tokens": 134128875.0,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 0.42828077314343843,
|
|
"grad_norm": 1.1169062852859497,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5915,
|
|
"mean_token_accuracy": 0.8202317953109741,
|
|
"num_tokens": 134299862.0,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 0.42878942014242116,
|
|
"grad_norm": 1.2607040405273438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.838632345199585,
|
|
"num_tokens": 134447887.0,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 0.42929806714140384,
|
|
"grad_norm": 1.2096004486083984,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5424,
|
|
"mean_token_accuracy": 0.8333189487457275,
|
|
"num_tokens": 134603411.0,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 0.42980671414038657,
|
|
"grad_norm": 1.0911368131637573,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.519,
|
|
"mean_token_accuracy": 0.8395025730133057,
|
|
"num_tokens": 134753687.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.4303153611393693,
|
|
"grad_norm": 1.1959718465805054,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5493,
|
|
"mean_token_accuracy": 0.8322297930717468,
|
|
"num_tokens": 134910477.0,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 0.430824008138352,
|
|
"grad_norm": 1.0427497625350952,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8288239240646362,
|
|
"num_tokens": 135065619.0,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 0.4313326551373347,
|
|
"grad_norm": 1.1838765144348145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5826,
|
|
"mean_token_accuracy": 0.8237420320510864,
|
|
"num_tokens": 135215377.0,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 0.4318413021363174,
|
|
"grad_norm": 1.099975824356079,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5574,
|
|
"mean_token_accuracy": 0.8286048769950867,
|
|
"num_tokens": 135371149.0,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 0.4323499491353001,
|
|
"grad_norm": 0.9886288046836853,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5407,
|
|
"mean_token_accuracy": 0.8347184062004089,
|
|
"num_tokens": 135530087.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.4328585961342828,
|
|
"grad_norm": 1.0195449590682983,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8346019983291626,
|
|
"num_tokens": 135689766.0,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 0.4333672431332655,
|
|
"grad_norm": 1.000313639640808,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5138,
|
|
"mean_token_accuracy": 0.8414555191993713,
|
|
"num_tokens": 135852811.0,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 0.4338758901322482,
|
|
"grad_norm": 1.0192416906356812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5371,
|
|
"mean_token_accuracy": 0.8356699347496033,
|
|
"num_tokens": 136011756.0,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 0.43438453713123093,
|
|
"grad_norm": 1.1201744079589844,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5373,
|
|
"mean_token_accuracy": 0.8338669538497925,
|
|
"num_tokens": 136177024.0,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 0.4348931841302136,
|
|
"grad_norm": 1.1117981672286987,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.51,
|
|
"mean_token_accuracy": 0.8413081169128418,
|
|
"num_tokens": 136325905.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.43540183112919634,
|
|
"grad_norm": 1.0561431646347046,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.564,
|
|
"mean_token_accuracy": 0.8267576098442078,
|
|
"num_tokens": 136482739.0,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 0.43591047812817907,
|
|
"grad_norm": 1.1021794080734253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5187,
|
|
"mean_token_accuracy": 0.840574324131012,
|
|
"num_tokens": 136652674.0,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 0.43641912512716174,
|
|
"grad_norm": 1.1251834630966187,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.526,
|
|
"mean_token_accuracy": 0.8371627330780029,
|
|
"num_tokens": 136822688.0,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 0.4369277721261445,
|
|
"grad_norm": 1.0258177518844604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.833103597164154,
|
|
"num_tokens": 136980565.0,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 0.43743641912512715,
|
|
"grad_norm": 1.0182185173034668,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.501,
|
|
"mean_token_accuracy": 0.8441320657730103,
|
|
"num_tokens": 137137295.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.4379450661241099,
|
|
"grad_norm": 1.1967811584472656,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5279,
|
|
"mean_token_accuracy": 0.8357078433036804,
|
|
"num_tokens": 137301870.0,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 0.43845371312309256,
|
|
"grad_norm": 1.0632413625717163,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8325663805007935,
|
|
"num_tokens": 137449013.0,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 0.4389623601220753,
|
|
"grad_norm": 1.1115227937698364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8315908908843994,
|
|
"num_tokens": 137604020.0,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 0.43947100712105797,
|
|
"grad_norm": 1.0673210620880127,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5252,
|
|
"mean_token_accuracy": 0.8367598652839661,
|
|
"num_tokens": 137767680.0,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 0.4399796541200407,
|
|
"grad_norm": 1.0349266529083252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8359603881835938,
|
|
"num_tokens": 137912184.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.4404883011190234,
|
|
"grad_norm": 1.1605753898620605,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5534,
|
|
"mean_token_accuracy": 0.8329131007194519,
|
|
"num_tokens": 138066689.0,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 0.4409969481180061,
|
|
"grad_norm": 1.0888206958770752,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5543,
|
|
"mean_token_accuracy": 0.8302878141403198,
|
|
"num_tokens": 138235428.0,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 0.44150559511698884,
|
|
"grad_norm": 1.126081943511963,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6004,
|
|
"mean_token_accuracy": 0.8197178244590759,
|
|
"num_tokens": 138390463.0,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 0.4420142421159715,
|
|
"grad_norm": 1.0957306623458862,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5584,
|
|
"mean_token_accuracy": 0.8287143111228943,
|
|
"num_tokens": 138552942.0,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 0.44252288911495424,
|
|
"grad_norm": 1.1695512533187866,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8339910507202148,
|
|
"num_tokens": 138712726.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.4430315361139369,
|
|
"grad_norm": 1.050856351852417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5312,
|
|
"mean_token_accuracy": 0.83653324842453,
|
|
"num_tokens": 138880829.0,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 0.44354018311291965,
|
|
"grad_norm": 1.1957542896270752,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8297691345214844,
|
|
"num_tokens": 139056494.0,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 0.4440488301119023,
|
|
"grad_norm": 1.187698245048523,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6039,
|
|
"mean_token_accuracy": 0.8158071041107178,
|
|
"num_tokens": 139216913.0,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 0.44455747711088506,
|
|
"grad_norm": 1.103418231010437,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4986,
|
|
"mean_token_accuracy": 0.8445761799812317,
|
|
"num_tokens": 139362971.0,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 0.44506612410986773,
|
|
"grad_norm": 1.1682536602020264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.8246084451675415,
|
|
"num_tokens": 139513998.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.44557477110885046,
|
|
"grad_norm": 1.117625117301941,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.8425447940826416,
|
|
"num_tokens": 139661330.0,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 0.44608341810783314,
|
|
"grad_norm": 1.0645250082015991,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5313,
|
|
"mean_token_accuracy": 0.8361184597015381,
|
|
"num_tokens": 139830108.0,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 0.44659206510681587,
|
|
"grad_norm": 1.0478426218032837,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5779,
|
|
"mean_token_accuracy": 0.8240824937820435,
|
|
"num_tokens": 139996848.0,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 0.4471007121057986,
|
|
"grad_norm": 1.070643663406372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5489,
|
|
"mean_token_accuracy": 0.8307640552520752,
|
|
"num_tokens": 140162511.0,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 0.4476093591047813,
|
|
"grad_norm": 1.0568501949310303,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5285,
|
|
"mean_token_accuracy": 0.8367209434509277,
|
|
"num_tokens": 140315404.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.448118006103764,
|
|
"grad_norm": 1.106792688369751,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5716,
|
|
"mean_token_accuracy": 0.8244082927703857,
|
|
"num_tokens": 140482165.0,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 0.4486266531027467,
|
|
"grad_norm": 1.1020784378051758,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5338,
|
|
"mean_token_accuracy": 0.8342344164848328,
|
|
"num_tokens": 140632686.0,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 0.4491353001017294,
|
|
"grad_norm": 1.0874664783477783,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8320925235748291,
|
|
"num_tokens": 140795616.0,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 0.4496439471007121,
|
|
"grad_norm": 1.1139014959335327,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5159,
|
|
"mean_token_accuracy": 0.8387758135795593,
|
|
"num_tokens": 140945277.0,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 0.4501525940996948,
|
|
"grad_norm": 1.216664433479309,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5413,
|
|
"mean_token_accuracy": 0.8327680826187134,
|
|
"num_tokens": 141105845.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.4506612410986775,
|
|
"grad_norm": 1.028967022895813,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8323748111724854,
|
|
"num_tokens": 141261262.0,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 0.45116988809766023,
|
|
"grad_norm": 1.1917214393615723,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5199,
|
|
"mean_token_accuracy": 0.8402753472328186,
|
|
"num_tokens": 141419743.0,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 0.4516785350966429,
|
|
"grad_norm": 1.134547233581543,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5542,
|
|
"mean_token_accuracy": 0.8280611038208008,
|
|
"num_tokens": 141584264.0,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 0.45218718209562564,
|
|
"grad_norm": 0.978706419467926,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8324958086013794,
|
|
"num_tokens": 141744206.0,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 0.4526958290946083,
|
|
"grad_norm": 1.1821465492248535,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.492,
|
|
"mean_token_accuracy": 0.8444518446922302,
|
|
"num_tokens": 141902690.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.45320447609359105,
|
|
"grad_norm": 1.079921841621399,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5396,
|
|
"mean_token_accuracy": 0.8340380191802979,
|
|
"num_tokens": 142051810.0,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 0.4537131230925738,
|
|
"grad_norm": 1.1663280725479126,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5491,
|
|
"mean_token_accuracy": 0.8292901515960693,
|
|
"num_tokens": 142212848.0,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 0.45422177009155645,
|
|
"grad_norm": 1.1193724870681763,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5637,
|
|
"mean_token_accuracy": 0.8273748755455017,
|
|
"num_tokens": 142378279.0,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 0.4547304170905392,
|
|
"grad_norm": 1.0705968141555786,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5554,
|
|
"mean_token_accuracy": 0.8292081952095032,
|
|
"num_tokens": 142533419.0,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 0.45523906408952186,
|
|
"grad_norm": 1.0355265140533447,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5433,
|
|
"mean_token_accuracy": 0.8332676887512207,
|
|
"num_tokens": 142681407.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.4557477110885046,
|
|
"grad_norm": 1.0541002750396729,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5455,
|
|
"mean_token_accuracy": 0.8321433067321777,
|
|
"num_tokens": 142834121.0,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 0.45625635808748727,
|
|
"grad_norm": 1.1078165769577026,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5396,
|
|
"mean_token_accuracy": 0.8324682712554932,
|
|
"num_tokens": 142995941.0,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 0.45676500508647,
|
|
"grad_norm": 1.0441187620162964,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5304,
|
|
"mean_token_accuracy": 0.8361738324165344,
|
|
"num_tokens": 143165685.0,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 0.4572736520854527,
|
|
"grad_norm": 1.1353709697723389,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5243,
|
|
"mean_token_accuracy": 0.8376160860061646,
|
|
"num_tokens": 143324501.0,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 0.4577822990844354,
|
|
"grad_norm": 1.0670422315597534,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8358619809150696,
|
|
"num_tokens": 143485496.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.4582909460834181,
|
|
"grad_norm": 1.0932042598724365,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.8243823051452637,
|
|
"num_tokens": 143640935.0,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 0.4587995930824008,
|
|
"grad_norm": 1.2445162534713745,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5523,
|
|
"mean_token_accuracy": 0.8297701478004456,
|
|
"num_tokens": 143805905.0,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 0.45930824008138355,
|
|
"grad_norm": 1.1864466667175293,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5639,
|
|
"mean_token_accuracy": 0.8264601230621338,
|
|
"num_tokens": 143960760.0,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 0.4598168870803662,
|
|
"grad_norm": 1.0087008476257324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4989,
|
|
"mean_token_accuracy": 0.8451262712478638,
|
|
"num_tokens": 144107657.0,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 0.46032553407934895,
|
|
"grad_norm": 1.4060758352279663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.8316032886505127,
|
|
"num_tokens": 144261684.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.46083418107833163,
|
|
"grad_norm": 1.3151096105575562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5599,
|
|
"mean_token_accuracy": 0.8292255401611328,
|
|
"num_tokens": 144426788.0,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 0.46134282807731436,
|
|
"grad_norm": 0.9935531616210938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.8413008451461792,
|
|
"num_tokens": 144588094.0,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 0.46185147507629704,
|
|
"grad_norm": 1.2904752492904663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5358,
|
|
"mean_token_accuracy": 0.833750307559967,
|
|
"num_tokens": 144742302.0,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 0.46236012207527977,
|
|
"grad_norm": 1.242739200592041,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.8278312683105469,
|
|
"num_tokens": 144889815.0,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 0.46286876907426244,
|
|
"grad_norm": 1.0914876461029053,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5314,
|
|
"mean_token_accuracy": 0.835478663444519,
|
|
"num_tokens": 145041480.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.4633774160732452,
|
|
"grad_norm": 1.1101939678192139,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5203,
|
|
"mean_token_accuracy": 0.8407809138298035,
|
|
"num_tokens": 145207314.0,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 0.46388606307222785,
|
|
"grad_norm": 1.0934215784072876,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8313193917274475,
|
|
"num_tokens": 145365776.0,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 0.4643947100712106,
|
|
"grad_norm": 1.050016164779663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8334711194038391,
|
|
"num_tokens": 145530569.0,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 0.4649033570701933,
|
|
"grad_norm": 1.1077460050582886,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5129,
|
|
"mean_token_accuracy": 0.8392268419265747,
|
|
"num_tokens": 145686829.0,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 0.465412004069176,
|
|
"grad_norm": 1.1671563386917114,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5374,
|
|
"mean_token_accuracy": 0.8326257467269897,
|
|
"num_tokens": 145853471.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.4659206510681587,
|
|
"grad_norm": 1.0945162773132324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8274140357971191,
|
|
"num_tokens": 146031134.0,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 0.4664292980671414,
|
|
"grad_norm": 1.158488392829895,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5865,
|
|
"mean_token_accuracy": 0.8225713968276978,
|
|
"num_tokens": 146186384.0,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 0.4669379450661241,
|
|
"grad_norm": 1.0633790493011475,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5562,
|
|
"mean_token_accuracy": 0.8297000527381897,
|
|
"num_tokens": 146342004.0,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 0.4674465920651068,
|
|
"grad_norm": 1.045977234840393,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8342148065567017,
|
|
"num_tokens": 146496365.0,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 0.46795523906408953,
|
|
"grad_norm": 1.0840096473693848,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.830905556678772,
|
|
"num_tokens": 146660600.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.4684638860630722,
|
|
"grad_norm": 1.1081016063690186,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6068,
|
|
"mean_token_accuracy": 0.8147741556167603,
|
|
"num_tokens": 146823441.0,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 0.46897253306205494,
|
|
"grad_norm": 1.1600236892700195,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5753,
|
|
"mean_token_accuracy": 0.8257789611816406,
|
|
"num_tokens": 146980645.0,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 0.4694811800610376,
|
|
"grad_norm": 1.0981653928756714,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5677,
|
|
"mean_token_accuracy": 0.825057864189148,
|
|
"num_tokens": 147140337.0,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 0.46998982706002035,
|
|
"grad_norm": 1.1312999725341797,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.830898106098175,
|
|
"num_tokens": 147298752.0,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 0.470498474059003,
|
|
"grad_norm": 1.0304560661315918,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4943,
|
|
"mean_token_accuracy": 0.845089316368103,
|
|
"num_tokens": 147452293.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.47100712105798576,
|
|
"grad_norm": 1.380007028579712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5369,
|
|
"mean_token_accuracy": 0.8345775604248047,
|
|
"num_tokens": 147606035.0,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 0.4715157680569685,
|
|
"grad_norm": 1.200478196144104,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5083,
|
|
"mean_token_accuracy": 0.8416787385940552,
|
|
"num_tokens": 147740900.0,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 0.47202441505595116,
|
|
"grad_norm": 1.1169615983963013,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5081,
|
|
"mean_token_accuracy": 0.8411614298820496,
|
|
"num_tokens": 147900796.0,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 0.4725330620549339,
|
|
"grad_norm": 1.1735718250274658,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5557,
|
|
"mean_token_accuracy": 0.8280220031738281,
|
|
"num_tokens": 148048395.0,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 0.47304170905391657,
|
|
"grad_norm": 1.0261050462722778,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4892,
|
|
"mean_token_accuracy": 0.8481217622756958,
|
|
"num_tokens": 148213700.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.4735503560528993,
|
|
"grad_norm": 1.1236721277236938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.8327924013137817,
|
|
"num_tokens": 148360212.0,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 0.474059003051882,
|
|
"grad_norm": 1.1015719175338745,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5319,
|
|
"mean_token_accuracy": 0.835745632648468,
|
|
"num_tokens": 148519502.0,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 0.4745676500508647,
|
|
"grad_norm": 1.4522173404693604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8281233906745911,
|
|
"num_tokens": 148685441.0,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 0.4750762970498474,
|
|
"grad_norm": 1.0404940843582153,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8302342891693115,
|
|
"num_tokens": 148837050.0,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 0.4755849440488301,
|
|
"grad_norm": 1.0455938577651978,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5201,
|
|
"mean_token_accuracy": 0.8396633267402649,
|
|
"num_tokens": 149001259.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.4760935910478128,
|
|
"grad_norm": 1.0345394611358643,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.568,
|
|
"mean_token_accuracy": 0.8266684412956238,
|
|
"num_tokens": 149158085.0,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 0.4766022380467955,
|
|
"grad_norm": 1.1560389995574951,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5066,
|
|
"mean_token_accuracy": 0.8434706330299377,
|
|
"num_tokens": 149306579.0,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 0.47711088504577825,
|
|
"grad_norm": 1.0784151554107666,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5931,
|
|
"mean_token_accuracy": 0.8210408687591553,
|
|
"num_tokens": 149470715.0,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 0.47761953204476093,
|
|
"grad_norm": 1.043968677520752,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.8326769471168518,
|
|
"num_tokens": 149626418.0,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 0.47812817904374366,
|
|
"grad_norm": 1.0808452367782593,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5607,
|
|
"mean_token_accuracy": 0.8269791603088379,
|
|
"num_tokens": 149779243.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.47863682604272634,
|
|
"grad_norm": 0.9930307269096375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5337,
|
|
"mean_token_accuracy": 0.8348826169967651,
|
|
"num_tokens": 149928437.0,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 0.47914547304170907,
|
|
"grad_norm": 1.5441179275512695,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5901,
|
|
"mean_token_accuracy": 0.8194471597671509,
|
|
"num_tokens": 150086890.0,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 0.47965412004069174,
|
|
"grad_norm": 1.1243621110916138,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5035,
|
|
"mean_token_accuracy": 0.8437596559524536,
|
|
"num_tokens": 150246489.0,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 0.4801627670396745,
|
|
"grad_norm": 1.0576550960540771,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5325,
|
|
"mean_token_accuracy": 0.8357880115509033,
|
|
"num_tokens": 150414091.0,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 0.48067141403865715,
|
|
"grad_norm": 1.0353620052337646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.8307743668556213,
|
|
"num_tokens": 150581538.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.4811800610376399,
|
|
"grad_norm": 1.1700119972229004,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8275570869445801,
|
|
"num_tokens": 150749759.0,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 0.48168870803662256,
|
|
"grad_norm": 1.160486102104187,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8344378471374512,
|
|
"num_tokens": 150911352.0,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 0.4821973550356053,
|
|
"grad_norm": 1.0820791721343994,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8328191041946411,
|
|
"num_tokens": 151078517.0,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 0.482706002034588,
|
|
"grad_norm": 1.0750031471252441,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5639,
|
|
"mean_token_accuracy": 0.8277915716171265,
|
|
"num_tokens": 151244643.0,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 0.4832146490335707,
|
|
"grad_norm": 1.0744576454162598,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5193,
|
|
"mean_token_accuracy": 0.8381385207176208,
|
|
"num_tokens": 151404760.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.48372329603255343,
|
|
"grad_norm": 1.1603211164474487,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5303,
|
|
"mean_token_accuracy": 0.8365808725357056,
|
|
"num_tokens": 151555236.0,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 0.4842319430315361,
|
|
"grad_norm": 1.0513938665390015,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4976,
|
|
"mean_token_accuracy": 0.84479820728302,
|
|
"num_tokens": 151714374.0,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 0.48474059003051884,
|
|
"grad_norm": 1.2464724779129028,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5563,
|
|
"mean_token_accuracy": 0.8285852670669556,
|
|
"num_tokens": 151873694.0,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 0.4852492370295015,
|
|
"grad_norm": 1.6022312641143799,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5001,
|
|
"mean_token_accuracy": 0.8462530374526978,
|
|
"num_tokens": 152034628.0,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 0.48575788402848424,
|
|
"grad_norm": 1.1984519958496094,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5662,
|
|
"mean_token_accuracy": 0.8279759883880615,
|
|
"num_tokens": 152182009.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.4862665310274669,
|
|
"grad_norm": 1.0882331132888794,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5118,
|
|
"mean_token_accuracy": 0.8404330015182495,
|
|
"num_tokens": 152336970.0,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 0.48677517802644965,
|
|
"grad_norm": 0.9980266094207764,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5558,
|
|
"mean_token_accuracy": 0.8287216424942017,
|
|
"num_tokens": 152512035.0,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 0.4872838250254323,
|
|
"grad_norm": 1.5697441101074219,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8325050473213196,
|
|
"num_tokens": 152684663.0,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 0.48779247202441506,
|
|
"grad_norm": 1.0851205587387085,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5731,
|
|
"mean_token_accuracy": 0.8240979909896851,
|
|
"num_tokens": 152850154.0,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 0.4883011190233978,
|
|
"grad_norm": 1.0393105745315552,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8336237072944641,
|
|
"num_tokens": 153010247.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.48880976602238047,
|
|
"grad_norm": 0.9916489720344543,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5082,
|
|
"mean_token_accuracy": 0.8411122560501099,
|
|
"num_tokens": 153174299.0,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 0.4893184130213632,
|
|
"grad_norm": 1.0968836545944214,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.837975263595581,
|
|
"num_tokens": 153337306.0,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 0.4898270600203459,
|
|
"grad_norm": 1.5415338277816772,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8305727243423462,
|
|
"num_tokens": 153492940.0,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 0.4903357070193286,
|
|
"grad_norm": 1.1375877857208252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5239,
|
|
"mean_token_accuracy": 0.8357195854187012,
|
|
"num_tokens": 153652759.0,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 0.4908443540183113,
|
|
"grad_norm": 1.08938729763031,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5141,
|
|
"mean_token_accuracy": 0.8405740857124329,
|
|
"num_tokens": 153818682.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.491353001017294,
|
|
"grad_norm": 1.066440463066101,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8383237719535828,
|
|
"num_tokens": 153982904.0,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 0.4918616480162767,
|
|
"grad_norm": 1.0795077085494995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.8331218361854553,
|
|
"num_tokens": 154142701.0,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 0.4923702950152594,
|
|
"grad_norm": 1.1853792667388916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5836,
|
|
"mean_token_accuracy": 0.8225834369659424,
|
|
"num_tokens": 154318454.0,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 0.4928789420142421,
|
|
"grad_norm": 1.0849460363388062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5899,
|
|
"mean_token_accuracy": 0.8213503360748291,
|
|
"num_tokens": 154485998.0,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 0.4933875890132248,
|
|
"grad_norm": 1.0151082277297974,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5389,
|
|
"mean_token_accuracy": 0.8343636393547058,
|
|
"num_tokens": 154646655.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.4938962360122075,
|
|
"grad_norm": 1.0000461339950562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8375341892242432,
|
|
"num_tokens": 154807349.0,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 0.49440488301119023,
|
|
"grad_norm": 1.0775938034057617,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.541,
|
|
"mean_token_accuracy": 0.8329321146011353,
|
|
"num_tokens": 154961491.0,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 0.49491353001017296,
|
|
"grad_norm": 1.1057847738265991,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.8363903164863586,
|
|
"num_tokens": 155114869.0,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 0.49542217700915564,
|
|
"grad_norm": 1.0250566005706787,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5301,
|
|
"mean_token_accuracy": 0.8337911367416382,
|
|
"num_tokens": 155273562.0,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 0.49593082400813837,
|
|
"grad_norm": 1.373247504234314,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4725,
|
|
"mean_token_accuracy": 0.8508755564689636,
|
|
"num_tokens": 155431483.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.49643947100712105,
|
|
"grad_norm": 1.1253653764724731,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5481,
|
|
"mean_token_accuracy": 0.8335919380187988,
|
|
"num_tokens": 155577059.0,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 0.4969481180061038,
|
|
"grad_norm": 1.032317042350769,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5654,
|
|
"mean_token_accuracy": 0.8286297917366028,
|
|
"num_tokens": 155734946.0,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 0.49745676500508645,
|
|
"grad_norm": 1.1771360635757446,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.833104133605957,
|
|
"num_tokens": 155896969.0,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 0.4979654120040692,
|
|
"grad_norm": 1.0790753364562988,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5577,
|
|
"mean_token_accuracy": 0.8295459747314453,
|
|
"num_tokens": 156060582.0,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 0.49847405900305186,
|
|
"grad_norm": 1.1007190942764282,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5259,
|
|
"mean_token_accuracy": 0.83788001537323,
|
|
"num_tokens": 156221661.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.4989827060020346,
|
|
"grad_norm": 1.074344277381897,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5278,
|
|
"mean_token_accuracy": 0.8362329602241516,
|
|
"num_tokens": 156378238.0,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 0.49949135300101727,
|
|
"grad_norm": 1.1053829193115234,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5745,
|
|
"mean_token_accuracy": 0.8221993446350098,
|
|
"num_tokens": 156538559.0,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.070326805114746,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8299859762191772,
|
|
"num_tokens": 156700036.0,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 0.5005086469989827,
|
|
"grad_norm": 0.9463331699371338,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5565,
|
|
"mean_token_accuracy": 0.8299938440322876,
|
|
"num_tokens": 156879118.0,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 0.5010172939979655,
|
|
"grad_norm": 1.009292721748352,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4972,
|
|
"mean_token_accuracy": 0.8462924957275391,
|
|
"num_tokens": 157043565.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.5015259409969481,
|
|
"grad_norm": 1.0810712575912476,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5365,
|
|
"mean_token_accuracy": 0.833280086517334,
|
|
"num_tokens": 157198178.0,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 0.5020345879959308,
|
|
"grad_norm": 1.0298830270767212,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5399,
|
|
"mean_token_accuracy": 0.8341619968414307,
|
|
"num_tokens": 157349621.0,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 0.5025432349949135,
|
|
"grad_norm": 1.095278024673462,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5261,
|
|
"mean_token_accuracy": 0.8374025225639343,
|
|
"num_tokens": 157519093.0,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 0.5030518819938963,
|
|
"grad_norm": 1.0355744361877441,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8360305428504944,
|
|
"num_tokens": 157671835.0,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 0.503560528992879,
|
|
"grad_norm": 1.0776753425598145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.526,
|
|
"mean_token_accuracy": 0.8369237780570984,
|
|
"num_tokens": 157826508.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.5040691759918616,
|
|
"grad_norm": 1.1481845378875732,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8297188878059387,
|
|
"num_tokens": 157983854.0,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 0.5045778229908443,
|
|
"grad_norm": 1.1202970743179321,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5434,
|
|
"mean_token_accuracy": 0.8322055339813232,
|
|
"num_tokens": 158153856.0,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 0.5050864699898271,
|
|
"grad_norm": 0.963240921497345,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5158,
|
|
"mean_token_accuracy": 0.8408123254776001,
|
|
"num_tokens": 158311723.0,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 0.5055951169888098,
|
|
"grad_norm": 1.0589247941970825,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5716,
|
|
"mean_token_accuracy": 0.8279574513435364,
|
|
"num_tokens": 158468860.0,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 0.5061037639877924,
|
|
"grad_norm": 1.0449001789093018,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.547,
|
|
"mean_token_accuracy": 0.8317117691040039,
|
|
"num_tokens": 158623811.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.5066124109867752,
|
|
"grad_norm": 1.028795838356018,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.532,
|
|
"mean_token_accuracy": 0.8349628448486328,
|
|
"num_tokens": 158789067.0,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 0.5071210579857579,
|
|
"grad_norm": 1.1221290826797485,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5566,
|
|
"mean_token_accuracy": 0.8285014629364014,
|
|
"num_tokens": 158950421.0,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 0.5076297049847406,
|
|
"grad_norm": 1.1094428300857544,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8371928930282593,
|
|
"num_tokens": 159115006.0,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 0.5081383519837233,
|
|
"grad_norm": 1.1847496032714844,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5235,
|
|
"mean_token_accuracy": 0.8361454010009766,
|
|
"num_tokens": 159267147.0,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 0.508646998982706,
|
|
"grad_norm": 1.0200287103652954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.516,
|
|
"mean_token_accuracy": 0.8391585946083069,
|
|
"num_tokens": 159430845.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.5091556459816887,
|
|
"grad_norm": 1.096622109413147,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8338131904602051,
|
|
"num_tokens": 159592673.0,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 0.5096642929806714,
|
|
"grad_norm": 1.1021353006362915,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8303905725479126,
|
|
"num_tokens": 159766028.0,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 0.5101729399796541,
|
|
"grad_norm": 1.0417699813842773,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5339,
|
|
"mean_token_accuracy": 0.8354570865631104,
|
|
"num_tokens": 159928990.0,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 0.5106815869786369,
|
|
"grad_norm": 1.0172051191329956,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4989,
|
|
"mean_token_accuracy": 0.8456557989120483,
|
|
"num_tokens": 160093084.0,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 0.5111902339776195,
|
|
"grad_norm": 1.036210536956787,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5578,
|
|
"mean_token_accuracy": 0.8279012441635132,
|
|
"num_tokens": 160253282.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.5116988809766022,
|
|
"grad_norm": 1.0002751350402832,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5162,
|
|
"mean_token_accuracy": 0.8411789536476135,
|
|
"num_tokens": 160414551.0,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 0.512207527975585,
|
|
"grad_norm": 1.19473397731781,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8280037641525269,
|
|
"num_tokens": 160566903.0,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 0.5127161749745677,
|
|
"grad_norm": 0.9549198746681213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5119,
|
|
"mean_token_accuracy": 0.842778742313385,
|
|
"num_tokens": 160732492.0,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 0.5132248219735503,
|
|
"grad_norm": 1.104193091392517,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5599,
|
|
"mean_token_accuracy": 0.8274698853492737,
|
|
"num_tokens": 160901405.0,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 0.513733468972533,
|
|
"grad_norm": 1.0524988174438477,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5201,
|
|
"mean_token_accuracy": 0.8397318124771118,
|
|
"num_tokens": 161063180.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.5142421159715158,
|
|
"grad_norm": 1.068765640258789,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5529,
|
|
"mean_token_accuracy": 0.831360936164856,
|
|
"num_tokens": 161224667.0,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 0.5147507629704985,
|
|
"grad_norm": 1.076060175895691,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8327022790908813,
|
|
"num_tokens": 161375840.0,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 0.5152594099694812,
|
|
"grad_norm": 1.0867894887924194,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8314119577407837,
|
|
"num_tokens": 161539381.0,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 0.5157680569684638,
|
|
"grad_norm": 1.0638878345489502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8310900926589966,
|
|
"num_tokens": 161712451.0,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 0.5162767039674466,
|
|
"grad_norm": 1.0727468729019165,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5481,
|
|
"mean_token_accuracy": 0.8312073945999146,
|
|
"num_tokens": 161891850.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.5167853509664293,
|
|
"grad_norm": 1.1078076362609863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5165,
|
|
"mean_token_accuracy": 0.8393900394439697,
|
|
"num_tokens": 162053742.0,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 0.517293997965412,
|
|
"grad_norm": 1.01996648311615,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5195,
|
|
"mean_token_accuracy": 0.8393486738204956,
|
|
"num_tokens": 162209650.0,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 0.5178026449643948,
|
|
"grad_norm": 1.0445448160171509,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5701,
|
|
"mean_token_accuracy": 0.8260886669158936,
|
|
"num_tokens": 162383502.0,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 0.5183112919633774,
|
|
"grad_norm": 1.0632621049880981,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5055,
|
|
"mean_token_accuracy": 0.8429189324378967,
|
|
"num_tokens": 162544656.0,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 0.5188199389623601,
|
|
"grad_norm": 1.0640442371368408,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5447,
|
|
"mean_token_accuracy": 0.8306871056556702,
|
|
"num_tokens": 162721800.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.5193285859613428,
|
|
"grad_norm": 1.1594151258468628,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8341934680938721,
|
|
"num_tokens": 162876310.0,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 0.5198372329603256,
|
|
"grad_norm": 0.9929187297821045,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5354,
|
|
"mean_token_accuracy": 0.8347718715667725,
|
|
"num_tokens": 163040860.0,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 0.5203458799593083,
|
|
"grad_norm": 1.0557368993759155,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8325394988059998,
|
|
"num_tokens": 163207134.0,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 0.5208545269582909,
|
|
"grad_norm": 1.1534385681152344,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.8368192911148071,
|
|
"num_tokens": 163354092.0,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 0.5213631739572736,
|
|
"grad_norm": 1.0177485942840576,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.8332100510597229,
|
|
"num_tokens": 163506047.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.5218718209562564,
|
|
"grad_norm": 1.1139676570892334,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5349,
|
|
"mean_token_accuracy": 0.8346984386444092,
|
|
"num_tokens": 163677302.0,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 0.5223804679552391,
|
|
"grad_norm": 1.1026079654693604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5222,
|
|
"mean_token_accuracy": 0.8363239169120789,
|
|
"num_tokens": 163838092.0,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 0.5228891149542217,
|
|
"grad_norm": 1.0351805686950684,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.513,
|
|
"mean_token_accuracy": 0.8408992290496826,
|
|
"num_tokens": 163993198.0,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 0.5233977619532044,
|
|
"grad_norm": 1.3836876153945923,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.8245905637741089,
|
|
"num_tokens": 164159543.0,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 0.5239064089521872,
|
|
"grad_norm": 1.176855444908142,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5256,
|
|
"mean_token_accuracy": 0.8370753526687622,
|
|
"num_tokens": 164325603.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.5244150559511699,
|
|
"grad_norm": 1.0500699281692505,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5213,
|
|
"mean_token_accuracy": 0.8387798070907593,
|
|
"num_tokens": 164481267.0,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 0.5249237029501526,
|
|
"grad_norm": 1.14847731590271,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5077,
|
|
"mean_token_accuracy": 0.8410720825195312,
|
|
"num_tokens": 164634955.0,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 0.5254323499491353,
|
|
"grad_norm": 1.0690829753875732,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5138,
|
|
"mean_token_accuracy": 0.8392068147659302,
|
|
"num_tokens": 164788873.0,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 0.525940996948118,
|
|
"grad_norm": 1.161458969116211,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5155,
|
|
"mean_token_accuracy": 0.838508129119873,
|
|
"num_tokens": 164949736.0,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 0.5264496439471007,
|
|
"grad_norm": 1.0988707542419434,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8288130164146423,
|
|
"num_tokens": 165111991.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.5269582909460834,
|
|
"grad_norm": 1.038994312286377,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.52,
|
|
"mean_token_accuracy": 0.8401618003845215,
|
|
"num_tokens": 165268681.0,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 0.5274669379450662,
|
|
"grad_norm": 0.9615954756736755,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5043,
|
|
"mean_token_accuracy": 0.8433043956756592,
|
|
"num_tokens": 165439160.0,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 0.5279755849440488,
|
|
"grad_norm": 0.9413687586784363,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5042,
|
|
"mean_token_accuracy": 0.8420565128326416,
|
|
"num_tokens": 165611395.0,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 0.5284842319430315,
|
|
"grad_norm": 1.0146604776382446,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8289092183113098,
|
|
"num_tokens": 165770065.0,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 0.5289928789420142,
|
|
"grad_norm": 1.0559642314910889,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5718,
|
|
"mean_token_accuracy": 0.8239895105361938,
|
|
"num_tokens": 165930541.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.529501525940997,
|
|
"grad_norm": 1.0382318496704102,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8355600237846375,
|
|
"num_tokens": 166095798.0,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 0.5300101729399797,
|
|
"grad_norm": 1.1315993070602417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5313,
|
|
"mean_token_accuracy": 0.8369777202606201,
|
|
"num_tokens": 166253912.0,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 0.5305188199389623,
|
|
"grad_norm": 1.0437849760055542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5123,
|
|
"mean_token_accuracy": 0.8429243564605713,
|
|
"num_tokens": 166414865.0,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 0.5310274669379451,
|
|
"grad_norm": 1.1972240209579468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5492,
|
|
"mean_token_accuracy": 0.8301947116851807,
|
|
"num_tokens": 166570026.0,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 0.5315361139369278,
|
|
"grad_norm": 1.0601789951324463,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5288,
|
|
"mean_token_accuracy": 0.8359141945838928,
|
|
"num_tokens": 166723092.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.5320447609359105,
|
|
"grad_norm": 1.1009857654571533,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5452,
|
|
"mean_token_accuracy": 0.8325526714324951,
|
|
"num_tokens": 166891414.0,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 0.5325534079348931,
|
|
"grad_norm": 1.0576329231262207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8268650770187378,
|
|
"num_tokens": 167043099.0,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 0.5330620549338759,
|
|
"grad_norm": 1.107357144355774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.8358545303344727,
|
|
"num_tokens": 167199448.0,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 0.5335707019328586,
|
|
"grad_norm": 1.069495677947998,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.546,
|
|
"mean_token_accuracy": 0.8324536681175232,
|
|
"num_tokens": 167356786.0,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 0.5340793489318413,
|
|
"grad_norm": 1.0422338247299194,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8359999060630798,
|
|
"num_tokens": 167508836.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.534587995930824,
|
|
"grad_norm": 1.057760238647461,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5194,
|
|
"mean_token_accuracy": 0.8387189507484436,
|
|
"num_tokens": 167670690.0,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 0.5350966429298067,
|
|
"grad_norm": 1.1019717454910278,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5587,
|
|
"mean_token_accuracy": 0.8284574151039124,
|
|
"num_tokens": 167821540.0,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 0.5356052899287894,
|
|
"grad_norm": 1.0659122467041016,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5128,
|
|
"mean_token_accuracy": 0.8404115438461304,
|
|
"num_tokens": 167998236.0,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 0.5361139369277721,
|
|
"grad_norm": 1.0259801149368286,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5335,
|
|
"mean_token_accuracy": 0.8355931043624878,
|
|
"num_tokens": 168163810.0,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 0.5366225839267549,
|
|
"grad_norm": 1.013709306716919,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5012,
|
|
"mean_token_accuracy": 0.8424390554428101,
|
|
"num_tokens": 168312526.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.5371312309257376,
|
|
"grad_norm": 1.0958232879638672,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5175,
|
|
"mean_token_accuracy": 0.8409861326217651,
|
|
"num_tokens": 168465144.0,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 0.5376398779247202,
|
|
"grad_norm": 1.0535881519317627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8389724493026733,
|
|
"num_tokens": 168621602.0,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 0.5381485249237029,
|
|
"grad_norm": 1.065081238746643,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5579,
|
|
"mean_token_accuracy": 0.8279095888137817,
|
|
"num_tokens": 168772296.0,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 0.5386571719226857,
|
|
"grad_norm": 1.122185468673706,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5207,
|
|
"mean_token_accuracy": 0.8379378914833069,
|
|
"num_tokens": 168930909.0,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 0.5391658189216684,
|
|
"grad_norm": 1.0356076955795288,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5481,
|
|
"mean_token_accuracy": 0.8306833505630493,
|
|
"num_tokens": 169084941.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.539674465920651,
|
|
"grad_norm": 1.203890323638916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.516,
|
|
"mean_token_accuracy": 0.8395755290985107,
|
|
"num_tokens": 169235523.0,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 0.5401831129196337,
|
|
"grad_norm": 1.3647130727767944,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.516,
|
|
"mean_token_accuracy": 0.8395922183990479,
|
|
"num_tokens": 169398931.0,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 0.5406917599186165,
|
|
"grad_norm": 1.1263409852981567,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5113,
|
|
"mean_token_accuracy": 0.8389729261398315,
|
|
"num_tokens": 169559733.0,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 0.5412004069175992,
|
|
"grad_norm": 0.9808051586151123,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5292,
|
|
"mean_token_accuracy": 0.8377224206924438,
|
|
"num_tokens": 169713890.0,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 0.5417090539165819,
|
|
"grad_norm": 1.0912060737609863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.8408249020576477,
|
|
"num_tokens": 169872319.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.5422177009155646,
|
|
"grad_norm": 1.1292952299118042,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5675,
|
|
"mean_token_accuracy": 0.8258577585220337,
|
|
"num_tokens": 170042021.0,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 0.5427263479145473,
|
|
"grad_norm": 1.1030066013336182,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5364,
|
|
"mean_token_accuracy": 0.8327227830886841,
|
|
"num_tokens": 170190297.0,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 0.54323499491353,
|
|
"grad_norm": 1.0513752698898315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5024,
|
|
"mean_token_accuracy": 0.8434488773345947,
|
|
"num_tokens": 170344681.0,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 0.5437436419125127,
|
|
"grad_norm": 1.0956968069076538,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5461,
|
|
"mean_token_accuracy": 0.8309398889541626,
|
|
"num_tokens": 170503123.0,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 0.5442522889114955,
|
|
"grad_norm": 1.0473368167877197,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5228,
|
|
"mean_token_accuracy": 0.8388941287994385,
|
|
"num_tokens": 170662002.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.5447609359104781,
|
|
"grad_norm": 1.2247461080551147,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5789,
|
|
"mean_token_accuracy": 0.8226563334465027,
|
|
"num_tokens": 170819743.0,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 0.5452695829094608,
|
|
"grad_norm": 0.992435872554779,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5033,
|
|
"mean_token_accuracy": 0.8444223999977112,
|
|
"num_tokens": 170965381.0,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 0.5457782299084435,
|
|
"grad_norm": 1.111801266670227,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5441,
|
|
"mean_token_accuracy": 0.8337465524673462,
|
|
"num_tokens": 171118434.0,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 0.5462868769074263,
|
|
"grad_norm": 0.9795582294464111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.583,
|
|
"mean_token_accuracy": 0.8244104981422424,
|
|
"num_tokens": 171297534.0,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 0.546795523906409,
|
|
"grad_norm": 1.0069224834442139,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5169,
|
|
"mean_token_accuracy": 0.8392422199249268,
|
|
"num_tokens": 171454468.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.5473041709053916,
|
|
"grad_norm": 1.1006546020507812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5477,
|
|
"mean_token_accuracy": 0.830169677734375,
|
|
"num_tokens": 171615049.0,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 0.5478128179043744,
|
|
"grad_norm": 1.012514591217041,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8388389348983765,
|
|
"num_tokens": 171777974.0,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 0.5483214649033571,
|
|
"grad_norm": 1.052994728088379,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5153,
|
|
"mean_token_accuracy": 0.8392099738121033,
|
|
"num_tokens": 171931276.0,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 0.5488301119023398,
|
|
"grad_norm": 1.0102014541625977,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5226,
|
|
"mean_token_accuracy": 0.838862419128418,
|
|
"num_tokens": 172092986.0,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 0.5493387589013224,
|
|
"grad_norm": 1.006404161453247,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8348338603973389,
|
|
"num_tokens": 172259334.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.5498474059003052,
|
|
"grad_norm": 1.0299087762832642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5221,
|
|
"mean_token_accuracy": 0.840072751045227,
|
|
"num_tokens": 172432734.0,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 0.5503560528992879,
|
|
"grad_norm": 1.0475842952728271,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.840729296207428,
|
|
"num_tokens": 172580086.0,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 0.5508646998982706,
|
|
"grad_norm": 1.0941567420959473,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5321,
|
|
"mean_token_accuracy": 0.8355224132537842,
|
|
"num_tokens": 172733854.0,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 0.5513733468972533,
|
|
"grad_norm": 1.0811351537704468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5478,
|
|
"mean_token_accuracy": 0.8315902948379517,
|
|
"num_tokens": 172895819.0,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 0.551881993896236,
|
|
"grad_norm": 1.003430724143982,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5164,
|
|
"mean_token_accuracy": 0.8390946388244629,
|
|
"num_tokens": 173057284.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.5523906408952187,
|
|
"grad_norm": 1.0279691219329834,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5143,
|
|
"mean_token_accuracy": 0.8408864140510559,
|
|
"num_tokens": 173207899.0,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 0.5528992878942014,
|
|
"grad_norm": 1.1198608875274658,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5385,
|
|
"mean_token_accuracy": 0.8336430788040161,
|
|
"num_tokens": 173378839.0,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 0.5534079348931842,
|
|
"grad_norm": 1.075195074081421,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5266,
|
|
"mean_token_accuracy": 0.8351539373397827,
|
|
"num_tokens": 173547290.0,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 0.5539165818921669,
|
|
"grad_norm": 1.0774073600769043,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5401,
|
|
"mean_token_accuracy": 0.8340626955032349,
|
|
"num_tokens": 173706562.0,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 0.5544252288911495,
|
|
"grad_norm": 1.069839358329773,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8282644748687744,
|
|
"num_tokens": 173872148.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.5549338758901322,
|
|
"grad_norm": 1.0688462257385254,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.8376371264457703,
|
|
"num_tokens": 174021510.0,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 0.555442522889115,
|
|
"grad_norm": 1.1761980056762695,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.566,
|
|
"mean_token_accuracy": 0.8264144659042358,
|
|
"num_tokens": 174174594.0,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 0.5559511698880977,
|
|
"grad_norm": 1.1928164958953857,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5519,
|
|
"mean_token_accuracy": 0.8272026181221008,
|
|
"num_tokens": 174327672.0,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 0.5564598168870803,
|
|
"grad_norm": 1.1046916246414185,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5155,
|
|
"mean_token_accuracy": 0.8390681743621826,
|
|
"num_tokens": 174483118.0,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 0.556968463886063,
|
|
"grad_norm": 1.0523358583450317,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5209,
|
|
"mean_token_accuracy": 0.8369934558868408,
|
|
"num_tokens": 174644489.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.5574771108850458,
|
|
"grad_norm": 1.1735951900482178,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5318,
|
|
"mean_token_accuracy": 0.8356931209564209,
|
|
"num_tokens": 174798555.0,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 0.5579857578840285,
|
|
"grad_norm": 1.0315639972686768,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5258,
|
|
"mean_token_accuracy": 0.837049126625061,
|
|
"num_tokens": 174955732.0,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 0.5584944048830112,
|
|
"grad_norm": 0.9019716382026672,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5236,
|
|
"mean_token_accuracy": 0.8368342518806458,
|
|
"num_tokens": 175130207.0,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 0.559003051881994,
|
|
"grad_norm": 1.1175049543380737,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.507,
|
|
"mean_token_accuracy": 0.8416058421134949,
|
|
"num_tokens": 175283656.0,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 0.5595116988809766,
|
|
"grad_norm": 1.0007203817367554,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5157,
|
|
"mean_token_accuracy": 0.8406989574432373,
|
|
"num_tokens": 175445505.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.5600203458799593,
|
|
"grad_norm": 1.105495572090149,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5244,
|
|
"mean_token_accuracy": 0.8378565907478333,
|
|
"num_tokens": 175600838.0,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 0.560528992878942,
|
|
"grad_norm": 1.0666669607162476,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.552,
|
|
"mean_token_accuracy": 0.8297303915023804,
|
|
"num_tokens": 175767713.0,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 0.5610376398779248,
|
|
"grad_norm": 1.0583263635635376,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5514,
|
|
"mean_token_accuracy": 0.8285160064697266,
|
|
"num_tokens": 175927627.0,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 0.5615462868769074,
|
|
"grad_norm": 1.0607812404632568,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8328511714935303,
|
|
"num_tokens": 176094548.0,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 0.5620549338758901,
|
|
"grad_norm": 1.0297958850860596,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.525,
|
|
"mean_token_accuracy": 0.8363398313522339,
|
|
"num_tokens": 176249226.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.5625635808748728,
|
|
"grad_norm": 0.9327942132949829,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5301,
|
|
"mean_token_accuracy": 0.8366088271141052,
|
|
"num_tokens": 176419770.0,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 0.5630722278738556,
|
|
"grad_norm": 1.3391690254211426,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5127,
|
|
"mean_token_accuracy": 0.8409700393676758,
|
|
"num_tokens": 176579372.0,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 0.5635808748728383,
|
|
"grad_norm": 1.161334753036499,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.55,
|
|
"mean_token_accuracy": 0.830707848072052,
|
|
"num_tokens": 176731375.0,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 0.5640895218718209,
|
|
"grad_norm": 0.9866883158683777,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5171,
|
|
"mean_token_accuracy": 0.8403599262237549,
|
|
"num_tokens": 176882380.0,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 0.5645981688708036,
|
|
"grad_norm": 1.0278397798538208,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5141,
|
|
"mean_token_accuracy": 0.8405447006225586,
|
|
"num_tokens": 177037040.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.5651068158697864,
|
|
"grad_norm": 1.0140935182571411,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5535,
|
|
"mean_token_accuracy": 0.8283101320266724,
|
|
"num_tokens": 177193855.0,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 0.5656154628687691,
|
|
"grad_norm": 1.0519375801086426,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8352972269058228,
|
|
"num_tokens": 177353869.0,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 0.5661241098677517,
|
|
"grad_norm": 1.0424383878707886,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8369056582450867,
|
|
"num_tokens": 177503862.0,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 0.5666327568667345,
|
|
"grad_norm": 1.0207585096359253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8427259922027588,
|
|
"num_tokens": 177663239.0,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 0.5671414038657172,
|
|
"grad_norm": 1.1228545904159546,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.8402339220046997,
|
|
"num_tokens": 177822738.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.5676500508646999,
|
|
"grad_norm": 1.2176703214645386,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5286,
|
|
"mean_token_accuracy": 0.836188793182373,
|
|
"num_tokens": 177989713.0,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 0.5681586978636826,
|
|
"grad_norm": 1.0226936340332031,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.8335666656494141,
|
|
"num_tokens": 178140529.0,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 0.5686673448626653,
|
|
"grad_norm": 0.9989066123962402,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5325,
|
|
"mean_token_accuracy": 0.8349342942237854,
|
|
"num_tokens": 178311252.0,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 0.569175991861648,
|
|
"grad_norm": 1.1514434814453125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5541,
|
|
"mean_token_accuracy": 0.828987181186676,
|
|
"num_tokens": 178475164.0,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 0.5696846388606307,
|
|
"grad_norm": 1.0415675640106201,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5576,
|
|
"mean_token_accuracy": 0.8286171555519104,
|
|
"num_tokens": 178635634.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.5701932858596134,
|
|
"grad_norm": 1.0814350843429565,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.557,
|
|
"mean_token_accuracy": 0.8278229236602783,
|
|
"num_tokens": 178786348.0,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 0.5707019328585962,
|
|
"grad_norm": 0.9982712268829346,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8349918127059937,
|
|
"num_tokens": 178948373.0,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 0.5712105798575788,
|
|
"grad_norm": 1.150430679321289,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4993,
|
|
"mean_token_accuracy": 0.8427037000656128,
|
|
"num_tokens": 179104122.0,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 0.5717192268565615,
|
|
"grad_norm": 0.967299222946167,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5227,
|
|
"mean_token_accuracy": 0.8367307782173157,
|
|
"num_tokens": 179265598.0,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 0.5722278738555443,
|
|
"grad_norm": 0.9581523537635803,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8328096866607666,
|
|
"num_tokens": 179417035.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.572736520854527,
|
|
"grad_norm": 1.031461477279663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5154,
|
|
"mean_token_accuracy": 0.8411537408828735,
|
|
"num_tokens": 179577100.0,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 0.5732451678535097,
|
|
"grad_norm": 0.9838495254516602,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8309280872344971,
|
|
"num_tokens": 179744420.0,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 0.5737538148524923,
|
|
"grad_norm": 1.0091938972473145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.8353382349014282,
|
|
"num_tokens": 179892505.0,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 0.5742624618514751,
|
|
"grad_norm": 1.0297493934631348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5522,
|
|
"mean_token_accuracy": 0.8297716379165649,
|
|
"num_tokens": 180051796.0,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 0.5747711088504578,
|
|
"grad_norm": 0.9671316742897034,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5245,
|
|
"mean_token_accuracy": 0.8383359313011169,
|
|
"num_tokens": 180223161.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.5752797558494405,
|
|
"grad_norm": 0.9914503693580627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5439,
|
|
"mean_token_accuracy": 0.8331002593040466,
|
|
"num_tokens": 180381038.0,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 0.5757884028484231,
|
|
"grad_norm": 1.0318700075149536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8343907594680786,
|
|
"num_tokens": 180529292.0,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 0.5762970498474059,
|
|
"grad_norm": 1.076903223991394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5127,
|
|
"mean_token_accuracy": 0.8403801918029785,
|
|
"num_tokens": 180673192.0,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 0.5768056968463886,
|
|
"grad_norm": 1.013776183128357,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5254,
|
|
"mean_token_accuracy": 0.8366549015045166,
|
|
"num_tokens": 180831990.0,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 0.5773143438453713,
|
|
"grad_norm": 0.9801108241081238,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5405,
|
|
"mean_token_accuracy": 0.8329926133155823,
|
|
"num_tokens": 180994135.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.5778229908443541,
|
|
"grad_norm": 1.0288931131362915,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8331727981567383,
|
|
"num_tokens": 181158179.0,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 0.5783316378433367,
|
|
"grad_norm": 0.9568202495574951,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5099,
|
|
"mean_token_accuracy": 0.8413141965866089,
|
|
"num_tokens": 181316469.0,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 0.5788402848423194,
|
|
"grad_norm": 0.9759686589241028,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5088,
|
|
"mean_token_accuracy": 0.8403933048248291,
|
|
"num_tokens": 181465876.0,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 0.5793489318413021,
|
|
"grad_norm": 0.9994838237762451,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5034,
|
|
"mean_token_accuracy": 0.842954695224762,
|
|
"num_tokens": 181626190.0,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 0.5798575788402849,
|
|
"grad_norm": 1.0594063997268677,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5766,
|
|
"mean_token_accuracy": 0.8225234746932983,
|
|
"num_tokens": 181772486.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.5803662258392676,
|
|
"grad_norm": 0.9984065294265747,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.8352517485618591,
|
|
"num_tokens": 181930828.0,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 0.5808748728382502,
|
|
"grad_norm": 0.9843501448631287,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5304,
|
|
"mean_token_accuracy": 0.8349761366844177,
|
|
"num_tokens": 182092553.0,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 0.5813835198372329,
|
|
"grad_norm": 1.0099085569381714,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.834098219871521,
|
|
"num_tokens": 182242889.0,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 0.5818921668362157,
|
|
"grad_norm": 1.0297266244888306,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5475,
|
|
"mean_token_accuracy": 0.8309882879257202,
|
|
"num_tokens": 182399236.0,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 0.5824008138351984,
|
|
"grad_norm": 0.9727159142494202,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5545,
|
|
"mean_token_accuracy": 0.8294179439544678,
|
|
"num_tokens": 182555200.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.582909460834181,
|
|
"grad_norm": 1.0565613508224487,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5345,
|
|
"mean_token_accuracy": 0.8345987796783447,
|
|
"num_tokens": 182711095.0,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 0.5834181078331638,
|
|
"grad_norm": 0.9976463317871094,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5551,
|
|
"mean_token_accuracy": 0.8283690810203552,
|
|
"num_tokens": 182884563.0,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 0.5839267548321465,
|
|
"grad_norm": 1.0735167264938354,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8263883590698242,
|
|
"num_tokens": 183045279.0,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 0.5844354018311292,
|
|
"grad_norm": 1.0010398626327515,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.535,
|
|
"mean_token_accuracy": 0.8358119130134583,
|
|
"num_tokens": 183209431.0,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 0.5849440488301119,
|
|
"grad_norm": 1.0518771409988403,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5117,
|
|
"mean_token_accuracy": 0.8412382006645203,
|
|
"num_tokens": 183377751.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.5854526958290946,
|
|
"grad_norm": 1.0084384679794312,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.514,
|
|
"mean_token_accuracy": 0.8410466313362122,
|
|
"num_tokens": 183530201.0,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 0.5859613428280773,
|
|
"grad_norm": 1.0706181526184082,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8333868980407715,
|
|
"num_tokens": 183676743.0,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 0.58646998982706,
|
|
"grad_norm": 0.9817941188812256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5193,
|
|
"mean_token_accuracy": 0.8401201963424683,
|
|
"num_tokens": 183838597.0,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 0.5869786368260427,
|
|
"grad_norm": 1.0238702297210693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5262,
|
|
"mean_token_accuracy": 0.8377246260643005,
|
|
"num_tokens": 184006491.0,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 0.5874872838250255,
|
|
"grad_norm": 0.975884199142456,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5243,
|
|
"mean_token_accuracy": 0.8374344706535339,
|
|
"num_tokens": 184170537.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.5879959308240081,
|
|
"grad_norm": 1.1449670791625977,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5142,
|
|
"mean_token_accuracy": 0.8390841484069824,
|
|
"num_tokens": 184334536.0,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 0.5885045778229908,
|
|
"grad_norm": 1.1105258464813232,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5109,
|
|
"mean_token_accuracy": 0.8375200033187866,
|
|
"num_tokens": 184489719.0,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 0.5890132248219736,
|
|
"grad_norm": 1.1190193891525269,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5498,
|
|
"mean_token_accuracy": 0.8310710191726685,
|
|
"num_tokens": 184644555.0,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 0.5895218718209563,
|
|
"grad_norm": 1.0898367166519165,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5203,
|
|
"mean_token_accuracy": 0.8381561040878296,
|
|
"num_tokens": 184803771.0,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 0.590030518819939,
|
|
"grad_norm": 1.0364891290664673,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5385,
|
|
"mean_token_accuracy": 0.832038164138794,
|
|
"num_tokens": 184956764.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.5905391658189216,
|
|
"grad_norm": 1.103318691253662,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.51,
|
|
"mean_token_accuracy": 0.8410016298294067,
|
|
"num_tokens": 185122654.0,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 0.5910478128179044,
|
|
"grad_norm": 1.0777701139450073,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8325847387313843,
|
|
"num_tokens": 185278874.0,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 0.5915564598168871,
|
|
"grad_norm": 1.1580455303192139,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.8376891613006592,
|
|
"num_tokens": 185438919.0,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 0.5920651068158698,
|
|
"grad_norm": 1.1831949949264526,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5168,
|
|
"mean_token_accuracy": 0.8394304513931274,
|
|
"num_tokens": 185603559.0,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 0.5925737538148524,
|
|
"grad_norm": 1.07766592502594,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5117,
|
|
"mean_token_accuracy": 0.8413243293762207,
|
|
"num_tokens": 185745740.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.5930824008138352,
|
|
"grad_norm": 1.0141584873199463,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.8322431445121765,
|
|
"num_tokens": 185904158.0,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 0.5935910478128179,
|
|
"grad_norm": 1.1730612516403198,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.53,
|
|
"mean_token_accuracy": 0.8371715545654297,
|
|
"num_tokens": 186062692.0,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 0.5940996948118006,
|
|
"grad_norm": 1.1393283605575562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5071,
|
|
"mean_token_accuracy": 0.8407073020935059,
|
|
"num_tokens": 186208990.0,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 0.5946083418107834,
|
|
"grad_norm": 1.0351285934448242,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8370941877365112,
|
|
"num_tokens": 186353260.0,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 0.595116988809766,
|
|
"grad_norm": 1.1622053384780884,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5739,
|
|
"mean_token_accuracy": 0.8237625360488892,
|
|
"num_tokens": 186527391.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.5956256358087487,
|
|
"grad_norm": 1.1217572689056396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.544,
|
|
"mean_token_accuracy": 0.8346309065818787,
|
|
"num_tokens": 186690975.0,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 0.5961342828077314,
|
|
"grad_norm": 1.0588243007659912,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4896,
|
|
"mean_token_accuracy": 0.8487622737884521,
|
|
"num_tokens": 186851711.0,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 0.5966429298067142,
|
|
"grad_norm": 1.043603539466858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5245,
|
|
"mean_token_accuracy": 0.8333913087844849,
|
|
"num_tokens": 187016950.0,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 0.5971515768056969,
|
|
"grad_norm": 1.1444929838180542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8336371183395386,
|
|
"num_tokens": 187158410.0,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 0.5976602238046795,
|
|
"grad_norm": 1.0118918418884277,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4959,
|
|
"mean_token_accuracy": 0.8448071479797363,
|
|
"num_tokens": 187319518.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.5981688708036622,
|
|
"grad_norm": 1.0308904647827148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5207,
|
|
"mean_token_accuracy": 0.8391602039337158,
|
|
"num_tokens": 187492387.0,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 0.598677517802645,
|
|
"grad_norm": 1.0049819946289062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5559,
|
|
"mean_token_accuracy": 0.8291609287261963,
|
|
"num_tokens": 187660820.0,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 0.5991861648016277,
|
|
"grad_norm": 1.1178535223007202,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5368,
|
|
"mean_token_accuracy": 0.8333885073661804,
|
|
"num_tokens": 187815200.0,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 0.5996948118006104,
|
|
"grad_norm": 1.0717005729675293,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5124,
|
|
"mean_token_accuracy": 0.8403778076171875,
|
|
"num_tokens": 187976720.0,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 0.6002034587995931,
|
|
"grad_norm": 0.9705209136009216,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5009,
|
|
"mean_token_accuracy": 0.8434971570968628,
|
|
"num_tokens": 188141681.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.6007121057985758,
|
|
"grad_norm": 1.0719575881958008,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5254,
|
|
"mean_token_accuracy": 0.8383431434631348,
|
|
"num_tokens": 188306729.0,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 0.6012207527975585,
|
|
"grad_norm": 0.9925136566162109,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5582,
|
|
"mean_token_accuracy": 0.82879239320755,
|
|
"num_tokens": 188459816.0,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 0.6017293997965412,
|
|
"grad_norm": 1.0106931924819946,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5462,
|
|
"mean_token_accuracy": 0.82989501953125,
|
|
"num_tokens": 188624442.0,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 0.602238046795524,
|
|
"grad_norm": 0.9931594729423523,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5471,
|
|
"mean_token_accuracy": 0.8313771486282349,
|
|
"num_tokens": 188793110.0,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 0.6027466937945066,
|
|
"grad_norm": 0.9614710807800293,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5012,
|
|
"mean_token_accuracy": 0.8416813611984253,
|
|
"num_tokens": 188959411.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.6032553407934893,
|
|
"grad_norm": 1.049680471420288,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.524,
|
|
"mean_token_accuracy": 0.8369648456573486,
|
|
"num_tokens": 189123138.0,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 0.603763987792472,
|
|
"grad_norm": 1.0057904720306396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.52,
|
|
"mean_token_accuracy": 0.8387330770492554,
|
|
"num_tokens": 189288712.0,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 0.6042726347914548,
|
|
"grad_norm": 1.028718113899231,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5155,
|
|
"mean_token_accuracy": 0.839519739151001,
|
|
"num_tokens": 189448908.0,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 0.6047812817904374,
|
|
"grad_norm": 1.1342430114746094,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5422,
|
|
"mean_token_accuracy": 0.8313463926315308,
|
|
"num_tokens": 189601380.0,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 0.6052899287894201,
|
|
"grad_norm": 1.1675688028335571,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5418,
|
|
"mean_token_accuracy": 0.8315528631210327,
|
|
"num_tokens": 189760563.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.6057985757884028,
|
|
"grad_norm": 1.0995194911956787,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8341425657272339,
|
|
"num_tokens": 189919046.0,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 0.6063072227873856,
|
|
"grad_norm": 1.0976941585540771,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5386,
|
|
"mean_token_accuracy": 0.8345488905906677,
|
|
"num_tokens": 190067307.0,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 0.6068158697863683,
|
|
"grad_norm": 1.0513907670974731,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5177,
|
|
"mean_token_accuracy": 0.8396793603897095,
|
|
"num_tokens": 190216290.0,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 0.6073245167853509,
|
|
"grad_norm": 1.115285873413086,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.522,
|
|
"mean_token_accuracy": 0.8372184038162231,
|
|
"num_tokens": 190372698.0,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 0.6078331637843337,
|
|
"grad_norm": 1.105042815208435,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5214,
|
|
"mean_token_accuracy": 0.8376275300979614,
|
|
"num_tokens": 190528564.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.6083418107833164,
|
|
"grad_norm": 1.1105037927627563,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5507,
|
|
"mean_token_accuracy": 0.8296712040901184,
|
|
"num_tokens": 190679521.0,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 0.6088504577822991,
|
|
"grad_norm": 1.072302222251892,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5351,
|
|
"mean_token_accuracy": 0.8351011276245117,
|
|
"num_tokens": 190826754.0,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"epoch": 0.6093591047812817,
|
|
"grad_norm": 1.2463853359222412,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.837123692035675,
|
|
"num_tokens": 190990350.0,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"epoch": 0.6098677517802645,
|
|
"grad_norm": 1.14852774143219,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5544,
|
|
"mean_token_accuracy": 0.8295097351074219,
|
|
"num_tokens": 191150121.0,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"epoch": 0.6103763987792472,
|
|
"grad_norm": 1.1386868953704834,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5304,
|
|
"mean_token_accuracy": 0.8363741040229797,
|
|
"num_tokens": 191298356.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.6108850457782299,
|
|
"grad_norm": 1.0642979145050049,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5082,
|
|
"mean_token_accuracy": 0.8421168327331543,
|
|
"num_tokens": 191454560.0,
|
|
"step": 1201
|
|
},
|
|
{
|
|
"epoch": 0.6113936927772126,
|
|
"grad_norm": 1.0878254175186157,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4906,
|
|
"mean_token_accuracy": 0.8473243713378906,
|
|
"num_tokens": 191607061.0,
|
|
"step": 1202
|
|
},
|
|
{
|
|
"epoch": 0.6119023397761953,
|
|
"grad_norm": 1.0132228136062622,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.835957407951355,
|
|
"num_tokens": 191764683.0,
|
|
"step": 1203
|
|
},
|
|
{
|
|
"epoch": 0.612410986775178,
|
|
"grad_norm": 1.0476408004760742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5188,
|
|
"mean_token_accuracy": 0.8387986421585083,
|
|
"num_tokens": 191920982.0,
|
|
"step": 1204
|
|
},
|
|
{
|
|
"epoch": 0.6129196337741607,
|
|
"grad_norm": 1.0326831340789795,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.516,
|
|
"mean_token_accuracy": 0.8392593860626221,
|
|
"num_tokens": 192084464.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.6134282807731435,
|
|
"grad_norm": 1.0202337503433228,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5359,
|
|
"mean_token_accuracy": 0.8356767892837524,
|
|
"num_tokens": 192244544.0,
|
|
"step": 1206
|
|
},
|
|
{
|
|
"epoch": 0.6139369277721262,
|
|
"grad_norm": 1.1243996620178223,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5127,
|
|
"mean_token_accuracy": 0.8402092456817627,
|
|
"num_tokens": 192398895.0,
|
|
"step": 1207
|
|
},
|
|
{
|
|
"epoch": 0.6144455747711088,
|
|
"grad_norm": 1.0691757202148438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5592,
|
|
"mean_token_accuracy": 0.8307750225067139,
|
|
"num_tokens": 192568952.0,
|
|
"step": 1208
|
|
},
|
|
{
|
|
"epoch": 0.6149542217700915,
|
|
"grad_norm": 1.1174930334091187,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5477,
|
|
"mean_token_accuracy": 0.8309003114700317,
|
|
"num_tokens": 192726852.0,
|
|
"step": 1209
|
|
},
|
|
{
|
|
"epoch": 0.6154628687690743,
|
|
"grad_norm": 1.0564061403274536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4912,
|
|
"mean_token_accuracy": 0.8462415337562561,
|
|
"num_tokens": 192871340.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.615971515768057,
|
|
"grad_norm": 1.022898554801941,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5056,
|
|
"mean_token_accuracy": 0.8419616222381592,
|
|
"num_tokens": 193042111.0,
|
|
"step": 1211
|
|
},
|
|
{
|
|
"epoch": 0.6164801627670397,
|
|
"grad_norm": 1.1113489866256714,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.8345298767089844,
|
|
"num_tokens": 193194025.0,
|
|
"step": 1212
|
|
},
|
|
{
|
|
"epoch": 0.6169888097660223,
|
|
"grad_norm": 1.1346272230148315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8465094566345215,
|
|
"num_tokens": 193349718.0,
|
|
"step": 1213
|
|
},
|
|
{
|
|
"epoch": 0.6174974567650051,
|
|
"grad_norm": 1.0465701818466187,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.8361408710479736,
|
|
"num_tokens": 193505599.0,
|
|
"step": 1214
|
|
},
|
|
{
|
|
"epoch": 0.6180061037639878,
|
|
"grad_norm": 1.014984130859375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5698,
|
|
"mean_token_accuracy": 0.8271299600601196,
|
|
"num_tokens": 193681994.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.6185147507629705,
|
|
"grad_norm": 0.9723906517028809,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8309305906295776,
|
|
"num_tokens": 193846107.0,
|
|
"step": 1216
|
|
},
|
|
{
|
|
"epoch": 0.6190233977619533,
|
|
"grad_norm": 1.0247881412506104,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5244,
|
|
"mean_token_accuracy": 0.8384035229682922,
|
|
"num_tokens": 194001101.0,
|
|
"step": 1217
|
|
},
|
|
{
|
|
"epoch": 0.6195320447609359,
|
|
"grad_norm": 0.9313552379608154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4889,
|
|
"mean_token_accuracy": 0.846920907497406,
|
|
"num_tokens": 194162440.0,
|
|
"step": 1218
|
|
},
|
|
{
|
|
"epoch": 0.6200406917599186,
|
|
"grad_norm": 0.9980710744857788,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5287,
|
|
"mean_token_accuracy": 0.8353384137153625,
|
|
"num_tokens": 194316830.0,
|
|
"step": 1219
|
|
},
|
|
{
|
|
"epoch": 0.6205493387589013,
|
|
"grad_norm": 1.0040054321289062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5096,
|
|
"mean_token_accuracy": 0.8394368886947632,
|
|
"num_tokens": 194471827.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.6210579857578841,
|
|
"grad_norm": 1.1046159267425537,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8353234529495239,
|
|
"num_tokens": 194631877.0,
|
|
"step": 1221
|
|
},
|
|
{
|
|
"epoch": 0.6215666327568667,
|
|
"grad_norm": 1.0001643896102905,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.8372697234153748,
|
|
"num_tokens": 194798176.0,
|
|
"step": 1222
|
|
},
|
|
{
|
|
"epoch": 0.6220752797558494,
|
|
"grad_norm": 1.0163205862045288,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5319,
|
|
"mean_token_accuracy": 0.8357371687889099,
|
|
"num_tokens": 194954556.0,
|
|
"step": 1223
|
|
},
|
|
{
|
|
"epoch": 0.6225839267548321,
|
|
"grad_norm": 1.1208192110061646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5558,
|
|
"mean_token_accuracy": 0.8277769088745117,
|
|
"num_tokens": 195115010.0,
|
|
"step": 1224
|
|
},
|
|
{
|
|
"epoch": 0.6230925737538149,
|
|
"grad_norm": 0.9982933402061462,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5043,
|
|
"mean_token_accuracy": 0.844214677810669,
|
|
"num_tokens": 195277332.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.6236012207527976,
|
|
"grad_norm": 1.0954415798187256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5065,
|
|
"mean_token_accuracy": 0.8418806195259094,
|
|
"num_tokens": 195442166.0,
|
|
"step": 1226
|
|
},
|
|
{
|
|
"epoch": 0.6241098677517802,
|
|
"grad_norm": 0.956048846244812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5032,
|
|
"mean_token_accuracy": 0.8432636857032776,
|
|
"num_tokens": 195598203.0,
|
|
"step": 1227
|
|
},
|
|
{
|
|
"epoch": 0.624618514750763,
|
|
"grad_norm": 1.0685564279556274,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5075,
|
|
"mean_token_accuracy": 0.842692494392395,
|
|
"num_tokens": 195755945.0,
|
|
"step": 1228
|
|
},
|
|
{
|
|
"epoch": 0.6251271617497457,
|
|
"grad_norm": 0.9353527426719666,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.508,
|
|
"mean_token_accuracy": 0.8419412970542908,
|
|
"num_tokens": 195923721.0,
|
|
"step": 1229
|
|
},
|
|
{
|
|
"epoch": 0.6256358087487284,
|
|
"grad_norm": 1.064367651939392,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4982,
|
|
"mean_token_accuracy": 0.8454999327659607,
|
|
"num_tokens": 196076642.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.626144455747711,
|
|
"grad_norm": 1.070304274559021,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5258,
|
|
"mean_token_accuracy": 0.8377371430397034,
|
|
"num_tokens": 196232622.0,
|
|
"step": 1231
|
|
},
|
|
{
|
|
"epoch": 0.6266531027466938,
|
|
"grad_norm": 1.0615029335021973,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5459,
|
|
"mean_token_accuracy": 0.8300840258598328,
|
|
"num_tokens": 196384640.0,
|
|
"step": 1232
|
|
},
|
|
{
|
|
"epoch": 0.6271617497456765,
|
|
"grad_norm": 1.0311565399169922,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.562,
|
|
"mean_token_accuracy": 0.8277008533477783,
|
|
"num_tokens": 196546059.0,
|
|
"step": 1233
|
|
},
|
|
{
|
|
"epoch": 0.6276703967446592,
|
|
"grad_norm": 1.0338419675827026,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5517,
|
|
"mean_token_accuracy": 0.831037700176239,
|
|
"num_tokens": 196709416.0,
|
|
"step": 1234
|
|
},
|
|
{
|
|
"epoch": 0.6281790437436419,
|
|
"grad_norm": 0.9907321333885193,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8356142640113831,
|
|
"num_tokens": 196874595.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.6286876907426246,
|
|
"grad_norm": 1.000959873199463,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.8412952423095703,
|
|
"num_tokens": 197038688.0,
|
|
"step": 1236
|
|
},
|
|
{
|
|
"epoch": 0.6291963377416073,
|
|
"grad_norm": 1.143560767173767,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5659,
|
|
"mean_token_accuracy": 0.8249709606170654,
|
|
"num_tokens": 197192465.0,
|
|
"step": 1237
|
|
},
|
|
{
|
|
"epoch": 0.62970498474059,
|
|
"grad_norm": 1.0114355087280273,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5446,
|
|
"mean_token_accuracy": 0.8341284990310669,
|
|
"num_tokens": 197361082.0,
|
|
"step": 1238
|
|
},
|
|
{
|
|
"epoch": 0.6302136317395728,
|
|
"grad_norm": 1.0758358240127563,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4813,
|
|
"mean_token_accuracy": 0.8505460023880005,
|
|
"num_tokens": 197512368.0,
|
|
"step": 1239
|
|
},
|
|
{
|
|
"epoch": 0.6307222787385555,
|
|
"grad_norm": 1.033604621887207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.547,
|
|
"mean_token_accuracy": 0.8309687972068787,
|
|
"num_tokens": 197654672.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.6312309257375381,
|
|
"grad_norm": 1.1330300569534302,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5497,
|
|
"mean_token_accuracy": 0.830880880355835,
|
|
"num_tokens": 197816279.0,
|
|
"step": 1241
|
|
},
|
|
{
|
|
"epoch": 0.6317395727365208,
|
|
"grad_norm": 1.0106984376907349,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5049,
|
|
"mean_token_accuracy": 0.8433741331100464,
|
|
"num_tokens": 197972971.0,
|
|
"step": 1242
|
|
},
|
|
{
|
|
"epoch": 0.6322482197355036,
|
|
"grad_norm": 1.062400460243225,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8375513553619385,
|
|
"num_tokens": 198125745.0,
|
|
"step": 1243
|
|
},
|
|
{
|
|
"epoch": 0.6327568667344863,
|
|
"grad_norm": 0.9587567448616028,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.48,
|
|
"mean_token_accuracy": 0.8493208885192871,
|
|
"num_tokens": 198288270.0,
|
|
"step": 1244
|
|
},
|
|
{
|
|
"epoch": 0.633265513733469,
|
|
"grad_norm": 1.0087758302688599,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5336,
|
|
"mean_token_accuracy": 0.834213137626648,
|
|
"num_tokens": 198445798.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.6337741607324516,
|
|
"grad_norm": 1.089056134223938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5013,
|
|
"mean_token_accuracy": 0.8421186208724976,
|
|
"num_tokens": 198600564.0,
|
|
"step": 1246
|
|
},
|
|
{
|
|
"epoch": 0.6342828077314344,
|
|
"grad_norm": 0.9666113257408142,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4992,
|
|
"mean_token_accuracy": 0.8449921607971191,
|
|
"num_tokens": 198768145.0,
|
|
"step": 1247
|
|
},
|
|
{
|
|
"epoch": 0.6347914547304171,
|
|
"grad_norm": 1.0998759269714355,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5058,
|
|
"mean_token_accuracy": 0.8415104150772095,
|
|
"num_tokens": 198919816.0,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"epoch": 0.6353001017293998,
|
|
"grad_norm": 1.020984411239624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5534,
|
|
"mean_token_accuracy": 0.8289154767990112,
|
|
"num_tokens": 199084288.0,
|
|
"step": 1249
|
|
},
|
|
{
|
|
"epoch": 0.6358087487283826,
|
|
"grad_norm": 2.9039652347564697,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5429,
|
|
"mean_token_accuracy": 0.8318288922309875,
|
|
"num_tokens": 199234621.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.6363173957273652,
|
|
"grad_norm": 1.0918387174606323,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5136,
|
|
"mean_token_accuracy": 0.8401831388473511,
|
|
"num_tokens": 199415992.0,
|
|
"step": 1251
|
|
},
|
|
{
|
|
"epoch": 0.6368260427263479,
|
|
"grad_norm": 0.9810324311256409,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5334,
|
|
"mean_token_accuracy": 0.8348792791366577,
|
|
"num_tokens": 199581880.0,
|
|
"step": 1252
|
|
},
|
|
{
|
|
"epoch": 0.6373346897253306,
|
|
"grad_norm": 1.1713511943817139,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5106,
|
|
"mean_token_accuracy": 0.8398399949073792,
|
|
"num_tokens": 199728425.0,
|
|
"step": 1253
|
|
},
|
|
{
|
|
"epoch": 0.6378433367243134,
|
|
"grad_norm": 1.0506319999694824,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5682,
|
|
"mean_token_accuracy": 0.8250914216041565,
|
|
"num_tokens": 199887035.0,
|
|
"step": 1254
|
|
},
|
|
{
|
|
"epoch": 0.638351983723296,
|
|
"grad_norm": 1.083480715751648,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5414,
|
|
"mean_token_accuracy": 0.8333582878112793,
|
|
"num_tokens": 200042873.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.6388606307222787,
|
|
"grad_norm": 1.0766798257827759,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5109,
|
|
"mean_token_accuracy": 0.8416237831115723,
|
|
"num_tokens": 200218153.0,
|
|
"step": 1256
|
|
},
|
|
{
|
|
"epoch": 0.6393692777212614,
|
|
"grad_norm": 1.0037503242492676,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5348,
|
|
"mean_token_accuracy": 0.8360911011695862,
|
|
"num_tokens": 200375972.0,
|
|
"step": 1257
|
|
},
|
|
{
|
|
"epoch": 0.6398779247202442,
|
|
"grad_norm": 1.1143869161605835,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8323516845703125,
|
|
"num_tokens": 200538925.0,
|
|
"step": 1258
|
|
},
|
|
{
|
|
"epoch": 0.6403865717192269,
|
|
"grad_norm": 1.0148932933807373,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5143,
|
|
"mean_token_accuracy": 0.8395555019378662,
|
|
"num_tokens": 200702936.0,
|
|
"step": 1259
|
|
},
|
|
{
|
|
"epoch": 0.6408952187182095,
|
|
"grad_norm": 1.3639062643051147,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5129,
|
|
"mean_token_accuracy": 0.8397424221038818,
|
|
"num_tokens": 200852605.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.6414038657171923,
|
|
"grad_norm": 1.0762628316879272,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5116,
|
|
"mean_token_accuracy": 0.8400771617889404,
|
|
"num_tokens": 201011038.0,
|
|
"step": 1261
|
|
},
|
|
{
|
|
"epoch": 0.641912512716175,
|
|
"grad_norm": 1.1453301906585693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5425,
|
|
"mean_token_accuracy": 0.8312779068946838,
|
|
"num_tokens": 201171353.0,
|
|
"step": 1262
|
|
},
|
|
{
|
|
"epoch": 0.6424211597151577,
|
|
"grad_norm": 0.977043628692627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5493,
|
|
"mean_token_accuracy": 0.8298898935317993,
|
|
"num_tokens": 201335064.0,
|
|
"step": 1263
|
|
},
|
|
{
|
|
"epoch": 0.6429298067141404,
|
|
"grad_norm": 1.070381999015808,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5072,
|
|
"mean_token_accuracy": 0.8416640758514404,
|
|
"num_tokens": 201496911.0,
|
|
"step": 1264
|
|
},
|
|
{
|
|
"epoch": 0.6434384537131231,
|
|
"grad_norm": 1.0922592878341675,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8351348042488098,
|
|
"num_tokens": 201650881.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.6439471007121058,
|
|
"grad_norm": 1.032348394393921,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5552,
|
|
"mean_token_accuracy": 0.8285496830940247,
|
|
"num_tokens": 201822744.0,
|
|
"step": 1266
|
|
},
|
|
{
|
|
"epoch": 0.6444557477110885,
|
|
"grad_norm": 0.9909312129020691,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5119,
|
|
"mean_token_accuracy": 0.841721773147583,
|
|
"num_tokens": 201988138.0,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"epoch": 0.6449643947100712,
|
|
"grad_norm": 0.9247465133666992,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5111,
|
|
"mean_token_accuracy": 0.8412255644798279,
|
|
"num_tokens": 202163172.0,
|
|
"step": 1268
|
|
},
|
|
{
|
|
"epoch": 0.645473041709054,
|
|
"grad_norm": 1.0801644325256348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5172,
|
|
"mean_token_accuracy": 0.8394771814346313,
|
|
"num_tokens": 202330354.0,
|
|
"step": 1269
|
|
},
|
|
{
|
|
"epoch": 0.6459816887080366,
|
|
"grad_norm": 1.036254644393921,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8337914943695068,
|
|
"num_tokens": 202477704.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.6464903357070193,
|
|
"grad_norm": 1.0122811794281006,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5626,
|
|
"mean_token_accuracy": 0.8263964653015137,
|
|
"num_tokens": 202638535.0,
|
|
"step": 1271
|
|
},
|
|
{
|
|
"epoch": 0.646998982706002,
|
|
"grad_norm": 1.1269254684448242,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8313633799552917,
|
|
"num_tokens": 202799169.0,
|
|
"step": 1272
|
|
},
|
|
{
|
|
"epoch": 0.6475076297049848,
|
|
"grad_norm": 1.0821533203125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5237,
|
|
"mean_token_accuracy": 0.8368152379989624,
|
|
"num_tokens": 202948731.0,
|
|
"step": 1273
|
|
},
|
|
{
|
|
"epoch": 0.6480162767039674,
|
|
"grad_norm": 1.0910481214523315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5229,
|
|
"mean_token_accuracy": 0.8392539024353027,
|
|
"num_tokens": 203103019.0,
|
|
"step": 1274
|
|
},
|
|
{
|
|
"epoch": 0.6485249237029501,
|
|
"grad_norm": 1.1418254375457764,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5554,
|
|
"mean_token_accuracy": 0.8282456398010254,
|
|
"num_tokens": 203258243.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.6490335707019329,
|
|
"grad_norm": 1.0393235683441162,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5194,
|
|
"mean_token_accuracy": 0.8381021022796631,
|
|
"num_tokens": 203417686.0,
|
|
"step": 1276
|
|
},
|
|
{
|
|
"epoch": 0.6495422177009156,
|
|
"grad_norm": 1.012725830078125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5259,
|
|
"mean_token_accuracy": 0.8369470238685608,
|
|
"num_tokens": 203581925.0,
|
|
"step": 1277
|
|
},
|
|
{
|
|
"epoch": 0.6500508646998983,
|
|
"grad_norm": 1.1534172296524048,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5528,
|
|
"mean_token_accuracy": 0.8291853666305542,
|
|
"num_tokens": 203737547.0,
|
|
"step": 1278
|
|
},
|
|
{
|
|
"epoch": 0.6505595116988809,
|
|
"grad_norm": 1.0724189281463623,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5194,
|
|
"mean_token_accuracy": 0.8382695913314819,
|
|
"num_tokens": 203886501.0,
|
|
"step": 1279
|
|
},
|
|
{
|
|
"epoch": 0.6510681586978637,
|
|
"grad_norm": 1.2702937126159668,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8344364166259766,
|
|
"num_tokens": 204040525.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.6515768056968464,
|
|
"grad_norm": 1.134999394416809,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.839902400970459,
|
|
"num_tokens": 204205921.0,
|
|
"step": 1281
|
|
},
|
|
{
|
|
"epoch": 0.6520854526958291,
|
|
"grad_norm": 1.0884934663772583,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5071,
|
|
"mean_token_accuracy": 0.841905951499939,
|
|
"num_tokens": 204371623.0,
|
|
"step": 1282
|
|
},
|
|
{
|
|
"epoch": 0.6525940996948117,
|
|
"grad_norm": 1.0996270179748535,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5095,
|
|
"mean_token_accuracy": 0.8415507078170776,
|
|
"num_tokens": 204531555.0,
|
|
"step": 1283
|
|
},
|
|
{
|
|
"epoch": 0.6531027466937945,
|
|
"grad_norm": 1.0175594091415405,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5171,
|
|
"mean_token_accuracy": 0.8389270305633545,
|
|
"num_tokens": 204695860.0,
|
|
"step": 1284
|
|
},
|
|
{
|
|
"epoch": 0.6536113936927772,
|
|
"grad_norm": 1.1143662929534912,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5131,
|
|
"mean_token_accuracy": 0.8399682641029358,
|
|
"num_tokens": 204849971.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.6541200406917599,
|
|
"grad_norm": 1.0179085731506348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4899,
|
|
"mean_token_accuracy": 0.8472344279289246,
|
|
"num_tokens": 205022471.0,
|
|
"step": 1286
|
|
},
|
|
{
|
|
"epoch": 0.6546286876907427,
|
|
"grad_norm": 1.0679529905319214,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5025,
|
|
"mean_token_accuracy": 0.8438515067100525,
|
|
"num_tokens": 205191250.0,
|
|
"step": 1287
|
|
},
|
|
{
|
|
"epoch": 0.6551373346897253,
|
|
"grad_norm": 1.1572391986846924,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8309118747711182,
|
|
"num_tokens": 205338248.0,
|
|
"step": 1288
|
|
},
|
|
{
|
|
"epoch": 0.655645981688708,
|
|
"grad_norm": 0.9996314644813538,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5478,
|
|
"mean_token_accuracy": 0.8296094536781311,
|
|
"num_tokens": 205510777.0,
|
|
"step": 1289
|
|
},
|
|
{
|
|
"epoch": 0.6561546286876907,
|
|
"grad_norm": 1.0826754570007324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.538,
|
|
"mean_token_accuracy": 0.832534909248352,
|
|
"num_tokens": 205654689.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.6566632756866735,
|
|
"grad_norm": 1.0576002597808838,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5486,
|
|
"mean_token_accuracy": 0.8308680057525635,
|
|
"num_tokens": 205818547.0,
|
|
"step": 1291
|
|
},
|
|
{
|
|
"epoch": 0.6571719226856562,
|
|
"grad_norm": 1.0578711032867432,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8268966674804688,
|
|
"num_tokens": 205984399.0,
|
|
"step": 1292
|
|
},
|
|
{
|
|
"epoch": 0.6576805696846388,
|
|
"grad_norm": 1.0611701011657715,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5702,
|
|
"mean_token_accuracy": 0.8249958157539368,
|
|
"num_tokens": 206148673.0,
|
|
"step": 1293
|
|
},
|
|
{
|
|
"epoch": 0.6581892166836215,
|
|
"grad_norm": 1.1365247964859009,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5099,
|
|
"mean_token_accuracy": 0.8411107063293457,
|
|
"num_tokens": 206308245.0,
|
|
"step": 1294
|
|
},
|
|
{
|
|
"epoch": 0.6586978636826043,
|
|
"grad_norm": 0.9704191088676453,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4897,
|
|
"mean_token_accuracy": 0.8457728624343872,
|
|
"num_tokens": 206471247.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.659206510681587,
|
|
"grad_norm": 1.032420039176941,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5294,
|
|
"mean_token_accuracy": 0.8350492119789124,
|
|
"num_tokens": 206613022.0,
|
|
"step": 1296
|
|
},
|
|
{
|
|
"epoch": 0.6597151576805697,
|
|
"grad_norm": 1.0824891328811646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.534,
|
|
"mean_token_accuracy": 0.832705557346344,
|
|
"num_tokens": 206781121.0,
|
|
"step": 1297
|
|
},
|
|
{
|
|
"epoch": 0.6602238046795524,
|
|
"grad_norm": 0.9917576909065247,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5379,
|
|
"mean_token_accuracy": 0.832955002784729,
|
|
"num_tokens": 206935916.0,
|
|
"step": 1298
|
|
},
|
|
{
|
|
"epoch": 0.6607324516785351,
|
|
"grad_norm": 1.120984435081482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4862,
|
|
"mean_token_accuracy": 0.8467674851417542,
|
|
"num_tokens": 207089114.0,
|
|
"step": 1299
|
|
},
|
|
{
|
|
"epoch": 0.6612410986775178,
|
|
"grad_norm": 1.0345065593719482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5132,
|
|
"mean_token_accuracy": 0.8396544456481934,
|
|
"num_tokens": 207249857.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.6617497456765005,
|
|
"grad_norm": 1.0298868417739868,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8278207778930664,
|
|
"num_tokens": 207415519.0,
|
|
"step": 1301
|
|
},
|
|
{
|
|
"epoch": 0.6622583926754833,
|
|
"grad_norm": 1.0737025737762451,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5251,
|
|
"mean_token_accuracy": 0.8366029262542725,
|
|
"num_tokens": 207585946.0,
|
|
"step": 1302
|
|
},
|
|
{
|
|
"epoch": 0.6627670396744659,
|
|
"grad_norm": 0.944338858127594,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.844987154006958,
|
|
"num_tokens": 207742283.0,
|
|
"step": 1303
|
|
},
|
|
{
|
|
"epoch": 0.6632756866734486,
|
|
"grad_norm": 0.9760245084762573,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4936,
|
|
"mean_token_accuracy": 0.8455411791801453,
|
|
"num_tokens": 207905907.0,
|
|
"step": 1304
|
|
},
|
|
{
|
|
"epoch": 0.6637843336724313,
|
|
"grad_norm": 1.0034639835357666,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4984,
|
|
"mean_token_accuracy": 0.8437528610229492,
|
|
"num_tokens": 208066296.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.6642929806714141,
|
|
"grad_norm": 0.991218626499176,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5013,
|
|
"mean_token_accuracy": 0.8421469926834106,
|
|
"num_tokens": 208221915.0,
|
|
"step": 1306
|
|
},
|
|
{
|
|
"epoch": 0.6648016276703967,
|
|
"grad_norm": 1.1763343811035156,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4764,
|
|
"mean_token_accuracy": 0.8507419228553772,
|
|
"num_tokens": 208374510.0,
|
|
"step": 1307
|
|
},
|
|
{
|
|
"epoch": 0.6653102746693794,
|
|
"grad_norm": 1.0376806259155273,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5323,
|
|
"mean_token_accuracy": 0.8359139561653137,
|
|
"num_tokens": 208541953.0,
|
|
"step": 1308
|
|
},
|
|
{
|
|
"epoch": 0.6658189216683622,
|
|
"grad_norm": 1.2217199802398682,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8390167355537415,
|
|
"num_tokens": 208698721.0,
|
|
"step": 1309
|
|
},
|
|
{
|
|
"epoch": 0.6663275686673449,
|
|
"grad_norm": 1.1456576585769653,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5181,
|
|
"mean_token_accuracy": 0.8398691415786743,
|
|
"num_tokens": 208855549.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.6668362156663276,
|
|
"grad_norm": 1.0265754461288452,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5423,
|
|
"mean_token_accuracy": 0.8332537412643433,
|
|
"num_tokens": 209015229.0,
|
|
"step": 1311
|
|
},
|
|
{
|
|
"epoch": 0.6673448626653102,
|
|
"grad_norm": 1.1075199842453003,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5314,
|
|
"mean_token_accuracy": 0.8359677791595459,
|
|
"num_tokens": 209167815.0,
|
|
"step": 1312
|
|
},
|
|
{
|
|
"epoch": 0.667853509664293,
|
|
"grad_norm": 0.9988571405410767,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5403,
|
|
"mean_token_accuracy": 0.8331695795059204,
|
|
"num_tokens": 209339913.0,
|
|
"step": 1313
|
|
},
|
|
{
|
|
"epoch": 0.6683621566632757,
|
|
"grad_norm": 1.055841326713562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5642,
|
|
"mean_token_accuracy": 0.8272186517715454,
|
|
"num_tokens": 209506826.0,
|
|
"step": 1314
|
|
},
|
|
{
|
|
"epoch": 0.6688708036622584,
|
|
"grad_norm": 1.170231580734253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.840907096862793,
|
|
"num_tokens": 209658534.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.669379450661241,
|
|
"grad_norm": 1.0777288675308228,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.841313362121582,
|
|
"num_tokens": 209810131.0,
|
|
"step": 1316
|
|
},
|
|
{
|
|
"epoch": 0.6698880976602238,
|
|
"grad_norm": 1.0405516624450684,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4887,
|
|
"mean_token_accuracy": 0.8478529453277588,
|
|
"num_tokens": 209972848.0,
|
|
"step": 1317
|
|
},
|
|
{
|
|
"epoch": 0.6703967446592065,
|
|
"grad_norm": 1.056444764137268,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8281592130661011,
|
|
"num_tokens": 210123131.0,
|
|
"step": 1318
|
|
},
|
|
{
|
|
"epoch": 0.6709053916581892,
|
|
"grad_norm": 0.9567363858222961,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5293,
|
|
"mean_token_accuracy": 0.8372219800949097,
|
|
"num_tokens": 210281302.0,
|
|
"step": 1319
|
|
},
|
|
{
|
|
"epoch": 0.671414038657172,
|
|
"grad_norm": 1.0734000205993652,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.508,
|
|
"mean_token_accuracy": 0.8434922695159912,
|
|
"num_tokens": 210444300.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.6719226856561547,
|
|
"grad_norm": 0.9549962282180786,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.515,
|
|
"mean_token_accuracy": 0.839155912399292,
|
|
"num_tokens": 210610063.0,
|
|
"step": 1321
|
|
},
|
|
{
|
|
"epoch": 0.6724313326551373,
|
|
"grad_norm": 1.0906566381454468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5276,
|
|
"mean_token_accuracy": 0.8351768255233765,
|
|
"num_tokens": 210771386.0,
|
|
"step": 1322
|
|
},
|
|
{
|
|
"epoch": 0.67293997965412,
|
|
"grad_norm": 1.040381908416748,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.503,
|
|
"mean_token_accuracy": 0.8433479070663452,
|
|
"num_tokens": 210919989.0,
|
|
"step": 1323
|
|
},
|
|
{
|
|
"epoch": 0.6734486266531028,
|
|
"grad_norm": 1.1143410205841064,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8338621854782104,
|
|
"num_tokens": 211073799.0,
|
|
"step": 1324
|
|
},
|
|
{
|
|
"epoch": 0.6739572736520855,
|
|
"grad_norm": 0.9642016291618347,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5196,
|
|
"mean_token_accuracy": 0.8390560746192932,
|
|
"num_tokens": 211241360.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.6744659206510681,
|
|
"grad_norm": 1.1692264080047607,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.8322386741638184,
|
|
"num_tokens": 211407370.0,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"epoch": 0.6749745676500508,
|
|
"grad_norm": 0.965532660484314,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.527,
|
|
"mean_token_accuracy": 0.8356199860572815,
|
|
"num_tokens": 211581491.0,
|
|
"step": 1327
|
|
},
|
|
{
|
|
"epoch": 0.6754832146490336,
|
|
"grad_norm": 1.0441728830337524,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5079,
|
|
"mean_token_accuracy": 0.8422735929489136,
|
|
"num_tokens": 211739189.0,
|
|
"step": 1328
|
|
},
|
|
{
|
|
"epoch": 0.6759918616480163,
|
|
"grad_norm": 1.1415163278579712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8360558748245239,
|
|
"num_tokens": 211893052.0,
|
|
"step": 1329
|
|
},
|
|
{
|
|
"epoch": 0.676500508646999,
|
|
"grad_norm": 0.9364932179450989,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4948,
|
|
"mean_token_accuracy": 0.8465912342071533,
|
|
"num_tokens": 212047699.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.6770091556459817,
|
|
"grad_norm": 1.1136362552642822,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.8364666700363159,
|
|
"num_tokens": 212189491.0,
|
|
"step": 1331
|
|
},
|
|
{
|
|
"epoch": 0.6775178026449644,
|
|
"grad_norm": 1.1026740074157715,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5339,
|
|
"mean_token_accuracy": 0.8352006673812866,
|
|
"num_tokens": 212350768.0,
|
|
"step": 1332
|
|
},
|
|
{
|
|
"epoch": 0.6780264496439471,
|
|
"grad_norm": 1.0726979970932007,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5379,
|
|
"mean_token_accuracy": 0.8336712121963501,
|
|
"num_tokens": 212505424.0,
|
|
"step": 1333
|
|
},
|
|
{
|
|
"epoch": 0.6785350966429298,
|
|
"grad_norm": 1.030218243598938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5231,
|
|
"mean_token_accuracy": 0.8367509245872498,
|
|
"num_tokens": 212656414.0,
|
|
"step": 1334
|
|
},
|
|
{
|
|
"epoch": 0.6790437436419126,
|
|
"grad_norm": 1.0471112728118896,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8369899988174438,
|
|
"num_tokens": 212814380.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.6795523906408952,
|
|
"grad_norm": 1.234264850616455,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5171,
|
|
"mean_token_accuracy": 0.8384055495262146,
|
|
"num_tokens": 212970630.0,
|
|
"step": 1336
|
|
},
|
|
{
|
|
"epoch": 0.6800610376398779,
|
|
"grad_norm": 0.9486947655677795,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5237,
|
|
"mean_token_accuracy": 0.838019609451294,
|
|
"num_tokens": 213135584.0,
|
|
"step": 1337
|
|
},
|
|
{
|
|
"epoch": 0.6805696846388606,
|
|
"grad_norm": 1.022884488105774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.523,
|
|
"mean_token_accuracy": 0.836395263671875,
|
|
"num_tokens": 213296625.0,
|
|
"step": 1338
|
|
},
|
|
{
|
|
"epoch": 0.6810783316378434,
|
|
"grad_norm": 0.9599829316139221,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5183,
|
|
"mean_token_accuracy": 0.8398178815841675,
|
|
"num_tokens": 213466408.0,
|
|
"step": 1339
|
|
},
|
|
{
|
|
"epoch": 0.681586978636826,
|
|
"grad_norm": 1.0036944150924683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4879,
|
|
"mean_token_accuracy": 0.8481632471084595,
|
|
"num_tokens": 213608058.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.6820956256358087,
|
|
"grad_norm": 0.992483913898468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5253,
|
|
"mean_token_accuracy": 0.837264895439148,
|
|
"num_tokens": 213757840.0,
|
|
"step": 1341
|
|
},
|
|
{
|
|
"epoch": 0.6826042726347915,
|
|
"grad_norm": 0.9334516525268555,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5014,
|
|
"mean_token_accuracy": 0.8425880670547485,
|
|
"num_tokens": 213919287.0,
|
|
"step": 1342
|
|
},
|
|
{
|
|
"epoch": 0.6831129196337742,
|
|
"grad_norm": 0.9441404342651367,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.8386458158493042,
|
|
"num_tokens": 214089237.0,
|
|
"step": 1343
|
|
},
|
|
{
|
|
"epoch": 0.6836215666327569,
|
|
"grad_norm": 1.0068684816360474,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.512,
|
|
"mean_token_accuracy": 0.841089129447937,
|
|
"num_tokens": 214248836.0,
|
|
"step": 1344
|
|
},
|
|
{
|
|
"epoch": 0.6841302136317395,
|
|
"grad_norm": 0.9266252517700195,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.842008113861084,
|
|
"num_tokens": 214419340.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.6846388606307223,
|
|
"grad_norm": 0.9865654110908508,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5216,
|
|
"mean_token_accuracy": 0.8350708484649658,
|
|
"num_tokens": 214578559.0,
|
|
"step": 1346
|
|
},
|
|
{
|
|
"epoch": 0.685147507629705,
|
|
"grad_norm": 1.041604995727539,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5192,
|
|
"mean_token_accuracy": 0.8384052515029907,
|
|
"num_tokens": 214735210.0,
|
|
"step": 1347
|
|
},
|
|
{
|
|
"epoch": 0.6856561546286877,
|
|
"grad_norm": 0.9903523325920105,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.529,
|
|
"mean_token_accuracy": 0.8353708386421204,
|
|
"num_tokens": 214902909.0,
|
|
"step": 1348
|
|
},
|
|
{
|
|
"epoch": 0.6861648016276704,
|
|
"grad_norm": 0.9816705584526062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.522,
|
|
"mean_token_accuracy": 0.8378950953483582,
|
|
"num_tokens": 215064535.0,
|
|
"step": 1349
|
|
},
|
|
{
|
|
"epoch": 0.6866734486266531,
|
|
"grad_norm": 0.9787136316299438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8377700448036194,
|
|
"num_tokens": 215220787.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.6871820956256358,
|
|
"grad_norm": 1.077988862991333,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5432,
|
|
"mean_token_accuracy": 0.8315883278846741,
|
|
"num_tokens": 215375205.0,
|
|
"step": 1351
|
|
},
|
|
{
|
|
"epoch": 0.6876907426246185,
|
|
"grad_norm": 0.9856903553009033,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5344,
|
|
"mean_token_accuracy": 0.8377841711044312,
|
|
"num_tokens": 215532596.0,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"epoch": 0.6881993896236012,
|
|
"grad_norm": 0.9433656930923462,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5328,
|
|
"mean_token_accuracy": 0.836259126663208,
|
|
"num_tokens": 215686435.0,
|
|
"step": 1353
|
|
},
|
|
{
|
|
"epoch": 0.688708036622584,
|
|
"grad_norm": 1.0300414562225342,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5327,
|
|
"mean_token_accuracy": 0.833829402923584,
|
|
"num_tokens": 215854411.0,
|
|
"step": 1354
|
|
},
|
|
{
|
|
"epoch": 0.6892166836215666,
|
|
"grad_norm": 1.0212823152542114,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.498,
|
|
"mean_token_accuracy": 0.8442879319190979,
|
|
"num_tokens": 216016683.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.6897253306205493,
|
|
"grad_norm": 1.0083696842193604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5365,
|
|
"mean_token_accuracy": 0.8317540884017944,
|
|
"num_tokens": 216176025.0,
|
|
"step": 1356
|
|
},
|
|
{
|
|
"epoch": 0.6902339776195321,
|
|
"grad_norm": 1.0283464193344116,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5094,
|
|
"mean_token_accuracy": 0.8410217761993408,
|
|
"num_tokens": 216335904.0,
|
|
"step": 1357
|
|
},
|
|
{
|
|
"epoch": 0.6907426246185148,
|
|
"grad_norm": 1.0335670709609985,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5145,
|
|
"mean_token_accuracy": 0.8385999202728271,
|
|
"num_tokens": 216490560.0,
|
|
"step": 1358
|
|
},
|
|
{
|
|
"epoch": 0.6912512716174974,
|
|
"grad_norm": 1.0509625673294067,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5338,
|
|
"mean_token_accuracy": 0.8343715667724609,
|
|
"num_tokens": 216636941.0,
|
|
"step": 1359
|
|
},
|
|
{
|
|
"epoch": 0.6917599186164801,
|
|
"grad_norm": 0.9860029816627502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5173,
|
|
"mean_token_accuracy": 0.8389407992362976,
|
|
"num_tokens": 216788892.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.6922685656154629,
|
|
"grad_norm": 1.0808199644088745,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4999,
|
|
"mean_token_accuracy": 0.8442080020904541,
|
|
"num_tokens": 216933108.0,
|
|
"step": 1361
|
|
},
|
|
{
|
|
"epoch": 0.6927772126144456,
|
|
"grad_norm": 1.1449096202850342,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.517,
|
|
"mean_token_accuracy": 0.8379891514778137,
|
|
"num_tokens": 217091789.0,
|
|
"step": 1362
|
|
},
|
|
{
|
|
"epoch": 0.6932858596134283,
|
|
"grad_norm": 1.0072581768035889,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5181,
|
|
"mean_token_accuracy": 0.8373380899429321,
|
|
"num_tokens": 217237394.0,
|
|
"step": 1363
|
|
},
|
|
{
|
|
"epoch": 0.6937945066124109,
|
|
"grad_norm": 1.0011417865753174,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5236,
|
|
"mean_token_accuracy": 0.836451530456543,
|
|
"num_tokens": 217391649.0,
|
|
"step": 1364
|
|
},
|
|
{
|
|
"epoch": 0.6943031536113937,
|
|
"grad_norm": 0.9622162580490112,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4995,
|
|
"mean_token_accuracy": 0.8435579538345337,
|
|
"num_tokens": 217554324.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.6948118006103764,
|
|
"grad_norm": 1.0910418033599854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5147,
|
|
"mean_token_accuracy": 0.8388841152191162,
|
|
"num_tokens": 217705676.0,
|
|
"step": 1366
|
|
},
|
|
{
|
|
"epoch": 0.6953204476093591,
|
|
"grad_norm": 1.0049858093261719,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5367,
|
|
"mean_token_accuracy": 0.8346538543701172,
|
|
"num_tokens": 217870837.0,
|
|
"step": 1367
|
|
},
|
|
{
|
|
"epoch": 0.6958290946083419,
|
|
"grad_norm": 1.0026202201843262,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8319190144538879,
|
|
"num_tokens": 218034192.0,
|
|
"step": 1368
|
|
},
|
|
{
|
|
"epoch": 0.6963377416073245,
|
|
"grad_norm": 1.0037686824798584,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5156,
|
|
"mean_token_accuracy": 0.8391927480697632,
|
|
"num_tokens": 218201741.0,
|
|
"step": 1369
|
|
},
|
|
{
|
|
"epoch": 0.6968463886063072,
|
|
"grad_norm": 0.9872056841850281,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5308,
|
|
"mean_token_accuracy": 0.8365633487701416,
|
|
"num_tokens": 218361880.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.6973550356052899,
|
|
"grad_norm": 1.0084397792816162,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4928,
|
|
"mean_token_accuracy": 0.8447875380516052,
|
|
"num_tokens": 218507302.0,
|
|
"step": 1371
|
|
},
|
|
{
|
|
"epoch": 0.6978636826042727,
|
|
"grad_norm": 0.9623077511787415,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5196,
|
|
"mean_token_accuracy": 0.837771475315094,
|
|
"num_tokens": 218677170.0,
|
|
"step": 1372
|
|
},
|
|
{
|
|
"epoch": 0.6983723296032553,
|
|
"grad_norm": 1.1019070148468018,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5203,
|
|
"mean_token_accuracy": 0.8386264443397522,
|
|
"num_tokens": 218830583.0,
|
|
"step": 1373
|
|
},
|
|
{
|
|
"epoch": 0.698880976602238,
|
|
"grad_norm": 1.0796947479248047,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8380075693130493,
|
|
"num_tokens": 218990542.0,
|
|
"step": 1374
|
|
},
|
|
{
|
|
"epoch": 0.6993896236012207,
|
|
"grad_norm": 1.0787526369094849,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5301,
|
|
"mean_token_accuracy": 0.8361427783966064,
|
|
"num_tokens": 219150118.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.6998982706002035,
|
|
"grad_norm": 1.0091830492019653,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5049,
|
|
"mean_token_accuracy": 0.8422755002975464,
|
|
"num_tokens": 219320664.0,
|
|
"step": 1376
|
|
},
|
|
{
|
|
"epoch": 0.7004069175991862,
|
|
"grad_norm": 0.9992945790290833,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5343,
|
|
"mean_token_accuracy": 0.833981990814209,
|
|
"num_tokens": 219483168.0,
|
|
"step": 1377
|
|
},
|
|
{
|
|
"epoch": 0.7009155645981688,
|
|
"grad_norm": 0.9681557416915894,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5219,
|
|
"mean_token_accuracy": 0.8369916081428528,
|
|
"num_tokens": 219643460.0,
|
|
"step": 1378
|
|
},
|
|
{
|
|
"epoch": 0.7014242115971516,
|
|
"grad_norm": 0.9948337078094482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5161,
|
|
"mean_token_accuracy": 0.8396735787391663,
|
|
"num_tokens": 219801607.0,
|
|
"step": 1379
|
|
},
|
|
{
|
|
"epoch": 0.7019328585961343,
|
|
"grad_norm": 1.0998892784118652,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5306,
|
|
"mean_token_accuracy": 0.8362758159637451,
|
|
"num_tokens": 219953874.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.702441505595117,
|
|
"grad_norm": 1.0089136362075806,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4836,
|
|
"mean_token_accuracy": 0.8482525944709778,
|
|
"num_tokens": 220101993.0,
|
|
"step": 1381
|
|
},
|
|
{
|
|
"epoch": 0.7029501525940997,
|
|
"grad_norm": 0.9455198645591736,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5219,
|
|
"mean_token_accuracy": 0.8371497988700867,
|
|
"num_tokens": 220272230.0,
|
|
"step": 1382
|
|
},
|
|
{
|
|
"epoch": 0.7034587995930824,
|
|
"grad_norm": 1.017932415008545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8393813371658325,
|
|
"num_tokens": 220432579.0,
|
|
"step": 1383
|
|
},
|
|
{
|
|
"epoch": 0.7039674465920651,
|
|
"grad_norm": 1.018938422203064,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5252,
|
|
"mean_token_accuracy": 0.8366577625274658,
|
|
"num_tokens": 220588939.0,
|
|
"step": 1384
|
|
},
|
|
{
|
|
"epoch": 0.7044760935910478,
|
|
"grad_norm": 0.9065205454826355,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.497,
|
|
"mean_token_accuracy": 0.8459725379943848,
|
|
"num_tokens": 220759155.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.7049847405900305,
|
|
"grad_norm": 0.926688015460968,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5175,
|
|
"mean_token_accuracy": 0.8387424945831299,
|
|
"num_tokens": 220916843.0,
|
|
"step": 1386
|
|
},
|
|
{
|
|
"epoch": 0.7054933875890133,
|
|
"grad_norm": 0.9841321706771851,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5269,
|
|
"mean_token_accuracy": 0.8358370065689087,
|
|
"num_tokens": 221083858.0,
|
|
"step": 1387
|
|
},
|
|
{
|
|
"epoch": 0.7060020345879959,
|
|
"grad_norm": 1.0243306159973145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5678,
|
|
"mean_token_accuracy": 0.82574862241745,
|
|
"num_tokens": 221258649.0,
|
|
"step": 1388
|
|
},
|
|
{
|
|
"epoch": 0.7065106815869786,
|
|
"grad_norm": 0.9554473161697388,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5314,
|
|
"mean_token_accuracy": 0.8345929980278015,
|
|
"num_tokens": 221425144.0,
|
|
"step": 1389
|
|
},
|
|
{
|
|
"epoch": 0.7070193285859614,
|
|
"grad_norm": 0.9833611845970154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5295,
|
|
"mean_token_accuracy": 0.8353415131568909,
|
|
"num_tokens": 221583215.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.7075279755849441,
|
|
"grad_norm": 0.9443389177322388,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.535,
|
|
"mean_token_accuracy": 0.8340977430343628,
|
|
"num_tokens": 221755255.0,
|
|
"step": 1391
|
|
},
|
|
{
|
|
"epoch": 0.7080366225839267,
|
|
"grad_norm": 1.032128930091858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5473,
|
|
"mean_token_accuracy": 0.8305888175964355,
|
|
"num_tokens": 221923372.0,
|
|
"step": 1392
|
|
},
|
|
{
|
|
"epoch": 0.7085452695829094,
|
|
"grad_norm": 0.9580515027046204,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8368549346923828,
|
|
"num_tokens": 222080701.0,
|
|
"step": 1393
|
|
},
|
|
{
|
|
"epoch": 0.7090539165818922,
|
|
"grad_norm": 0.9367018938064575,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.486,
|
|
"mean_token_accuracy": 0.8447219729423523,
|
|
"num_tokens": 222233287.0,
|
|
"step": 1394
|
|
},
|
|
{
|
|
"epoch": 0.7095625635808749,
|
|
"grad_norm": 0.9627552628517151,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5408,
|
|
"mean_token_accuracy": 0.8316167593002319,
|
|
"num_tokens": 222399597.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.7100712105798576,
|
|
"grad_norm": 1.0496351718902588,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5269,
|
|
"mean_token_accuracy": 0.8371133804321289,
|
|
"num_tokens": 222549595.0,
|
|
"step": 1396
|
|
},
|
|
{
|
|
"epoch": 0.7105798575788402,
|
|
"grad_norm": 0.9316955208778381,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5483,
|
|
"mean_token_accuracy": 0.8318901062011719,
|
|
"num_tokens": 222711422.0,
|
|
"step": 1397
|
|
},
|
|
{
|
|
"epoch": 0.711088504577823,
|
|
"grad_norm": 0.9864873290061951,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5008,
|
|
"mean_token_accuracy": 0.8416286706924438,
|
|
"num_tokens": 222873104.0,
|
|
"step": 1398
|
|
},
|
|
{
|
|
"epoch": 0.7115971515768057,
|
|
"grad_norm": 1.0518662929534912,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5304,
|
|
"mean_token_accuracy": 0.8358045816421509,
|
|
"num_tokens": 223020625.0,
|
|
"step": 1399
|
|
},
|
|
{
|
|
"epoch": 0.7121057985757884,
|
|
"grad_norm": 1.0203341245651245,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5068,
|
|
"mean_token_accuracy": 0.8423791527748108,
|
|
"num_tokens": 223175189.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.7126144455747712,
|
|
"grad_norm": 1.0138752460479736,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.511,
|
|
"mean_token_accuracy": 0.8377603888511658,
|
|
"num_tokens": 223335073.0,
|
|
"step": 1401
|
|
},
|
|
{
|
|
"epoch": 0.7131230925737538,
|
|
"grad_norm": 0.9489789009094238,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5215,
|
|
"mean_token_accuracy": 0.8382784128189087,
|
|
"num_tokens": 223499553.0,
|
|
"step": 1402
|
|
},
|
|
{
|
|
"epoch": 0.7136317395727365,
|
|
"grad_norm": 1.0367980003356934,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5051,
|
|
"mean_token_accuracy": 0.8410635590553284,
|
|
"num_tokens": 223657911.0,
|
|
"step": 1403
|
|
},
|
|
{
|
|
"epoch": 0.7141403865717192,
|
|
"grad_norm": 1.0104140043258667,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5258,
|
|
"mean_token_accuracy": 0.8353630304336548,
|
|
"num_tokens": 223808765.0,
|
|
"step": 1404
|
|
},
|
|
{
|
|
"epoch": 0.714649033570702,
|
|
"grad_norm": 0.991611897945404,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8419798612594604,
|
|
"num_tokens": 223974533.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.7151576805696847,
|
|
"grad_norm": 1.1228381395339966,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.484,
|
|
"mean_token_accuracy": 0.847956120967865,
|
|
"num_tokens": 224133481.0,
|
|
"step": 1406
|
|
},
|
|
{
|
|
"epoch": 0.7156663275686673,
|
|
"grad_norm": 1.118977427482605,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5064,
|
|
"mean_token_accuracy": 0.8419226408004761,
|
|
"num_tokens": 224282640.0,
|
|
"step": 1407
|
|
},
|
|
{
|
|
"epoch": 0.71617497456765,
|
|
"grad_norm": 0.9979066252708435,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5444,
|
|
"mean_token_accuracy": 0.8306796550750732,
|
|
"num_tokens": 224447298.0,
|
|
"step": 1408
|
|
},
|
|
{
|
|
"epoch": 0.7166836215666328,
|
|
"grad_norm": 1.0414625406265259,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5029,
|
|
"mean_token_accuracy": 0.8423417210578918,
|
|
"num_tokens": 224596045.0,
|
|
"step": 1409
|
|
},
|
|
{
|
|
"epoch": 0.7171922685656155,
|
|
"grad_norm": 0.9783027172088623,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5285,
|
|
"mean_token_accuracy": 0.8376615047454834,
|
|
"num_tokens": 224757199.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.7177009155645981,
|
|
"grad_norm": 0.9670535922050476,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5061,
|
|
"mean_token_accuracy": 0.8409932851791382,
|
|
"num_tokens": 224913566.0,
|
|
"step": 1411
|
|
},
|
|
{
|
|
"epoch": 0.7182095625635809,
|
|
"grad_norm": 1.0231362581253052,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5214,
|
|
"mean_token_accuracy": 0.8397393226623535,
|
|
"num_tokens": 225078638.0,
|
|
"step": 1412
|
|
},
|
|
{
|
|
"epoch": 0.7187182095625636,
|
|
"grad_norm": 0.9188551902770996,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5248,
|
|
"mean_token_accuracy": 0.8372691869735718,
|
|
"num_tokens": 225242585.0,
|
|
"step": 1413
|
|
},
|
|
{
|
|
"epoch": 0.7192268565615463,
|
|
"grad_norm": 1.0983929634094238,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4956,
|
|
"mean_token_accuracy": 0.8437936902046204,
|
|
"num_tokens": 225402143.0,
|
|
"step": 1414
|
|
},
|
|
{
|
|
"epoch": 0.719735503560529,
|
|
"grad_norm": 1.0025497674942017,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5487,
|
|
"mean_token_accuracy": 0.832549512386322,
|
|
"num_tokens": 225561481.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.7202441505595117,
|
|
"grad_norm": 0.9898235201835632,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8437957167625427,
|
|
"num_tokens": 225720513.0,
|
|
"step": 1416
|
|
},
|
|
{
|
|
"epoch": 0.7207527975584944,
|
|
"grad_norm": 1.0856152772903442,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5512,
|
|
"mean_token_accuracy": 0.8304049968719482,
|
|
"num_tokens": 225873349.0,
|
|
"step": 1417
|
|
},
|
|
{
|
|
"epoch": 0.7212614445574771,
|
|
"grad_norm": 0.9928951263427734,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5003,
|
|
"mean_token_accuracy": 0.8427510261535645,
|
|
"num_tokens": 226039357.0,
|
|
"step": 1418
|
|
},
|
|
{
|
|
"epoch": 0.7217700915564598,
|
|
"grad_norm": 1.0025311708450317,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4958,
|
|
"mean_token_accuracy": 0.8438827991485596,
|
|
"num_tokens": 226189348.0,
|
|
"step": 1419
|
|
},
|
|
{
|
|
"epoch": 0.7222787385554426,
|
|
"grad_norm": 0.986140787601471,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5282,
|
|
"mean_token_accuracy": 0.8362927436828613,
|
|
"num_tokens": 226335920.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.7227873855544252,
|
|
"grad_norm": 1.1514464616775513,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5317,
|
|
"mean_token_accuracy": 0.8350828289985657,
|
|
"num_tokens": 226495005.0,
|
|
"step": 1421
|
|
},
|
|
{
|
|
"epoch": 0.7232960325534079,
|
|
"grad_norm": 1.0557314157485962,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.49,
|
|
"mean_token_accuracy": 0.8465843200683594,
|
|
"num_tokens": 226657995.0,
|
|
"step": 1422
|
|
},
|
|
{
|
|
"epoch": 0.7238046795523907,
|
|
"grad_norm": 1.168913722038269,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5245,
|
|
"mean_token_accuracy": 0.837516188621521,
|
|
"num_tokens": 226828202.0,
|
|
"step": 1423
|
|
},
|
|
{
|
|
"epoch": 0.7243133265513734,
|
|
"grad_norm": 1.1428192853927612,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8357243537902832,
|
|
"num_tokens": 226982248.0,
|
|
"step": 1424
|
|
},
|
|
{
|
|
"epoch": 0.724821973550356,
|
|
"grad_norm": 1.0416052341461182,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5246,
|
|
"mean_token_accuracy": 0.8367089629173279,
|
|
"num_tokens": 227129474.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.7253306205493387,
|
|
"grad_norm": 1.2324395179748535,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5015,
|
|
"mean_token_accuracy": 0.8420953154563904,
|
|
"num_tokens": 227274613.0,
|
|
"step": 1426
|
|
},
|
|
{
|
|
"epoch": 0.7258392675483215,
|
|
"grad_norm": 1.0069247484207153,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.539,
|
|
"mean_token_accuracy": 0.8342633247375488,
|
|
"num_tokens": 227440413.0,
|
|
"step": 1427
|
|
},
|
|
{
|
|
"epoch": 0.7263479145473042,
|
|
"grad_norm": 1.1502869129180908,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5142,
|
|
"mean_token_accuracy": 0.8386930227279663,
|
|
"num_tokens": 227605134.0,
|
|
"step": 1428
|
|
},
|
|
{
|
|
"epoch": 0.7268565615462869,
|
|
"grad_norm": 0.9862993955612183,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5063,
|
|
"mean_token_accuracy": 0.8415361046791077,
|
|
"num_tokens": 227761956.0,
|
|
"step": 1429
|
|
},
|
|
{
|
|
"epoch": 0.7273652085452695,
|
|
"grad_norm": 1.174629807472229,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5725,
|
|
"mean_token_accuracy": 0.8255341649055481,
|
|
"num_tokens": 227922627.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.7278738555442523,
|
|
"grad_norm": 1.102269172668457,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.519,
|
|
"mean_token_accuracy": 0.8369065523147583,
|
|
"num_tokens": 228080800.0,
|
|
"step": 1431
|
|
},
|
|
{
|
|
"epoch": 0.728382502543235,
|
|
"grad_norm": 1.0408049821853638,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.8411990404129028,
|
|
"num_tokens": 228242141.0,
|
|
"step": 1432
|
|
},
|
|
{
|
|
"epoch": 0.7288911495422177,
|
|
"grad_norm": 1.3122097253799438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5153,
|
|
"mean_token_accuracy": 0.8383074402809143,
|
|
"num_tokens": 228387653.0,
|
|
"step": 1433
|
|
},
|
|
{
|
|
"epoch": 0.7293997965412004,
|
|
"grad_norm": 0.9923901557922363,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4949,
|
|
"mean_token_accuracy": 0.8445407152175903,
|
|
"num_tokens": 228545473.0,
|
|
"step": 1434
|
|
},
|
|
{
|
|
"epoch": 0.7299084435401831,
|
|
"grad_norm": 1.1669878959655762,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5289,
|
|
"mean_token_accuracy": 0.8352810144424438,
|
|
"num_tokens": 228713308.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.7304170905391658,
|
|
"grad_norm": 1.0849850177764893,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5468,
|
|
"mean_token_accuracy": 0.8317233920097351,
|
|
"num_tokens": 228884216.0,
|
|
"step": 1436
|
|
},
|
|
{
|
|
"epoch": 0.7309257375381485,
|
|
"grad_norm": 1.0434106588363647,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5594,
|
|
"mean_token_accuracy": 0.8295158743858337,
|
|
"num_tokens": 229029827.0,
|
|
"step": 1437
|
|
},
|
|
{
|
|
"epoch": 0.7314343845371313,
|
|
"grad_norm": 1.1874513626098633,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.8329129219055176,
|
|
"num_tokens": 229181920.0,
|
|
"step": 1438
|
|
},
|
|
{
|
|
"epoch": 0.731943031536114,
|
|
"grad_norm": 0.96856290102005,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5377,
|
|
"mean_token_accuracy": 0.8328238725662231,
|
|
"num_tokens": 229349488.0,
|
|
"step": 1439
|
|
},
|
|
{
|
|
"epoch": 0.7324516785350966,
|
|
"grad_norm": 1.1322166919708252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5119,
|
|
"mean_token_accuracy": 0.8405275940895081,
|
|
"num_tokens": 229505884.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.7329603255340793,
|
|
"grad_norm": 1.402640700340271,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8364936113357544,
|
|
"num_tokens": 229673440.0,
|
|
"step": 1441
|
|
},
|
|
{
|
|
"epoch": 0.7334689725330621,
|
|
"grad_norm": 0.9987578988075256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5087,
|
|
"mean_token_accuracy": 0.8413553237915039,
|
|
"num_tokens": 229844345.0,
|
|
"step": 1442
|
|
},
|
|
{
|
|
"epoch": 0.7339776195320448,
|
|
"grad_norm": 1.277686357498169,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5684,
|
|
"mean_token_accuracy": 0.826140284538269,
|
|
"num_tokens": 230001344.0,
|
|
"step": 1443
|
|
},
|
|
{
|
|
"epoch": 0.7344862665310274,
|
|
"grad_norm": 1.149610161781311,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5012,
|
|
"mean_token_accuracy": 0.843376874923706,
|
|
"num_tokens": 230160760.0,
|
|
"step": 1444
|
|
},
|
|
{
|
|
"epoch": 0.7349949135300101,
|
|
"grad_norm": 1.1200135946273804,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5128,
|
|
"mean_token_accuracy": 0.8402576446533203,
|
|
"num_tokens": 230318252.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.7355035605289929,
|
|
"grad_norm": 1.131650447845459,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8372815847396851,
|
|
"num_tokens": 230480542.0,
|
|
"step": 1446
|
|
},
|
|
{
|
|
"epoch": 0.7360122075279756,
|
|
"grad_norm": 1.0408190488815308,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8357641100883484,
|
|
"num_tokens": 230631226.0,
|
|
"step": 1447
|
|
},
|
|
{
|
|
"epoch": 0.7365208545269583,
|
|
"grad_norm": 1.0704621076583862,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4936,
|
|
"mean_token_accuracy": 0.8441085815429688,
|
|
"num_tokens": 230793950.0,
|
|
"step": 1448
|
|
},
|
|
{
|
|
"epoch": 0.737029501525941,
|
|
"grad_norm": 1.017288088798523,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5674,
|
|
"mean_token_accuracy": 0.8265225887298584,
|
|
"num_tokens": 230949258.0,
|
|
"step": 1449
|
|
},
|
|
{
|
|
"epoch": 0.7375381485249237,
|
|
"grad_norm": 1.0223156213760376,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5082,
|
|
"mean_token_accuracy": 0.8423564434051514,
|
|
"num_tokens": 231116995.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.7380467955239064,
|
|
"grad_norm": 0.9167082905769348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4922,
|
|
"mean_token_accuracy": 0.8476444482803345,
|
|
"num_tokens": 231286840.0,
|
|
"step": 1451
|
|
},
|
|
{
|
|
"epoch": 0.7385554425228891,
|
|
"grad_norm": 0.993761420249939,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5135,
|
|
"mean_token_accuracy": 0.8388065099716187,
|
|
"num_tokens": 231446504.0,
|
|
"step": 1452
|
|
},
|
|
{
|
|
"epoch": 0.7390640895218719,
|
|
"grad_norm": 0.9417846202850342,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5438,
|
|
"mean_token_accuracy": 0.832386314868927,
|
|
"num_tokens": 231614914.0,
|
|
"step": 1453
|
|
},
|
|
{
|
|
"epoch": 0.7395727365208545,
|
|
"grad_norm": 0.9867107272148132,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5138,
|
|
"mean_token_accuracy": 0.8408212661743164,
|
|
"num_tokens": 231772874.0,
|
|
"step": 1454
|
|
},
|
|
{
|
|
"epoch": 0.7400813835198372,
|
|
"grad_norm": 0.9379030466079712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.8361393213272095,
|
|
"num_tokens": 231937051.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.7405900305188199,
|
|
"grad_norm": 0.9655819535255432,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5098,
|
|
"mean_token_accuracy": 0.8403584957122803,
|
|
"num_tokens": 232087611.0,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"epoch": 0.7410986775178027,
|
|
"grad_norm": 1.0079295635223389,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5616,
|
|
"mean_token_accuracy": 0.8275667428970337,
|
|
"num_tokens": 232246891.0,
|
|
"step": 1457
|
|
},
|
|
{
|
|
"epoch": 0.7416073245167853,
|
|
"grad_norm": 0.9451082944869995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5022,
|
|
"mean_token_accuracy": 0.8424191474914551,
|
|
"num_tokens": 232405181.0,
|
|
"step": 1458
|
|
},
|
|
{
|
|
"epoch": 0.742115971515768,
|
|
"grad_norm": 0.9628739953041077,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5641,
|
|
"mean_token_accuracy": 0.8226780891418457,
|
|
"num_tokens": 232569215.0,
|
|
"step": 1459
|
|
},
|
|
{
|
|
"epoch": 0.7426246185147508,
|
|
"grad_norm": 0.9538934826850891,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8450824618339539,
|
|
"num_tokens": 232730814.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.7431332655137335,
|
|
"grad_norm": 0.9364286065101624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5198,
|
|
"mean_token_accuracy": 0.8396707773208618,
|
|
"num_tokens": 232892427.0,
|
|
"step": 1461
|
|
},
|
|
{
|
|
"epoch": 0.7436419125127162,
|
|
"grad_norm": 0.971477210521698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5105,
|
|
"mean_token_accuracy": 0.8389374613761902,
|
|
"num_tokens": 233062413.0,
|
|
"step": 1462
|
|
},
|
|
{
|
|
"epoch": 0.7441505595116988,
|
|
"grad_norm": 1.0381510257720947,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.499,
|
|
"mean_token_accuracy": 0.8426993489265442,
|
|
"num_tokens": 233223449.0,
|
|
"step": 1463
|
|
},
|
|
{
|
|
"epoch": 0.7446592065106816,
|
|
"grad_norm": 0.9925874471664429,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5116,
|
|
"mean_token_accuracy": 0.8392652273178101,
|
|
"num_tokens": 233382314.0,
|
|
"step": 1464
|
|
},
|
|
{
|
|
"epoch": 0.7451678535096643,
|
|
"grad_norm": 0.9298463463783264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5043,
|
|
"mean_token_accuracy": 0.8449363708496094,
|
|
"num_tokens": 233549471.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.745676500508647,
|
|
"grad_norm": 0.9784131050109863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5269,
|
|
"mean_token_accuracy": 0.835938572883606,
|
|
"num_tokens": 233714244.0,
|
|
"step": 1466
|
|
},
|
|
{
|
|
"epoch": 0.7461851475076297,
|
|
"grad_norm": 1.0752923488616943,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5157,
|
|
"mean_token_accuracy": 0.8408986330032349,
|
|
"num_tokens": 233873031.0,
|
|
"step": 1467
|
|
},
|
|
{
|
|
"epoch": 0.7466937945066124,
|
|
"grad_norm": 0.9333047270774841,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4983,
|
|
"mean_token_accuracy": 0.8432742953300476,
|
|
"num_tokens": 234028924.0,
|
|
"step": 1468
|
|
},
|
|
{
|
|
"epoch": 0.7472024415055951,
|
|
"grad_norm": 1.0717540979385376,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5069,
|
|
"mean_token_accuracy": 0.8415995240211487,
|
|
"num_tokens": 234187316.0,
|
|
"step": 1469
|
|
},
|
|
{
|
|
"epoch": 0.7477110885045778,
|
|
"grad_norm": 1.023341178894043,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.498,
|
|
"mean_token_accuracy": 0.8454621434211731,
|
|
"num_tokens": 234352052.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.7482197355035606,
|
|
"grad_norm": 1.088341236114502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.499,
|
|
"mean_token_accuracy": 0.8432536125183105,
|
|
"num_tokens": 234522725.0,
|
|
"step": 1471
|
|
},
|
|
{
|
|
"epoch": 0.7487283825025433,
|
|
"grad_norm": 1.048627257347107,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5375,
|
|
"mean_token_accuracy": 0.8310221433639526,
|
|
"num_tokens": 234685164.0,
|
|
"step": 1472
|
|
},
|
|
{
|
|
"epoch": 0.7492370295015259,
|
|
"grad_norm": 0.977834165096283,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5033,
|
|
"mean_token_accuracy": 0.8420848846435547,
|
|
"num_tokens": 234844848.0,
|
|
"step": 1473
|
|
},
|
|
{
|
|
"epoch": 0.7497456765005086,
|
|
"grad_norm": 1.0553686618804932,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4901,
|
|
"mean_token_accuracy": 0.8447784185409546,
|
|
"num_tokens": 234999779.0,
|
|
"step": 1474
|
|
},
|
|
{
|
|
"epoch": 0.7502543234994914,
|
|
"grad_norm": 1.0582385063171387,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.584,
|
|
"mean_token_accuracy": 0.8204246163368225,
|
|
"num_tokens": 235162263.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.7507629704984741,
|
|
"grad_norm": 0.9934267997741699,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5622,
|
|
"mean_token_accuracy": 0.8252400755882263,
|
|
"num_tokens": 235319603.0,
|
|
"step": 1476
|
|
},
|
|
{
|
|
"epoch": 0.7512716174974567,
|
|
"grad_norm": 1.0062447786331177,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5026,
|
|
"mean_token_accuracy": 0.8412712812423706,
|
|
"num_tokens": 235479108.0,
|
|
"step": 1477
|
|
},
|
|
{
|
|
"epoch": 0.7517802644964394,
|
|
"grad_norm": 1.0643982887268066,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.521,
|
|
"mean_token_accuracy": 0.8388631343841553,
|
|
"num_tokens": 235635662.0,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"epoch": 0.7522889114954222,
|
|
"grad_norm": 1.0867574214935303,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5331,
|
|
"mean_token_accuracy": 0.8342521786689758,
|
|
"num_tokens": 235778848.0,
|
|
"step": 1479
|
|
},
|
|
{
|
|
"epoch": 0.7527975584944049,
|
|
"grad_norm": 1.0071399211883545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4977,
|
|
"mean_token_accuracy": 0.8450353145599365,
|
|
"num_tokens": 235935563.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.7533062054933876,
|
|
"grad_norm": 0.9993019104003906,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5225,
|
|
"mean_token_accuracy": 0.8392289280891418,
|
|
"num_tokens": 236102712.0,
|
|
"step": 1481
|
|
},
|
|
{
|
|
"epoch": 0.7538148524923703,
|
|
"grad_norm": 1.0551786422729492,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8395529985427856,
|
|
"num_tokens": 236247865.0,
|
|
"step": 1482
|
|
},
|
|
{
|
|
"epoch": 0.754323499491353,
|
|
"grad_norm": 1.036880612373352,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5198,
|
|
"mean_token_accuracy": 0.838604211807251,
|
|
"num_tokens": 236406459.0,
|
|
"step": 1483
|
|
},
|
|
{
|
|
"epoch": 0.7548321464903357,
|
|
"grad_norm": 0.9874710440635681,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4894,
|
|
"mean_token_accuracy": 0.8457585573196411,
|
|
"num_tokens": 236548474.0,
|
|
"step": 1484
|
|
},
|
|
{
|
|
"epoch": 0.7553407934893184,
|
|
"grad_norm": 1.0621289014816284,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8316954374313354,
|
|
"num_tokens": 236699987.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.7558494404883012,
|
|
"grad_norm": 1.032537579536438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4908,
|
|
"mean_token_accuracy": 0.8453277349472046,
|
|
"num_tokens": 236859308.0,
|
|
"step": 1486
|
|
},
|
|
{
|
|
"epoch": 0.7563580874872838,
|
|
"grad_norm": 0.9991673231124878,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4927,
|
|
"mean_token_accuracy": 0.8444706201553345,
|
|
"num_tokens": 237011851.0,
|
|
"step": 1487
|
|
},
|
|
{
|
|
"epoch": 0.7568667344862665,
|
|
"grad_norm": 1.0048476457595825,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5205,
|
|
"mean_token_accuracy": 0.837803840637207,
|
|
"num_tokens": 237170473.0,
|
|
"step": 1488
|
|
},
|
|
{
|
|
"epoch": 0.7573753814852492,
|
|
"grad_norm": 1.1486921310424805,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4974,
|
|
"mean_token_accuracy": 0.8430988788604736,
|
|
"num_tokens": 237317583.0,
|
|
"step": 1489
|
|
},
|
|
{
|
|
"epoch": 0.757884028484232,
|
|
"grad_norm": 1.0181177854537964,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4949,
|
|
"mean_token_accuracy": 0.8443738222122192,
|
|
"num_tokens": 237483382.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.7583926754832147,
|
|
"grad_norm": 0.9788311123847961,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.508,
|
|
"mean_token_accuracy": 0.8405285477638245,
|
|
"num_tokens": 237642857.0,
|
|
"step": 1491
|
|
},
|
|
{
|
|
"epoch": 0.7589013224821973,
|
|
"grad_norm": 0.9558936953544617,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5103,
|
|
"mean_token_accuracy": 0.8410489559173584,
|
|
"num_tokens": 237809071.0,
|
|
"step": 1492
|
|
},
|
|
{
|
|
"epoch": 0.7594099694811801,
|
|
"grad_norm": 0.996228039264679,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.533,
|
|
"mean_token_accuracy": 0.835081934928894,
|
|
"num_tokens": 237977918.0,
|
|
"step": 1493
|
|
},
|
|
{
|
|
"epoch": 0.7599186164801628,
|
|
"grad_norm": 0.986886203289032,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5303,
|
|
"mean_token_accuracy": 0.8364228010177612,
|
|
"num_tokens": 238146507.0,
|
|
"step": 1494
|
|
},
|
|
{
|
|
"epoch": 0.7604272634791455,
|
|
"grad_norm": 0.9645000100135803,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5002,
|
|
"mean_token_accuracy": 0.8437860012054443,
|
|
"num_tokens": 238317537.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.7609359104781281,
|
|
"grad_norm": 1.049106478691101,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.501,
|
|
"mean_token_accuracy": 0.8432005643844604,
|
|
"num_tokens": 238483591.0,
|
|
"step": 1496
|
|
},
|
|
{
|
|
"epoch": 0.7614445574771109,
|
|
"grad_norm": 0.9359789490699768,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4906,
|
|
"mean_token_accuracy": 0.8473359942436218,
|
|
"num_tokens": 238641826.0,
|
|
"step": 1497
|
|
},
|
|
{
|
|
"epoch": 0.7619532044760936,
|
|
"grad_norm": 1.026975393295288,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5106,
|
|
"mean_token_accuracy": 0.8406385183334351,
|
|
"num_tokens": 238793523.0,
|
|
"step": 1498
|
|
},
|
|
{
|
|
"epoch": 0.7624618514750763,
|
|
"grad_norm": 1.0067918300628662,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5171,
|
|
"mean_token_accuracy": 0.8395569920539856,
|
|
"num_tokens": 238956488.0,
|
|
"step": 1499
|
|
},
|
|
{
|
|
"epoch": 0.762970498474059,
|
|
"grad_norm": 0.9238914847373962,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4893,
|
|
"mean_token_accuracy": 0.8466312885284424,
|
|
"num_tokens": 239112968.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.7634791454730417,
|
|
"grad_norm": 1.0350664854049683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.545,
|
|
"mean_token_accuracy": 0.8327237367630005,
|
|
"num_tokens": 239278918.0,
|
|
"step": 1501
|
|
},
|
|
{
|
|
"epoch": 0.7639877924720244,
|
|
"grad_norm": 1.1178715229034424,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4998,
|
|
"mean_token_accuracy": 0.8421357274055481,
|
|
"num_tokens": 239429305.0,
|
|
"step": 1502
|
|
},
|
|
{
|
|
"epoch": 0.7644964394710071,
|
|
"grad_norm": 0.9649775624275208,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4849,
|
|
"mean_token_accuracy": 0.8484745025634766,
|
|
"num_tokens": 239592603.0,
|
|
"step": 1503
|
|
},
|
|
{
|
|
"epoch": 0.7650050864699899,
|
|
"grad_norm": 1.130552887916565,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5433,
|
|
"mean_token_accuracy": 0.8310620784759521,
|
|
"num_tokens": 239745054.0,
|
|
"step": 1504
|
|
},
|
|
{
|
|
"epoch": 0.7655137334689726,
|
|
"grad_norm": 0.9407525658607483,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5056,
|
|
"mean_token_accuracy": 0.8418401479721069,
|
|
"num_tokens": 239909253.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.7660223804679552,
|
|
"grad_norm": 1.05576753616333,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5058,
|
|
"mean_token_accuracy": 0.8412678241729736,
|
|
"num_tokens": 240051112.0,
|
|
"step": 1506
|
|
},
|
|
{
|
|
"epoch": 0.7665310274669379,
|
|
"grad_norm": 1.0370968580245972,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4773,
|
|
"mean_token_accuracy": 0.8489423990249634,
|
|
"num_tokens": 240211700.0,
|
|
"step": 1507
|
|
},
|
|
{
|
|
"epoch": 0.7670396744659207,
|
|
"grad_norm": 0.9873724579811096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.515,
|
|
"mean_token_accuracy": 0.8384915590286255,
|
|
"num_tokens": 240373170.0,
|
|
"step": 1508
|
|
},
|
|
{
|
|
"epoch": 0.7675483214649034,
|
|
"grad_norm": 1.1496977806091309,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5341,
|
|
"mean_token_accuracy": 0.8343428373336792,
|
|
"num_tokens": 240518936.0,
|
|
"step": 1509
|
|
},
|
|
{
|
|
"epoch": 0.768056968463886,
|
|
"grad_norm": 0.9918933510780334,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5145,
|
|
"mean_token_accuracy": 0.8385894894599915,
|
|
"num_tokens": 240668912.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.7685656154628687,
|
|
"grad_norm": 1.0380821228027344,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5004,
|
|
"mean_token_accuracy": 0.8435571193695068,
|
|
"num_tokens": 240828027.0,
|
|
"step": 1511
|
|
},
|
|
{
|
|
"epoch": 0.7690742624618515,
|
|
"grad_norm": 0.9556916952133179,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5036,
|
|
"mean_token_accuracy": 0.8424919843673706,
|
|
"num_tokens": 240996115.0,
|
|
"step": 1512
|
|
},
|
|
{
|
|
"epoch": 0.7695829094608342,
|
|
"grad_norm": 1.1142911911010742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5031,
|
|
"mean_token_accuracy": 0.8429108262062073,
|
|
"num_tokens": 241154020.0,
|
|
"step": 1513
|
|
},
|
|
{
|
|
"epoch": 0.7700915564598169,
|
|
"grad_norm": 0.992397665977478,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4952,
|
|
"mean_token_accuracy": 0.8445471525192261,
|
|
"num_tokens": 241313175.0,
|
|
"step": 1514
|
|
},
|
|
{
|
|
"epoch": 0.7706002034587996,
|
|
"grad_norm": 1.2248951196670532,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.838437557220459,
|
|
"num_tokens": 241461371.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.7711088504577823,
|
|
"grad_norm": 1.07581627368927,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8337858319282532,
|
|
"num_tokens": 241631176.0,
|
|
"step": 1516
|
|
},
|
|
{
|
|
"epoch": 0.771617497456765,
|
|
"grad_norm": 0.9590333104133606,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.831385612487793,
|
|
"num_tokens": 241791904.0,
|
|
"step": 1517
|
|
},
|
|
{
|
|
"epoch": 0.7721261444557477,
|
|
"grad_norm": 1.0338011980056763,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5032,
|
|
"mean_token_accuracy": 0.8443799018859863,
|
|
"num_tokens": 241954029.0,
|
|
"step": 1518
|
|
},
|
|
{
|
|
"epoch": 0.7726347914547305,
|
|
"grad_norm": 0.9844240546226501,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5657,
|
|
"mean_token_accuracy": 0.8268498778343201,
|
|
"num_tokens": 242106558.0,
|
|
"step": 1519
|
|
},
|
|
{
|
|
"epoch": 0.7731434384537131,
|
|
"grad_norm": 1.0088313817977905,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5292,
|
|
"mean_token_accuracy": 0.8359307050704956,
|
|
"num_tokens": 242276624.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.7736520854526958,
|
|
"grad_norm": 0.989596426486969,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5221,
|
|
"mean_token_accuracy": 0.8372203707695007,
|
|
"num_tokens": 242434754.0,
|
|
"step": 1521
|
|
},
|
|
{
|
|
"epoch": 0.7741607324516785,
|
|
"grad_norm": 1.020996332168579,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4928,
|
|
"mean_token_accuracy": 0.8438454866409302,
|
|
"num_tokens": 242576931.0,
|
|
"step": 1522
|
|
},
|
|
{
|
|
"epoch": 0.7746693794506613,
|
|
"grad_norm": 1.0195339918136597,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.8392676711082458,
|
|
"num_tokens": 242733793.0,
|
|
"step": 1523
|
|
},
|
|
{
|
|
"epoch": 0.775178026449644,
|
|
"grad_norm": 1.117997169494629,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5112,
|
|
"mean_token_accuracy": 0.8406413197517395,
|
|
"num_tokens": 242882978.0,
|
|
"step": 1524
|
|
},
|
|
{
|
|
"epoch": 0.7756866734486266,
|
|
"grad_norm": 0.981311559677124,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5115,
|
|
"mean_token_accuracy": 0.8393944501876831,
|
|
"num_tokens": 243038462.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.7761953204476093,
|
|
"grad_norm": 1.094862937927246,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8363096714019775,
|
|
"num_tokens": 243206785.0,
|
|
"step": 1526
|
|
},
|
|
{
|
|
"epoch": 0.7767039674465921,
|
|
"grad_norm": 1.0014318227767944,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5096,
|
|
"mean_token_accuracy": 0.8408504128456116,
|
|
"num_tokens": 243357645.0,
|
|
"step": 1527
|
|
},
|
|
{
|
|
"epoch": 0.7772126144455748,
|
|
"grad_norm": 1.0128889083862305,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8421143293380737,
|
|
"num_tokens": 243512602.0,
|
|
"step": 1528
|
|
},
|
|
{
|
|
"epoch": 0.7777212614445574,
|
|
"grad_norm": 1.0889755487442017,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.505,
|
|
"mean_token_accuracy": 0.8413415551185608,
|
|
"num_tokens": 243687260.0,
|
|
"step": 1529
|
|
},
|
|
{
|
|
"epoch": 0.7782299084435402,
|
|
"grad_norm": 1.0065422058105469,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5121,
|
|
"mean_token_accuracy": 0.8403002619743347,
|
|
"num_tokens": 243845125.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.7787385554425229,
|
|
"grad_norm": 1.048030138015747,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4737,
|
|
"mean_token_accuracy": 0.8500345945358276,
|
|
"num_tokens": 243985343.0,
|
|
"step": 1531
|
|
},
|
|
{
|
|
"epoch": 0.7792472024415056,
|
|
"grad_norm": 1.0371286869049072,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5294,
|
|
"mean_token_accuracy": 0.8364617228507996,
|
|
"num_tokens": 244139431.0,
|
|
"step": 1532
|
|
},
|
|
{
|
|
"epoch": 0.7797558494404883,
|
|
"grad_norm": 1.0632935762405396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5255,
|
|
"mean_token_accuracy": 0.8366844058036804,
|
|
"num_tokens": 244296684.0,
|
|
"step": 1533
|
|
},
|
|
{
|
|
"epoch": 0.780264496439471,
|
|
"grad_norm": 1.0461639165878296,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5206,
|
|
"mean_token_accuracy": 0.8383218050003052,
|
|
"num_tokens": 244456341.0,
|
|
"step": 1534
|
|
},
|
|
{
|
|
"epoch": 0.7807731434384537,
|
|
"grad_norm": 1.0430577993392944,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4919,
|
|
"mean_token_accuracy": 0.8446320295333862,
|
|
"num_tokens": 244612234.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.7812817904374364,
|
|
"grad_norm": 1.0549424886703491,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5374,
|
|
"mean_token_accuracy": 0.8326089382171631,
|
|
"num_tokens": 244763392.0,
|
|
"step": 1536
|
|
},
|
|
{
|
|
"epoch": 0.7817904374364191,
|
|
"grad_norm": 1.0101888179779053,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8328830003738403,
|
|
"num_tokens": 244921544.0,
|
|
"step": 1537
|
|
},
|
|
{
|
|
"epoch": 0.7822990844354019,
|
|
"grad_norm": 1.0776944160461426,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5038,
|
|
"mean_token_accuracy": 0.8413270711898804,
|
|
"num_tokens": 245082597.0,
|
|
"step": 1538
|
|
},
|
|
{
|
|
"epoch": 0.7828077314343845,
|
|
"grad_norm": 1.0002422332763672,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.8351992964744568,
|
|
"num_tokens": 245229522.0,
|
|
"step": 1539
|
|
},
|
|
{
|
|
"epoch": 0.7833163784333672,
|
|
"grad_norm": 1.0443092584609985,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5214,
|
|
"mean_token_accuracy": 0.8371322154998779,
|
|
"num_tokens": 245379953.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.78382502543235,
|
|
"grad_norm": 0.9857534170150757,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.506,
|
|
"mean_token_accuracy": 0.8414639830589294,
|
|
"num_tokens": 245531845.0,
|
|
"step": 1541
|
|
},
|
|
{
|
|
"epoch": 0.7843336724313327,
|
|
"grad_norm": 0.9642319083213806,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4966,
|
|
"mean_token_accuracy": 0.8451875448226929,
|
|
"num_tokens": 245696677.0,
|
|
"step": 1542
|
|
},
|
|
{
|
|
"epoch": 0.7848423194303153,
|
|
"grad_norm": 0.9377528429031372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5102,
|
|
"mean_token_accuracy": 0.8393126726150513,
|
|
"num_tokens": 245866942.0,
|
|
"step": 1543
|
|
},
|
|
{
|
|
"epoch": 0.785350966429298,
|
|
"grad_norm": 0.992759644985199,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.834294319152832,
|
|
"num_tokens": 246030786.0,
|
|
"step": 1544
|
|
},
|
|
{
|
|
"epoch": 0.7858596134282808,
|
|
"grad_norm": 1.0652512311935425,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5154,
|
|
"mean_token_accuracy": 0.8386496305465698,
|
|
"num_tokens": 246181272.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.7863682604272635,
|
|
"grad_norm": 1.1718790531158447,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4754,
|
|
"mean_token_accuracy": 0.848141074180603,
|
|
"num_tokens": 246332311.0,
|
|
"step": 1546
|
|
},
|
|
{
|
|
"epoch": 0.7868769074262462,
|
|
"grad_norm": 1.164916753768921,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5441,
|
|
"mean_token_accuracy": 0.8322082757949829,
|
|
"num_tokens": 246491365.0,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"epoch": 0.7873855544252288,
|
|
"grad_norm": 1.1251907348632812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4793,
|
|
"mean_token_accuracy": 0.8482698202133179,
|
|
"num_tokens": 246646398.0,
|
|
"step": 1548
|
|
},
|
|
{
|
|
"epoch": 0.7878942014242116,
|
|
"grad_norm": 1.0759825706481934,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5078,
|
|
"mean_token_accuracy": 0.8396289348602295,
|
|
"num_tokens": 246806249.0,
|
|
"step": 1549
|
|
},
|
|
{
|
|
"epoch": 0.7884028484231943,
|
|
"grad_norm": 1.0879809856414795,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4741,
|
|
"mean_token_accuracy": 0.848809003829956,
|
|
"num_tokens": 246957754.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.788911495422177,
|
|
"grad_norm": 0.964314341545105,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4633,
|
|
"mean_token_accuracy": 0.8527604341506958,
|
|
"num_tokens": 247122388.0,
|
|
"step": 1551
|
|
},
|
|
{
|
|
"epoch": 0.7894201424211598,
|
|
"grad_norm": 1.0330865383148193,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4731,
|
|
"mean_token_accuracy": 0.8512973785400391,
|
|
"num_tokens": 247284678.0,
|
|
"step": 1552
|
|
},
|
|
{
|
|
"epoch": 0.7899287894201424,
|
|
"grad_norm": 1.049044132232666,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.8345746397972107,
|
|
"num_tokens": 247442396.0,
|
|
"step": 1553
|
|
},
|
|
{
|
|
"epoch": 0.7904374364191251,
|
|
"grad_norm": 1.0321537256240845,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5033,
|
|
"mean_token_accuracy": 0.8421740531921387,
|
|
"num_tokens": 247601438.0,
|
|
"step": 1554
|
|
},
|
|
{
|
|
"epoch": 0.7909460834181078,
|
|
"grad_norm": 1.083871841430664,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4968,
|
|
"mean_token_accuracy": 0.8436111211776733,
|
|
"num_tokens": 247766929.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.7914547304170906,
|
|
"grad_norm": 1.0228075981140137,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5253,
|
|
"mean_token_accuracy": 0.8366875648498535,
|
|
"num_tokens": 247935788.0,
|
|
"step": 1556
|
|
},
|
|
{
|
|
"epoch": 0.7919633774160733,
|
|
"grad_norm": 1.103209137916565,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4918,
|
|
"mean_token_accuracy": 0.8470031023025513,
|
|
"num_tokens": 248102197.0,
|
|
"step": 1557
|
|
},
|
|
{
|
|
"epoch": 0.7924720244150559,
|
|
"grad_norm": 1.0742714405059814,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4889,
|
|
"mean_token_accuracy": 0.8469794988632202,
|
|
"num_tokens": 248252748.0,
|
|
"step": 1558
|
|
},
|
|
{
|
|
"epoch": 0.7929806714140386,
|
|
"grad_norm": 1.0506287813186646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5021,
|
|
"mean_token_accuracy": 0.8429771661758423,
|
|
"num_tokens": 248410690.0,
|
|
"step": 1559
|
|
},
|
|
{
|
|
"epoch": 0.7934893184130214,
|
|
"grad_norm": 1.2658612728118896,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4979,
|
|
"mean_token_accuracy": 0.8446970582008362,
|
|
"num_tokens": 248562629.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.7939979654120041,
|
|
"grad_norm": 1.0374503135681152,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.529,
|
|
"mean_token_accuracy": 0.8339532613754272,
|
|
"num_tokens": 248724426.0,
|
|
"step": 1561
|
|
},
|
|
{
|
|
"epoch": 0.7945066124109867,
|
|
"grad_norm": 1.109431266784668,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5269,
|
|
"mean_token_accuracy": 0.8367703557014465,
|
|
"num_tokens": 248884711.0,
|
|
"step": 1562
|
|
},
|
|
{
|
|
"epoch": 0.7950152594099695,
|
|
"grad_norm": 1.0636353492736816,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5234,
|
|
"mean_token_accuracy": 0.8372197151184082,
|
|
"num_tokens": 249039319.0,
|
|
"step": 1563
|
|
},
|
|
{
|
|
"epoch": 0.7955239064089522,
|
|
"grad_norm": 1.0044738054275513,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5103,
|
|
"mean_token_accuracy": 0.8407605290412903,
|
|
"num_tokens": 249198668.0,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"epoch": 0.7960325534079349,
|
|
"grad_norm": 1.0944424867630005,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5106,
|
|
"mean_token_accuracy": 0.8392775058746338,
|
|
"num_tokens": 249355386.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.7965412004069176,
|
|
"grad_norm": 1.0104095935821533,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5258,
|
|
"mean_token_accuracy": 0.836336612701416,
|
|
"num_tokens": 249506726.0,
|
|
"step": 1566
|
|
},
|
|
{
|
|
"epoch": 0.7970498474059003,
|
|
"grad_norm": 1.019428014755249,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5437,
|
|
"mean_token_accuracy": 0.8311398029327393,
|
|
"num_tokens": 249668433.0,
|
|
"step": 1567
|
|
},
|
|
{
|
|
"epoch": 0.797558494404883,
|
|
"grad_norm": 1.0521516799926758,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4922,
|
|
"mean_token_accuracy": 0.8451628684997559,
|
|
"num_tokens": 249815595.0,
|
|
"step": 1568
|
|
},
|
|
{
|
|
"epoch": 0.7980671414038657,
|
|
"grad_norm": 1.0037367343902588,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5098,
|
|
"mean_token_accuracy": 0.839537501335144,
|
|
"num_tokens": 249977103.0,
|
|
"step": 1569
|
|
},
|
|
{
|
|
"epoch": 0.7985757884028484,
|
|
"grad_norm": 0.9603949189186096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5177,
|
|
"mean_token_accuracy": 0.8383146524429321,
|
|
"num_tokens": 250145992.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.7990844354018312,
|
|
"grad_norm": 1.1059879064559937,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5197,
|
|
"mean_token_accuracy": 0.837566614151001,
|
|
"num_tokens": 250280246.0,
|
|
"step": 1571
|
|
},
|
|
{
|
|
"epoch": 0.7995930824008138,
|
|
"grad_norm": 0.9839898943901062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8364046812057495,
|
|
"num_tokens": 250443792.0,
|
|
"step": 1572
|
|
},
|
|
{
|
|
"epoch": 0.8001017293997965,
|
|
"grad_norm": 0.919985294342041,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5083,
|
|
"mean_token_accuracy": 0.8388224840164185,
|
|
"num_tokens": 250599631.0,
|
|
"step": 1573
|
|
},
|
|
{
|
|
"epoch": 0.8006103763987793,
|
|
"grad_norm": 1.0868338346481323,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8346595168113708,
|
|
"num_tokens": 250747877.0,
|
|
"step": 1574
|
|
},
|
|
{
|
|
"epoch": 0.801119023397762,
|
|
"grad_norm": 1.0660415887832642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8343693614006042,
|
|
"num_tokens": 250900217.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.8016276703967447,
|
|
"grad_norm": 1.2554595470428467,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4897,
|
|
"mean_token_accuracy": 0.8446955680847168,
|
|
"num_tokens": 251052502.0,
|
|
"step": 1576
|
|
},
|
|
{
|
|
"epoch": 0.8021363173957273,
|
|
"grad_norm": 0.9247676134109497,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4928,
|
|
"mean_token_accuracy": 0.8457845449447632,
|
|
"num_tokens": 251208564.0,
|
|
"step": 1577
|
|
},
|
|
{
|
|
"epoch": 0.8026449643947101,
|
|
"grad_norm": 1.0211766958236694,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5086,
|
|
"mean_token_accuracy": 0.8419086933135986,
|
|
"num_tokens": 251352775.0,
|
|
"step": 1578
|
|
},
|
|
{
|
|
"epoch": 0.8031536113936928,
|
|
"grad_norm": 0.9540887475013733,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5056,
|
|
"mean_token_accuracy": 0.8426491022109985,
|
|
"num_tokens": 251518171.0,
|
|
"step": 1579
|
|
},
|
|
{
|
|
"epoch": 0.8036622583926755,
|
|
"grad_norm": 0.9458035826683044,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5137,
|
|
"mean_token_accuracy": 0.8412249088287354,
|
|
"num_tokens": 251677652.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.8041709053916581,
|
|
"grad_norm": 1.045344352722168,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5235,
|
|
"mean_token_accuracy": 0.8355517387390137,
|
|
"num_tokens": 251821333.0,
|
|
"step": 1581
|
|
},
|
|
{
|
|
"epoch": 0.8046795523906409,
|
|
"grad_norm": 1.0049554109573364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5681,
|
|
"mean_token_accuracy": 0.8258787989616394,
|
|
"num_tokens": 251995232.0,
|
|
"step": 1582
|
|
},
|
|
{
|
|
"epoch": 0.8051881993896236,
|
|
"grad_norm": 0.9822063446044922,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5013,
|
|
"mean_token_accuracy": 0.8422402739524841,
|
|
"num_tokens": 252151382.0,
|
|
"step": 1583
|
|
},
|
|
{
|
|
"epoch": 0.8056968463886063,
|
|
"grad_norm": 0.9994479417800903,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5343,
|
|
"mean_token_accuracy": 0.8352287411689758,
|
|
"num_tokens": 252318139.0,
|
|
"step": 1584
|
|
},
|
|
{
|
|
"epoch": 0.8062054933875891,
|
|
"grad_norm": 1.0065675973892212,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4998,
|
|
"mean_token_accuracy": 0.8429774641990662,
|
|
"num_tokens": 252479916.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.8067141403865717,
|
|
"grad_norm": 0.9369513392448425,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.513,
|
|
"mean_token_accuracy": 0.8410748839378357,
|
|
"num_tokens": 252655015.0,
|
|
"step": 1586
|
|
},
|
|
{
|
|
"epoch": 0.8072227873855544,
|
|
"grad_norm": 0.9758577942848206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5066,
|
|
"mean_token_accuracy": 0.8424795866012573,
|
|
"num_tokens": 252810669.0,
|
|
"step": 1587
|
|
},
|
|
{
|
|
"epoch": 0.8077314343845371,
|
|
"grad_norm": 0.9955373406410217,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5034,
|
|
"mean_token_accuracy": 0.8429820537567139,
|
|
"num_tokens": 252975872.0,
|
|
"step": 1588
|
|
},
|
|
{
|
|
"epoch": 0.8082400813835199,
|
|
"grad_norm": 1.0039629936218262,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5167,
|
|
"mean_token_accuracy": 0.838952898979187,
|
|
"num_tokens": 253140927.0,
|
|
"step": 1589
|
|
},
|
|
{
|
|
"epoch": 0.8087487283825026,
|
|
"grad_norm": 1.0134005546569824,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5233,
|
|
"mean_token_accuracy": 0.8362022638320923,
|
|
"num_tokens": 253298562.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.8092573753814852,
|
|
"grad_norm": 0.9585344195365906,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5182,
|
|
"mean_token_accuracy": 0.8382187485694885,
|
|
"num_tokens": 253469245.0,
|
|
"step": 1591
|
|
},
|
|
{
|
|
"epoch": 0.8097660223804679,
|
|
"grad_norm": 1.0151357650756836,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4573,
|
|
"mean_token_accuracy": 0.8557564616203308,
|
|
"num_tokens": 253623711.0,
|
|
"step": 1592
|
|
},
|
|
{
|
|
"epoch": 0.8102746693794507,
|
|
"grad_norm": 1.0037392377853394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4939,
|
|
"mean_token_accuracy": 0.8455232381820679,
|
|
"num_tokens": 253793013.0,
|
|
"step": 1593
|
|
},
|
|
{
|
|
"epoch": 0.8107833163784334,
|
|
"grad_norm": 1.0920860767364502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5494,
|
|
"mean_token_accuracy": 0.8293622732162476,
|
|
"num_tokens": 253951728.0,
|
|
"step": 1594
|
|
},
|
|
{
|
|
"epoch": 0.811291963377416,
|
|
"grad_norm": 1.0275788307189941,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5097,
|
|
"mean_token_accuracy": 0.8406323194503784,
|
|
"num_tokens": 254107635.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.8118006103763988,
|
|
"grad_norm": 0.9545956254005432,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8365148901939392,
|
|
"num_tokens": 254256769.0,
|
|
"step": 1596
|
|
},
|
|
{
|
|
"epoch": 0.8123092573753815,
|
|
"grad_norm": 1.0170985460281372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5504,
|
|
"mean_token_accuracy": 0.8302421569824219,
|
|
"num_tokens": 254426121.0,
|
|
"step": 1597
|
|
},
|
|
{
|
|
"epoch": 0.8128179043743642,
|
|
"grad_norm": 0.9216252565383911,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5147,
|
|
"mean_token_accuracy": 0.8396792411804199,
|
|
"num_tokens": 254592698.0,
|
|
"step": 1598
|
|
},
|
|
{
|
|
"epoch": 0.8133265513733469,
|
|
"grad_norm": 0.9474987983703613,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.519,
|
|
"mean_token_accuracy": 0.8377450108528137,
|
|
"num_tokens": 254757687.0,
|
|
"step": 1599
|
|
},
|
|
{
|
|
"epoch": 0.8138351983723296,
|
|
"grad_norm": 1.0128676891326904,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5186,
|
|
"mean_token_accuracy": 0.8377546072006226,
|
|
"num_tokens": 254907582.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.8143438453713123,
|
|
"grad_norm": 1.0330684185028076,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4952,
|
|
"mean_token_accuracy": 0.8445876836776733,
|
|
"num_tokens": 255060864.0,
|
|
"step": 1601
|
|
},
|
|
{
|
|
"epoch": 0.814852492370295,
|
|
"grad_norm": 1.0349136590957642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4896,
|
|
"mean_token_accuracy": 0.8454582691192627,
|
|
"num_tokens": 255220612.0,
|
|
"step": 1602
|
|
},
|
|
{
|
|
"epoch": 0.8153611393692777,
|
|
"grad_norm": 1.0863885879516602,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5662,
|
|
"mean_token_accuracy": 0.8244870901107788,
|
|
"num_tokens": 255381676.0,
|
|
"step": 1603
|
|
},
|
|
{
|
|
"epoch": 0.8158697863682605,
|
|
"grad_norm": 1.0500446557998657,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4875,
|
|
"mean_token_accuracy": 0.8489367961883545,
|
|
"num_tokens": 255530491.0,
|
|
"step": 1604
|
|
},
|
|
{
|
|
"epoch": 0.8163784333672431,
|
|
"grad_norm": 0.9453365206718445,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5082,
|
|
"mean_token_accuracy": 0.8424679040908813,
|
|
"num_tokens": 255693669.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.8168870803662258,
|
|
"grad_norm": 1.0558098554611206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5168,
|
|
"mean_token_accuracy": 0.8376903533935547,
|
|
"num_tokens": 255861693.0,
|
|
"step": 1606
|
|
},
|
|
{
|
|
"epoch": 0.8173957273652085,
|
|
"grad_norm": 1.0058090686798096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5179,
|
|
"mean_token_accuracy": 0.8368159532546997,
|
|
"num_tokens": 256028864.0,
|
|
"step": 1607
|
|
},
|
|
{
|
|
"epoch": 0.8179043743641913,
|
|
"grad_norm": 1.1240429878234863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4665,
|
|
"mean_token_accuracy": 0.8514443635940552,
|
|
"num_tokens": 256184434.0,
|
|
"step": 1608
|
|
},
|
|
{
|
|
"epoch": 0.818413021363174,
|
|
"grad_norm": 1.0584617853164673,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4774,
|
|
"mean_token_accuracy": 0.8490929007530212,
|
|
"num_tokens": 256342000.0,
|
|
"step": 1609
|
|
},
|
|
{
|
|
"epoch": 0.8189216683621566,
|
|
"grad_norm": 1.1094365119934082,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5219,
|
|
"mean_token_accuracy": 0.8365220427513123,
|
|
"num_tokens": 256517073.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.8194303153611394,
|
|
"grad_norm": 1.046618938446045,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4842,
|
|
"mean_token_accuracy": 0.8463455438613892,
|
|
"num_tokens": 256674055.0,
|
|
"step": 1611
|
|
},
|
|
{
|
|
"epoch": 0.8199389623601221,
|
|
"grad_norm": 1.098393201828003,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5381,
|
|
"mean_token_accuracy": 0.8320057392120361,
|
|
"num_tokens": 256848111.0,
|
|
"step": 1612
|
|
},
|
|
{
|
|
"epoch": 0.8204476093591048,
|
|
"grad_norm": 1.0480852127075195,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4944,
|
|
"mean_token_accuracy": 0.8455042243003845,
|
|
"num_tokens": 257016873.0,
|
|
"step": 1613
|
|
},
|
|
{
|
|
"epoch": 0.8209562563580874,
|
|
"grad_norm": 0.9647024869918823,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5353,
|
|
"mean_token_accuracy": 0.8356543779373169,
|
|
"num_tokens": 257182035.0,
|
|
"step": 1614
|
|
},
|
|
{
|
|
"epoch": 0.8214649033570702,
|
|
"grad_norm": 1.0326327085494995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5481,
|
|
"mean_token_accuracy": 0.8293918371200562,
|
|
"num_tokens": 257351888.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.8219735503560529,
|
|
"grad_norm": 0.9589417576789856,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5145,
|
|
"mean_token_accuracy": 0.8386868238449097,
|
|
"num_tokens": 257508833.0,
|
|
"step": 1616
|
|
},
|
|
{
|
|
"epoch": 0.8224821973550356,
|
|
"grad_norm": 1.074066400527954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4979,
|
|
"mean_token_accuracy": 0.8440150618553162,
|
|
"num_tokens": 257666306.0,
|
|
"step": 1617
|
|
},
|
|
{
|
|
"epoch": 0.8229908443540183,
|
|
"grad_norm": 0.9626802206039429,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5153,
|
|
"mean_token_accuracy": 0.8389250636100769,
|
|
"num_tokens": 257832236.0,
|
|
"step": 1618
|
|
},
|
|
{
|
|
"epoch": 0.823499491353001,
|
|
"grad_norm": 1.091043472290039,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.488,
|
|
"mean_token_accuracy": 0.8457168340682983,
|
|
"num_tokens": 257996091.0,
|
|
"step": 1619
|
|
},
|
|
{
|
|
"epoch": 0.8240081383519837,
|
|
"grad_norm": 0.9529052972793579,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5091,
|
|
"mean_token_accuracy": 0.8395473957061768,
|
|
"num_tokens": 258156268.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.8245167853509664,
|
|
"grad_norm": 1.021721601486206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5263,
|
|
"mean_token_accuracy": 0.8362045884132385,
|
|
"num_tokens": 258323374.0,
|
|
"step": 1621
|
|
},
|
|
{
|
|
"epoch": 0.8250254323499492,
|
|
"grad_norm": 0.971449077129364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5149,
|
|
"mean_token_accuracy": 0.839565634727478,
|
|
"num_tokens": 258495850.0,
|
|
"step": 1622
|
|
},
|
|
{
|
|
"epoch": 0.8255340793489319,
|
|
"grad_norm": 0.9889821410179138,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5132,
|
|
"mean_token_accuracy": 0.8389104604721069,
|
|
"num_tokens": 258653458.0,
|
|
"step": 1623
|
|
},
|
|
{
|
|
"epoch": 0.8260427263479145,
|
|
"grad_norm": 1.0107471942901611,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5253,
|
|
"mean_token_accuracy": 0.8384637832641602,
|
|
"num_tokens": 258823978.0,
|
|
"step": 1624
|
|
},
|
|
{
|
|
"epoch": 0.8265513733468972,
|
|
"grad_norm": 1.0081323385238647,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5071,
|
|
"mean_token_accuracy": 0.8410070538520813,
|
|
"num_tokens": 258981414.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.82706002034588,
|
|
"grad_norm": 0.9704625606536865,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5111,
|
|
"mean_token_accuracy": 0.8383285403251648,
|
|
"num_tokens": 259143758.0,
|
|
"step": 1626
|
|
},
|
|
{
|
|
"epoch": 0.8275686673448627,
|
|
"grad_norm": 1.0984337329864502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8355196118354797,
|
|
"num_tokens": 259297261.0,
|
|
"step": 1627
|
|
},
|
|
{
|
|
"epoch": 0.8280773143438453,
|
|
"grad_norm": 1.045357584953308,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8388932943344116,
|
|
"num_tokens": 259459034.0,
|
|
"step": 1628
|
|
},
|
|
{
|
|
"epoch": 0.828585961342828,
|
|
"grad_norm": 0.9529820084571838,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4971,
|
|
"mean_token_accuracy": 0.8412728309631348,
|
|
"num_tokens": 259614641.0,
|
|
"step": 1629
|
|
},
|
|
{
|
|
"epoch": 0.8290946083418108,
|
|
"grad_norm": 1.033054232597351,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4947,
|
|
"mean_token_accuracy": 0.8445708751678467,
|
|
"num_tokens": 259769628.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.8296032553407935,
|
|
"grad_norm": 1.1199381351470947,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5027,
|
|
"mean_token_accuracy": 0.8430094718933105,
|
|
"num_tokens": 259925515.0,
|
|
"step": 1631
|
|
},
|
|
{
|
|
"epoch": 0.8301119023397762,
|
|
"grad_norm": 1.015918493270874,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5572,
|
|
"mean_token_accuracy": 0.8273895978927612,
|
|
"num_tokens": 260091045.0,
|
|
"step": 1632
|
|
},
|
|
{
|
|
"epoch": 0.830620549338759,
|
|
"grad_norm": 1.0634037256240845,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4982,
|
|
"mean_token_accuracy": 0.8440449237823486,
|
|
"num_tokens": 260243318.0,
|
|
"step": 1633
|
|
},
|
|
{
|
|
"epoch": 0.8311291963377416,
|
|
"grad_norm": 1.0427650213241577,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.521,
|
|
"mean_token_accuracy": 0.838854193687439,
|
|
"num_tokens": 260414955.0,
|
|
"step": 1634
|
|
},
|
|
{
|
|
"epoch": 0.8316378433367243,
|
|
"grad_norm": 1.0316675901412964,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5087,
|
|
"mean_token_accuracy": 0.8421332836151123,
|
|
"num_tokens": 260573372.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.832146490335707,
|
|
"grad_norm": 0.9899135828018188,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4595,
|
|
"mean_token_accuracy": 0.853246808052063,
|
|
"num_tokens": 260738547.0,
|
|
"step": 1636
|
|
},
|
|
{
|
|
"epoch": 0.8326551373346898,
|
|
"grad_norm": 0.9942594170570374,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4991,
|
|
"mean_token_accuracy": 0.8443979620933533,
|
|
"num_tokens": 260892133.0,
|
|
"step": 1637
|
|
},
|
|
{
|
|
"epoch": 0.8331637843336724,
|
|
"grad_norm": 1.1206623315811157,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8334402441978455,
|
|
"num_tokens": 261051308.0,
|
|
"step": 1638
|
|
},
|
|
{
|
|
"epoch": 0.8336724313326551,
|
|
"grad_norm": 1.0370997190475464,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5182,
|
|
"mean_token_accuracy": 0.8363610506057739,
|
|
"num_tokens": 261205776.0,
|
|
"step": 1639
|
|
},
|
|
{
|
|
"epoch": 0.8341810783316378,
|
|
"grad_norm": 1.0111100673675537,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4809,
|
|
"mean_token_accuracy": 0.8484663963317871,
|
|
"num_tokens": 261358402.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.8346897253306206,
|
|
"grad_norm": 1.0508127212524414,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5326,
|
|
"mean_token_accuracy": 0.8332227468490601,
|
|
"num_tokens": 261524731.0,
|
|
"step": 1641
|
|
},
|
|
{
|
|
"epoch": 0.8351983723296033,
|
|
"grad_norm": 0.9737936854362488,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5032,
|
|
"mean_token_accuracy": 0.8433814644813538,
|
|
"num_tokens": 261687233.0,
|
|
"step": 1642
|
|
},
|
|
{
|
|
"epoch": 0.8357070193285859,
|
|
"grad_norm": 1.0474128723144531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5228,
|
|
"mean_token_accuracy": 0.8364410400390625,
|
|
"num_tokens": 261854243.0,
|
|
"step": 1643
|
|
},
|
|
{
|
|
"epoch": 0.8362156663275687,
|
|
"grad_norm": 0.9719751477241516,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8366215229034424,
|
|
"num_tokens": 262026963.0,
|
|
"step": 1644
|
|
},
|
|
{
|
|
"epoch": 0.8367243133265514,
|
|
"grad_norm": 0.9536095857620239,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4901,
|
|
"mean_token_accuracy": 0.8455750942230225,
|
|
"num_tokens": 262196156.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.8372329603255341,
|
|
"grad_norm": 0.9847731590270996,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5259,
|
|
"mean_token_accuracy": 0.8377428650856018,
|
|
"num_tokens": 262365347.0,
|
|
"step": 1646
|
|
},
|
|
{
|
|
"epoch": 0.8377416073245167,
|
|
"grad_norm": 1.059686541557312,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5206,
|
|
"mean_token_accuracy": 0.8395569324493408,
|
|
"num_tokens": 262519307.0,
|
|
"step": 1647
|
|
},
|
|
{
|
|
"epoch": 0.8382502543234995,
|
|
"grad_norm": 0.9664161801338196,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.843921422958374,
|
|
"num_tokens": 262673650.0,
|
|
"step": 1648
|
|
},
|
|
{
|
|
"epoch": 0.8387589013224822,
|
|
"grad_norm": 1.0741627216339111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5116,
|
|
"mean_token_accuracy": 0.8394699096679688,
|
|
"num_tokens": 262812546.0,
|
|
"step": 1649
|
|
},
|
|
{
|
|
"epoch": 0.8392675483214649,
|
|
"grad_norm": 1.0161736011505127,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5391,
|
|
"mean_token_accuracy": 0.832436203956604,
|
|
"num_tokens": 262970304.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.8397761953204476,
|
|
"grad_norm": 1.0764623880386353,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5357,
|
|
"mean_token_accuracy": 0.834117591381073,
|
|
"num_tokens": 263132275.0,
|
|
"step": 1651
|
|
},
|
|
{
|
|
"epoch": 0.8402848423194303,
|
|
"grad_norm": 1.0434576272964478,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5113,
|
|
"mean_token_accuracy": 0.8394972085952759,
|
|
"num_tokens": 263293537.0,
|
|
"step": 1652
|
|
},
|
|
{
|
|
"epoch": 0.840793489318413,
|
|
"grad_norm": 0.9257302284240723,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5123,
|
|
"mean_token_accuracy": 0.840105414390564,
|
|
"num_tokens": 263455476.0,
|
|
"step": 1653
|
|
},
|
|
{
|
|
"epoch": 0.8413021363173957,
|
|
"grad_norm": 0.974478006362915,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4996,
|
|
"mean_token_accuracy": 0.8424463868141174,
|
|
"num_tokens": 263615516.0,
|
|
"step": 1654
|
|
},
|
|
{
|
|
"epoch": 0.8418107833163785,
|
|
"grad_norm": 1.0634433031082153,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4987,
|
|
"mean_token_accuracy": 0.8448903560638428,
|
|
"num_tokens": 263759112.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.8423194303153612,
|
|
"grad_norm": 0.922835648059845,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4973,
|
|
"mean_token_accuracy": 0.8442589640617371,
|
|
"num_tokens": 263924035.0,
|
|
"step": 1656
|
|
},
|
|
{
|
|
"epoch": 0.8428280773143438,
|
|
"grad_norm": 0.9309987425804138,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5619,
|
|
"mean_token_accuracy": 0.8253204226493835,
|
|
"num_tokens": 264096716.0,
|
|
"step": 1657
|
|
},
|
|
{
|
|
"epoch": 0.8433367243133265,
|
|
"grad_norm": 0.9536299705505371,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5241,
|
|
"mean_token_accuracy": 0.8355963826179504,
|
|
"num_tokens": 264253124.0,
|
|
"step": 1658
|
|
},
|
|
{
|
|
"epoch": 0.8438453713123093,
|
|
"grad_norm": 0.9557009339332581,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.484,
|
|
"mean_token_accuracy": 0.8470132946968079,
|
|
"num_tokens": 264412558.0,
|
|
"step": 1659
|
|
},
|
|
{
|
|
"epoch": 0.844354018311292,
|
|
"grad_norm": 0.8998320698738098,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.837689220905304,
|
|
"num_tokens": 264578027.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.8448626653102747,
|
|
"grad_norm": 0.9375623464584351,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5143,
|
|
"mean_token_accuracy": 0.8389825224876404,
|
|
"num_tokens": 264737779.0,
|
|
"step": 1661
|
|
},
|
|
{
|
|
"epoch": 0.8453713123092573,
|
|
"grad_norm": 1.0092815160751343,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5244,
|
|
"mean_token_accuracy": 0.8360074758529663,
|
|
"num_tokens": 264888054.0,
|
|
"step": 1662
|
|
},
|
|
{
|
|
"epoch": 0.8458799593082401,
|
|
"grad_norm": 0.9785453677177429,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5214,
|
|
"mean_token_accuracy": 0.837114155292511,
|
|
"num_tokens": 265048672.0,
|
|
"step": 1663
|
|
},
|
|
{
|
|
"epoch": 0.8463886063072228,
|
|
"grad_norm": 3.0526108741760254,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5065,
|
|
"mean_token_accuracy": 0.8422542810440063,
|
|
"num_tokens": 265191225.0,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"epoch": 0.8468972533062055,
|
|
"grad_norm": 1.1187739372253418,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5326,
|
|
"mean_token_accuracy": 0.8355621695518494,
|
|
"num_tokens": 265357998.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.8474059003051883,
|
|
"grad_norm": 0.957465648651123,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5054,
|
|
"mean_token_accuracy": 0.8424095511436462,
|
|
"num_tokens": 265529823.0,
|
|
"step": 1666
|
|
},
|
|
{
|
|
"epoch": 0.8479145473041709,
|
|
"grad_norm": 1.0484718084335327,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5208,
|
|
"mean_token_accuracy": 0.8372833728790283,
|
|
"num_tokens": 265683573.0,
|
|
"step": 1667
|
|
},
|
|
{
|
|
"epoch": 0.8484231943031536,
|
|
"grad_norm": 0.9759782552719116,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4765,
|
|
"mean_token_accuracy": 0.8503193259239197,
|
|
"num_tokens": 265845965.0,
|
|
"step": 1668
|
|
},
|
|
{
|
|
"epoch": 0.8489318413021363,
|
|
"grad_norm": 1.09355628490448,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5445,
|
|
"mean_token_accuracy": 0.8312594890594482,
|
|
"num_tokens": 265991385.0,
|
|
"step": 1669
|
|
},
|
|
{
|
|
"epoch": 0.8494404883011191,
|
|
"grad_norm": 1.0344514846801758,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5187,
|
|
"mean_token_accuracy": 0.8376867175102234,
|
|
"num_tokens": 266161615.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.8499491353001017,
|
|
"grad_norm": 1.143629789352417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5329,
|
|
"mean_token_accuracy": 0.8344286680221558,
|
|
"num_tokens": 266310118.0,
|
|
"step": 1671
|
|
},
|
|
{
|
|
"epoch": 0.8504577822990844,
|
|
"grad_norm": 1.0117353200912476,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4985,
|
|
"mean_token_accuracy": 0.8438706398010254,
|
|
"num_tokens": 266462436.0,
|
|
"step": 1672
|
|
},
|
|
{
|
|
"epoch": 0.8509664292980671,
|
|
"grad_norm": 1.0194602012634277,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5075,
|
|
"mean_token_accuracy": 0.8406510353088379,
|
|
"num_tokens": 266613253.0,
|
|
"step": 1673
|
|
},
|
|
{
|
|
"epoch": 0.8514750762970499,
|
|
"grad_norm": 1.0524338483810425,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5141,
|
|
"mean_token_accuracy": 0.839398205280304,
|
|
"num_tokens": 266773293.0,
|
|
"step": 1674
|
|
},
|
|
{
|
|
"epoch": 0.8519837232960326,
|
|
"grad_norm": 1.0298408269882202,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5216,
|
|
"mean_token_accuracy": 0.8359988927841187,
|
|
"num_tokens": 266935261.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.8524923702950152,
|
|
"grad_norm": 1.1193722486495972,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5277,
|
|
"mean_token_accuracy": 0.8361387252807617,
|
|
"num_tokens": 267100828.0,
|
|
"step": 1676
|
|
},
|
|
{
|
|
"epoch": 0.853001017293998,
|
|
"grad_norm": 1.1007781028747559,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5152,
|
|
"mean_token_accuracy": 0.8386905789375305,
|
|
"num_tokens": 267269826.0,
|
|
"step": 1677
|
|
},
|
|
{
|
|
"epoch": 0.8535096642929807,
|
|
"grad_norm": 0.9150673747062683,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4983,
|
|
"mean_token_accuracy": 0.8425648212432861,
|
|
"num_tokens": 267429972.0,
|
|
"step": 1678
|
|
},
|
|
{
|
|
"epoch": 0.8540183112919634,
|
|
"grad_norm": 1.072229027748108,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.8400737643241882,
|
|
"num_tokens": 267590240.0,
|
|
"step": 1679
|
|
},
|
|
{
|
|
"epoch": 0.854526958290946,
|
|
"grad_norm": 0.9825935363769531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4731,
|
|
"mean_token_accuracy": 0.8503822088241577,
|
|
"num_tokens": 267748631.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.8550356052899288,
|
|
"grad_norm": 0.9460796117782593,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5178,
|
|
"mean_token_accuracy": 0.8373454213142395,
|
|
"num_tokens": 267906365.0,
|
|
"step": 1681
|
|
},
|
|
{
|
|
"epoch": 0.8555442522889115,
|
|
"grad_norm": 0.9841094613075256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4899,
|
|
"mean_token_accuracy": 0.8446120023727417,
|
|
"num_tokens": 268058483.0,
|
|
"step": 1682
|
|
},
|
|
{
|
|
"epoch": 0.8560528992878942,
|
|
"grad_norm": 1.0456801652908325,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4919,
|
|
"mean_token_accuracy": 0.8460915684700012,
|
|
"num_tokens": 268217173.0,
|
|
"step": 1683
|
|
},
|
|
{
|
|
"epoch": 0.8565615462868769,
|
|
"grad_norm": 0.9893629550933838,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.8375509977340698,
|
|
"num_tokens": 268382188.0,
|
|
"step": 1684
|
|
},
|
|
{
|
|
"epoch": 0.8570701932858596,
|
|
"grad_norm": 1.030446171760559,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5088,
|
|
"mean_token_accuracy": 0.8399752378463745,
|
|
"num_tokens": 268543730.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.8575788402848423,
|
|
"grad_norm": 1.0455074310302734,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4968,
|
|
"mean_token_accuracy": 0.8435068130493164,
|
|
"num_tokens": 268701052.0,
|
|
"step": 1686
|
|
},
|
|
{
|
|
"epoch": 0.858087487283825,
|
|
"grad_norm": 1.0250892639160156,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4961,
|
|
"mean_token_accuracy": 0.843188464641571,
|
|
"num_tokens": 268852461.0,
|
|
"step": 1687
|
|
},
|
|
{
|
|
"epoch": 0.8585961342828077,
|
|
"grad_norm": 1.1781569719314575,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4759,
|
|
"mean_token_accuracy": 0.8500210642814636,
|
|
"num_tokens": 268996708.0,
|
|
"step": 1688
|
|
},
|
|
{
|
|
"epoch": 0.8591047812817905,
|
|
"grad_norm": 1.1067460775375366,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5362,
|
|
"mean_token_accuracy": 0.8340690732002258,
|
|
"num_tokens": 269146990.0,
|
|
"step": 1689
|
|
},
|
|
{
|
|
"epoch": 0.8596134282807731,
|
|
"grad_norm": 1.0462528467178345,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.473,
|
|
"mean_token_accuracy": 0.8502258062362671,
|
|
"num_tokens": 269311951.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.8601220752797558,
|
|
"grad_norm": 1.0539947748184204,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5156,
|
|
"mean_token_accuracy": 0.838455080986023,
|
|
"num_tokens": 269450985.0,
|
|
"step": 1691
|
|
},
|
|
{
|
|
"epoch": 0.8606307222787386,
|
|
"grad_norm": 1.1058789491653442,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4725,
|
|
"mean_token_accuracy": 0.8504195213317871,
|
|
"num_tokens": 269598930.0,
|
|
"step": 1692
|
|
},
|
|
{
|
|
"epoch": 0.8611393692777213,
|
|
"grad_norm": 1.2138804197311401,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5342,
|
|
"mean_token_accuracy": 0.8357694745063782,
|
|
"num_tokens": 269760034.0,
|
|
"step": 1693
|
|
},
|
|
{
|
|
"epoch": 0.861648016276704,
|
|
"grad_norm": 1.0645147562026978,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8370406627655029,
|
|
"num_tokens": 269921692.0,
|
|
"step": 1694
|
|
},
|
|
{
|
|
"epoch": 0.8621566632756866,
|
|
"grad_norm": 1.1063612699508667,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4897,
|
|
"mean_token_accuracy": 0.8471656441688538,
|
|
"num_tokens": 270073024.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.8626653102746694,
|
|
"grad_norm": 1.1592669486999512,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8345301151275635,
|
|
"num_tokens": 270232620.0,
|
|
"step": 1696
|
|
},
|
|
{
|
|
"epoch": 0.8631739572736521,
|
|
"grad_norm": 1.076188325881958,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5072,
|
|
"mean_token_accuracy": 0.8404670357704163,
|
|
"num_tokens": 270385773.0,
|
|
"step": 1697
|
|
},
|
|
{
|
|
"epoch": 0.8636826042726348,
|
|
"grad_norm": 0.9822161793708801,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5015,
|
|
"mean_token_accuracy": 0.8447793126106262,
|
|
"num_tokens": 270560619.0,
|
|
"step": 1698
|
|
},
|
|
{
|
|
"epoch": 0.8641912512716174,
|
|
"grad_norm": 1.886228084564209,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5253,
|
|
"mean_token_accuracy": 0.8372279405593872,
|
|
"num_tokens": 270719944.0,
|
|
"step": 1699
|
|
},
|
|
{
|
|
"epoch": 0.8646998982706002,
|
|
"grad_norm": 1.1022945642471313,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4961,
|
|
"mean_token_accuracy": 0.8436857461929321,
|
|
"num_tokens": 270874728.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.8652085452695829,
|
|
"grad_norm": 1.010673999786377,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5085,
|
|
"mean_token_accuracy": 0.8426359295845032,
|
|
"num_tokens": 271036953.0,
|
|
"step": 1701
|
|
},
|
|
{
|
|
"epoch": 0.8657171922685656,
|
|
"grad_norm": 0.9527826905250549,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5034,
|
|
"mean_token_accuracy": 0.8433707356452942,
|
|
"num_tokens": 271202782.0,
|
|
"step": 1702
|
|
},
|
|
{
|
|
"epoch": 0.8662258392675484,
|
|
"grad_norm": 1.014635682106018,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5183,
|
|
"mean_token_accuracy": 0.8393032550811768,
|
|
"num_tokens": 271367311.0,
|
|
"step": 1703
|
|
},
|
|
{
|
|
"epoch": 0.866734486266531,
|
|
"grad_norm": 1.0066399574279785,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.839614748954773,
|
|
"num_tokens": 271529123.0,
|
|
"step": 1704
|
|
},
|
|
{
|
|
"epoch": 0.8672431332655137,
|
|
"grad_norm": 1.093875765800476,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4982,
|
|
"mean_token_accuracy": 0.844380259513855,
|
|
"num_tokens": 271673733.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.8677517802644964,
|
|
"grad_norm": 1.0652796030044556,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4892,
|
|
"mean_token_accuracy": 0.8460615873336792,
|
|
"num_tokens": 271828281.0,
|
|
"step": 1706
|
|
},
|
|
{
|
|
"epoch": 0.8682604272634792,
|
|
"grad_norm": 1.0834511518478394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5299,
|
|
"mean_token_accuracy": 0.8352327346801758,
|
|
"num_tokens": 271997170.0,
|
|
"step": 1707
|
|
},
|
|
{
|
|
"epoch": 0.8687690742624619,
|
|
"grad_norm": 1.0391765832901,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4862,
|
|
"mean_token_accuracy": 0.8471356630325317,
|
|
"num_tokens": 272152291.0,
|
|
"step": 1708
|
|
},
|
|
{
|
|
"epoch": 0.8692777212614445,
|
|
"grad_norm": 1.064730167388916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5366,
|
|
"mean_token_accuracy": 0.8317385315895081,
|
|
"num_tokens": 272307347.0,
|
|
"step": 1709
|
|
},
|
|
{
|
|
"epoch": 0.8697863682604272,
|
|
"grad_norm": 1.0805742740631104,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5291,
|
|
"mean_token_accuracy": 0.8354371786117554,
|
|
"num_tokens": 272469049.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.87029501525941,
|
|
"grad_norm": 1.0154848098754883,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.527,
|
|
"mean_token_accuracy": 0.8360839486122131,
|
|
"num_tokens": 272612517.0,
|
|
"step": 1711
|
|
},
|
|
{
|
|
"epoch": 0.8708036622583927,
|
|
"grad_norm": 1.0521576404571533,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.543,
|
|
"mean_token_accuracy": 0.831218957901001,
|
|
"num_tokens": 272768408.0,
|
|
"step": 1712
|
|
},
|
|
{
|
|
"epoch": 0.8713123092573754,
|
|
"grad_norm": 0.9749472141265869,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5158,
|
|
"mean_token_accuracy": 0.8387061953544617,
|
|
"num_tokens": 272919347.0,
|
|
"step": 1713
|
|
},
|
|
{
|
|
"epoch": 0.8718209562563581,
|
|
"grad_norm": 1.0825953483581543,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5007,
|
|
"mean_token_accuracy": 0.8432174921035767,
|
|
"num_tokens": 273089732.0,
|
|
"step": 1714
|
|
},
|
|
{
|
|
"epoch": 0.8723296032553408,
|
|
"grad_norm": 1.0068557262420654,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4934,
|
|
"mean_token_accuracy": 0.8477575778961182,
|
|
"num_tokens": 273242780.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.8728382502543235,
|
|
"grad_norm": 0.9401190280914307,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4991,
|
|
"mean_token_accuracy": 0.843682050704956,
|
|
"num_tokens": 273407433.0,
|
|
"step": 1716
|
|
},
|
|
{
|
|
"epoch": 0.8733468972533062,
|
|
"grad_norm": 0.9649401903152466,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5146,
|
|
"mean_token_accuracy": 0.8399534225463867,
|
|
"num_tokens": 273570166.0,
|
|
"step": 1717
|
|
},
|
|
{
|
|
"epoch": 0.873855544252289,
|
|
"grad_norm": 0.9992680549621582,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5289,
|
|
"mean_token_accuracy": 0.8345081210136414,
|
|
"num_tokens": 273726324.0,
|
|
"step": 1718
|
|
},
|
|
{
|
|
"epoch": 0.8743641912512716,
|
|
"grad_norm": 1.0180540084838867,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5199,
|
|
"mean_token_accuracy": 0.8366790413856506,
|
|
"num_tokens": 273890563.0,
|
|
"step": 1719
|
|
},
|
|
{
|
|
"epoch": 0.8748728382502543,
|
|
"grad_norm": 0.9962469339370728,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4778,
|
|
"mean_token_accuracy": 0.8489860892295837,
|
|
"num_tokens": 274048187.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.875381485249237,
|
|
"grad_norm": 0.9790189266204834,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4766,
|
|
"mean_token_accuracy": 0.8492850065231323,
|
|
"num_tokens": 274221855.0,
|
|
"step": 1721
|
|
},
|
|
{
|
|
"epoch": 0.8758901322482198,
|
|
"grad_norm": 0.9516292810440063,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5212,
|
|
"mean_token_accuracy": 0.8369489908218384,
|
|
"num_tokens": 274377833.0,
|
|
"step": 1722
|
|
},
|
|
{
|
|
"epoch": 0.8763987792472024,
|
|
"grad_norm": 0.9753563404083252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.8327175378799438,
|
|
"num_tokens": 274541936.0,
|
|
"step": 1723
|
|
},
|
|
{
|
|
"epoch": 0.8769074262461851,
|
|
"grad_norm": 0.9900280237197876,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5311,
|
|
"mean_token_accuracy": 0.8332983255386353,
|
|
"num_tokens": 274705006.0,
|
|
"step": 1724
|
|
},
|
|
{
|
|
"epoch": 0.8774160732451679,
|
|
"grad_norm": 1.0057647228240967,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5025,
|
|
"mean_token_accuracy": 0.8407544493675232,
|
|
"num_tokens": 274854136.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.8779247202441506,
|
|
"grad_norm": 0.9810066223144531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.491,
|
|
"mean_token_accuracy": 0.8476090431213379,
|
|
"num_tokens": 275014059.0,
|
|
"step": 1726
|
|
},
|
|
{
|
|
"epoch": 0.8784333672431333,
|
|
"grad_norm": 1.0328130722045898,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5533,
|
|
"mean_token_accuracy": 0.8297367095947266,
|
|
"num_tokens": 275187361.0,
|
|
"step": 1727
|
|
},
|
|
{
|
|
"epoch": 0.8789420142421159,
|
|
"grad_norm": 0.9669415354728699,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4991,
|
|
"mean_token_accuracy": 0.8440873622894287,
|
|
"num_tokens": 275338869.0,
|
|
"step": 1728
|
|
},
|
|
{
|
|
"epoch": 0.8794506612410987,
|
|
"grad_norm": 0.9246980547904968,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5048,
|
|
"mean_token_accuracy": 0.8422975540161133,
|
|
"num_tokens": 275496774.0,
|
|
"step": 1729
|
|
},
|
|
{
|
|
"epoch": 0.8799593082400814,
|
|
"grad_norm": 1.1389602422714233,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8315758109092712,
|
|
"num_tokens": 275653401.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.8804679552390641,
|
|
"grad_norm": 1.0125043392181396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4926,
|
|
"mean_token_accuracy": 0.8451186418533325,
|
|
"num_tokens": 275806070.0,
|
|
"step": 1731
|
|
},
|
|
{
|
|
"epoch": 0.8809766022380467,
|
|
"grad_norm": 0.9579908847808838,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5061,
|
|
"mean_token_accuracy": 0.8411065340042114,
|
|
"num_tokens": 275962373.0,
|
|
"step": 1732
|
|
},
|
|
{
|
|
"epoch": 0.8814852492370295,
|
|
"grad_norm": 1.0477601289749146,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5127,
|
|
"mean_token_accuracy": 0.8413093686103821,
|
|
"num_tokens": 276129118.0,
|
|
"step": 1733
|
|
},
|
|
{
|
|
"epoch": 0.8819938962360122,
|
|
"grad_norm": 0.9794126749038696,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5198,
|
|
"mean_token_accuracy": 0.8378043174743652,
|
|
"num_tokens": 276287064.0,
|
|
"step": 1734
|
|
},
|
|
{
|
|
"epoch": 0.8825025432349949,
|
|
"grad_norm": 1.0100884437561035,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4977,
|
|
"mean_token_accuracy": 0.844001054763794,
|
|
"num_tokens": 276457678.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.8830111902339777,
|
|
"grad_norm": 1.052374005317688,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5299,
|
|
"mean_token_accuracy": 0.8356032371520996,
|
|
"num_tokens": 276610916.0,
|
|
"step": 1736
|
|
},
|
|
{
|
|
"epoch": 0.8835198372329603,
|
|
"grad_norm": 0.9559528827667236,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4894,
|
|
"mean_token_accuracy": 0.8479528427124023,
|
|
"num_tokens": 276764925.0,
|
|
"step": 1737
|
|
},
|
|
{
|
|
"epoch": 0.884028484231943,
|
|
"grad_norm": 1.052488088607788,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5685,
|
|
"mean_token_accuracy": 0.8241016864776611,
|
|
"num_tokens": 276928368.0,
|
|
"step": 1738
|
|
},
|
|
{
|
|
"epoch": 0.8845371312309257,
|
|
"grad_norm": 0.9992378950119019,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4906,
|
|
"mean_token_accuracy": 0.8465350270271301,
|
|
"num_tokens": 277094718.0,
|
|
"step": 1739
|
|
},
|
|
{
|
|
"epoch": 0.8850457782299085,
|
|
"grad_norm": 0.9928638935089111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4951,
|
|
"mean_token_accuracy": 0.8433694839477539,
|
|
"num_tokens": 277246859.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.8855544252288912,
|
|
"grad_norm": 0.923547625541687,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8368440866470337,
|
|
"num_tokens": 277414068.0,
|
|
"step": 1741
|
|
},
|
|
{
|
|
"epoch": 0.8860630722278738,
|
|
"grad_norm": 0.986189603805542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.8419417142868042,
|
|
"num_tokens": 277573295.0,
|
|
"step": 1742
|
|
},
|
|
{
|
|
"epoch": 0.8865717192268565,
|
|
"grad_norm": 1.0184423923492432,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5186,
|
|
"mean_token_accuracy": 0.8393261432647705,
|
|
"num_tokens": 277726913.0,
|
|
"step": 1743
|
|
},
|
|
{
|
|
"epoch": 0.8870803662258393,
|
|
"grad_norm": 0.9636666774749756,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4865,
|
|
"mean_token_accuracy": 0.8466484546661377,
|
|
"num_tokens": 277877252.0,
|
|
"step": 1744
|
|
},
|
|
{
|
|
"epoch": 0.887589013224822,
|
|
"grad_norm": 1.0365499258041382,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5315,
|
|
"mean_token_accuracy": 0.8329277634620667,
|
|
"num_tokens": 278034976.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.8880976602238047,
|
|
"grad_norm": 0.9787980914115906,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.472,
|
|
"mean_token_accuracy": 0.8506600260734558,
|
|
"num_tokens": 278194339.0,
|
|
"step": 1746
|
|
},
|
|
{
|
|
"epoch": 0.8886063072227874,
|
|
"grad_norm": 0.9673038721084595,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5062,
|
|
"mean_token_accuracy": 0.8403258323669434,
|
|
"num_tokens": 278352150.0,
|
|
"step": 1747
|
|
},
|
|
{
|
|
"epoch": 0.8891149542217701,
|
|
"grad_norm": 1.0336358547210693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5287,
|
|
"mean_token_accuracy": 0.8341023921966553,
|
|
"num_tokens": 278516575.0,
|
|
"step": 1748
|
|
},
|
|
{
|
|
"epoch": 0.8896236012207528,
|
|
"grad_norm": 0.9779040813446045,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5282,
|
|
"mean_token_accuracy": 0.8354268670082092,
|
|
"num_tokens": 278687157.0,
|
|
"step": 1749
|
|
},
|
|
{
|
|
"epoch": 0.8901322482197355,
|
|
"grad_norm": 1.0056418180465698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8347800374031067,
|
|
"num_tokens": 278844290.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.8906408952187183,
|
|
"grad_norm": 1.0253515243530273,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5115,
|
|
"mean_token_accuracy": 0.8392736315727234,
|
|
"num_tokens": 279002276.0,
|
|
"step": 1751
|
|
},
|
|
{
|
|
"epoch": 0.8911495422177009,
|
|
"grad_norm": 0.9850518703460693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5256,
|
|
"mean_token_accuracy": 0.8362544775009155,
|
|
"num_tokens": 279161918.0,
|
|
"step": 1752
|
|
},
|
|
{
|
|
"epoch": 0.8916581892166836,
|
|
"grad_norm": 1.0186904668807983,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.524,
|
|
"mean_token_accuracy": 0.8351168036460876,
|
|
"num_tokens": 279321930.0,
|
|
"step": 1753
|
|
},
|
|
{
|
|
"epoch": 0.8921668362156663,
|
|
"grad_norm": 1.041852355003357,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5296,
|
|
"mean_token_accuracy": 0.8372722864151001,
|
|
"num_tokens": 279481295.0,
|
|
"step": 1754
|
|
},
|
|
{
|
|
"epoch": 0.8926754832146491,
|
|
"grad_norm": 0.9848358631134033,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5179,
|
|
"mean_token_accuracy": 0.8382662534713745,
|
|
"num_tokens": 279633071.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.8931841302136317,
|
|
"grad_norm": 1.0203869342803955,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4843,
|
|
"mean_token_accuracy": 0.8476390838623047,
|
|
"num_tokens": 279800329.0,
|
|
"step": 1756
|
|
},
|
|
{
|
|
"epoch": 0.8936927772126144,
|
|
"grad_norm": 0.9973755478858948,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4773,
|
|
"mean_token_accuracy": 0.8494220972061157,
|
|
"num_tokens": 279944707.0,
|
|
"step": 1757
|
|
},
|
|
{
|
|
"epoch": 0.8942014242115972,
|
|
"grad_norm": 0.9391854405403137,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5,
|
|
"mean_token_accuracy": 0.8429989814758301,
|
|
"num_tokens": 280111037.0,
|
|
"step": 1758
|
|
},
|
|
{
|
|
"epoch": 0.8947100712105799,
|
|
"grad_norm": 1.0117913484573364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5004,
|
|
"mean_token_accuracy": 0.8444038033485413,
|
|
"num_tokens": 280258749.0,
|
|
"step": 1759
|
|
},
|
|
{
|
|
"epoch": 0.8952187182095626,
|
|
"grad_norm": 0.9810928702354431,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5124,
|
|
"mean_token_accuracy": 0.8384989500045776,
|
|
"num_tokens": 280405852.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.8957273652085452,
|
|
"grad_norm": 0.9981587529182434,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5108,
|
|
"mean_token_accuracy": 0.8404662609100342,
|
|
"num_tokens": 280580217.0,
|
|
"step": 1761
|
|
},
|
|
{
|
|
"epoch": 0.896236012207528,
|
|
"grad_norm": 1.026110053062439,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5284,
|
|
"mean_token_accuracy": 0.8371784687042236,
|
|
"num_tokens": 280727104.0,
|
|
"step": 1762
|
|
},
|
|
{
|
|
"epoch": 0.8967446592065107,
|
|
"grad_norm": 1.0082656145095825,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.485,
|
|
"mean_token_accuracy": 0.8476098775863647,
|
|
"num_tokens": 280889043.0,
|
|
"step": 1763
|
|
},
|
|
{
|
|
"epoch": 0.8972533062054934,
|
|
"grad_norm": 1.0548858642578125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5011,
|
|
"mean_token_accuracy": 0.8431612849235535,
|
|
"num_tokens": 281043942.0,
|
|
"step": 1764
|
|
},
|
|
{
|
|
"epoch": 0.897761953204476,
|
|
"grad_norm": 1.0609831809997559,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5155,
|
|
"mean_token_accuracy": 0.8397126197814941,
|
|
"num_tokens": 281216118.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.8982706002034588,
|
|
"grad_norm": 1.008461594581604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5126,
|
|
"mean_token_accuracy": 0.8402866125106812,
|
|
"num_tokens": 281374849.0,
|
|
"step": 1766
|
|
},
|
|
{
|
|
"epoch": 0.8987792472024415,
|
|
"grad_norm": 1.030743956565857,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5023,
|
|
"mean_token_accuracy": 0.8423265218734741,
|
|
"num_tokens": 281528478.0,
|
|
"step": 1767
|
|
},
|
|
{
|
|
"epoch": 0.8992878942014242,
|
|
"grad_norm": 1.0897209644317627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.505,
|
|
"mean_token_accuracy": 0.8422761559486389,
|
|
"num_tokens": 281681982.0,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 0.8997965412004069,
|
|
"grad_norm": 0.992885172367096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.492,
|
|
"mean_token_accuracy": 0.8450870513916016,
|
|
"num_tokens": 281846049.0,
|
|
"step": 1769
|
|
},
|
|
{
|
|
"epoch": 0.9003051881993896,
|
|
"grad_norm": 1.098056674003601,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5516,
|
|
"mean_token_accuracy": 0.8283272981643677,
|
|
"num_tokens": 282019096.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.9008138351983723,
|
|
"grad_norm": 1.0340752601623535,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4885,
|
|
"mean_token_accuracy": 0.846498966217041,
|
|
"num_tokens": 282196142.0,
|
|
"step": 1771
|
|
},
|
|
{
|
|
"epoch": 0.901322482197355,
|
|
"grad_norm": 0.9638667702674866,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5213,
|
|
"mean_token_accuracy": 0.8388038277626038,
|
|
"num_tokens": 282360553.0,
|
|
"step": 1772
|
|
},
|
|
{
|
|
"epoch": 0.9018311291963378,
|
|
"grad_norm": 1.0786890983581543,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4915,
|
|
"mean_token_accuracy": 0.8466812372207642,
|
|
"num_tokens": 282527151.0,
|
|
"step": 1773
|
|
},
|
|
{
|
|
"epoch": 0.9023397761953205,
|
|
"grad_norm": 1.0417206287384033,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5355,
|
|
"mean_token_accuracy": 0.8339139223098755,
|
|
"num_tokens": 282674995.0,
|
|
"step": 1774
|
|
},
|
|
{
|
|
"epoch": 0.9028484231943031,
|
|
"grad_norm": 1.0728365182876587,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4639,
|
|
"mean_token_accuracy": 0.852975070476532,
|
|
"num_tokens": 282832990.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.9033570701932858,
|
|
"grad_norm": 1.0209739208221436,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5316,
|
|
"mean_token_accuracy": 0.8355213403701782,
|
|
"num_tokens": 282997592.0,
|
|
"step": 1776
|
|
},
|
|
{
|
|
"epoch": 0.9038657171922686,
|
|
"grad_norm": 1.114395022392273,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5097,
|
|
"mean_token_accuracy": 0.8406573534011841,
|
|
"num_tokens": 283155838.0,
|
|
"step": 1777
|
|
},
|
|
{
|
|
"epoch": 0.9043743641912513,
|
|
"grad_norm": 0.9711954593658447,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4673,
|
|
"mean_token_accuracy": 0.8512140512466431,
|
|
"num_tokens": 283307527.0,
|
|
"step": 1778
|
|
},
|
|
{
|
|
"epoch": 0.904883011190234,
|
|
"grad_norm": 0.9695219397544861,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5267,
|
|
"mean_token_accuracy": 0.8353557586669922,
|
|
"num_tokens": 283465966.0,
|
|
"step": 1779
|
|
},
|
|
{
|
|
"epoch": 0.9053916581892166,
|
|
"grad_norm": 1.0465660095214844,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5272,
|
|
"mean_token_accuracy": 0.8351324796676636,
|
|
"num_tokens": 283620377.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.9059003051881994,
|
|
"grad_norm": 0.9748997688293457,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4877,
|
|
"mean_token_accuracy": 0.8453119993209839,
|
|
"num_tokens": 283777695.0,
|
|
"step": 1781
|
|
},
|
|
{
|
|
"epoch": 0.9064089521871821,
|
|
"grad_norm": 1.0441025495529175,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5047,
|
|
"mean_token_accuracy": 0.8432708978652954,
|
|
"num_tokens": 283931044.0,
|
|
"step": 1782
|
|
},
|
|
{
|
|
"epoch": 0.9069175991861648,
|
|
"grad_norm": 1.090386152267456,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4958,
|
|
"mean_token_accuracy": 0.8454595804214478,
|
|
"num_tokens": 284089124.0,
|
|
"step": 1783
|
|
},
|
|
{
|
|
"epoch": 0.9074262461851476,
|
|
"grad_norm": 0.9559053182601929,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8353322744369507,
|
|
"num_tokens": 284248921.0,
|
|
"step": 1784
|
|
},
|
|
{
|
|
"epoch": 0.9079348931841302,
|
|
"grad_norm": 1.0538196563720703,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4949,
|
|
"mean_token_accuracy": 0.8436644673347473,
|
|
"num_tokens": 284406295.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.9084435401831129,
|
|
"grad_norm": 1.0404249429702759,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4647,
|
|
"mean_token_accuracy": 0.8512811660766602,
|
|
"num_tokens": 284570062.0,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"epoch": 0.9089521871820956,
|
|
"grad_norm": 1.0365242958068848,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5121,
|
|
"mean_token_accuracy": 0.8398833274841309,
|
|
"num_tokens": 284731919.0,
|
|
"step": 1787
|
|
},
|
|
{
|
|
"epoch": 0.9094608341810784,
|
|
"grad_norm": 0.9548749923706055,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5237,
|
|
"mean_token_accuracy": 0.8365575075149536,
|
|
"num_tokens": 284892788.0,
|
|
"step": 1788
|
|
},
|
|
{
|
|
"epoch": 0.909969481180061,
|
|
"grad_norm": 1.0997098684310913,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5303,
|
|
"mean_token_accuracy": 0.8356540203094482,
|
|
"num_tokens": 285053351.0,
|
|
"step": 1789
|
|
},
|
|
{
|
|
"epoch": 0.9104781281790437,
|
|
"grad_norm": 0.9898800849914551,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5156,
|
|
"mean_token_accuracy": 0.8391681909561157,
|
|
"num_tokens": 285203700.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.9109867751780264,
|
|
"grad_norm": 0.9842552542686462,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8393641710281372,
|
|
"num_tokens": 285375982.0,
|
|
"step": 1791
|
|
},
|
|
{
|
|
"epoch": 0.9114954221770092,
|
|
"grad_norm": 0.9427465796470642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5069,
|
|
"mean_token_accuracy": 0.8418722748756409,
|
|
"num_tokens": 285550546.0,
|
|
"step": 1792
|
|
},
|
|
{
|
|
"epoch": 0.9120040691759919,
|
|
"grad_norm": 1.0155222415924072,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.517,
|
|
"mean_token_accuracy": 0.8391909003257751,
|
|
"num_tokens": 285699042.0,
|
|
"step": 1793
|
|
},
|
|
{
|
|
"epoch": 0.9125127161749745,
|
|
"grad_norm": 1.0051095485687256,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5109,
|
|
"mean_token_accuracy": 0.8380466103553772,
|
|
"num_tokens": 285845604.0,
|
|
"step": 1794
|
|
},
|
|
{
|
|
"epoch": 0.9130213631739573,
|
|
"grad_norm": 1.6794397830963135,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5147,
|
|
"mean_token_accuracy": 0.8399105668067932,
|
|
"num_tokens": 285989582.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.91353001017294,
|
|
"grad_norm": 1.0809688568115234,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5238,
|
|
"mean_token_accuracy": 0.8375011682510376,
|
|
"num_tokens": 286144853.0,
|
|
"step": 1796
|
|
},
|
|
{
|
|
"epoch": 0.9140386571719227,
|
|
"grad_norm": 1.0487751960754395,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5093,
|
|
"mean_token_accuracy": 0.8402568697929382,
|
|
"num_tokens": 286306136.0,
|
|
"step": 1797
|
|
},
|
|
{
|
|
"epoch": 0.9145473041709054,
|
|
"grad_norm": 0.9961705803871155,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5113,
|
|
"mean_token_accuracy": 0.8373109698295593,
|
|
"num_tokens": 286458777.0,
|
|
"step": 1798
|
|
},
|
|
{
|
|
"epoch": 0.9150559511698881,
|
|
"grad_norm": 0.9926751255989075,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5173,
|
|
"mean_token_accuracy": 0.8379104137420654,
|
|
"num_tokens": 286618373.0,
|
|
"step": 1799
|
|
},
|
|
{
|
|
"epoch": 0.9155645981688708,
|
|
"grad_norm": 1.060032606124878,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5038,
|
|
"mean_token_accuracy": 0.8424394130706787,
|
|
"num_tokens": 286774374.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.9160732451678535,
|
|
"grad_norm": 0.9973428249359131,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4875,
|
|
"mean_token_accuracy": 0.8459199666976929,
|
|
"num_tokens": 286926506.0,
|
|
"step": 1801
|
|
},
|
|
{
|
|
"epoch": 0.9165818921668362,
|
|
"grad_norm": 1.0062426328659058,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5365,
|
|
"mean_token_accuracy": 0.832974374294281,
|
|
"num_tokens": 287085779.0,
|
|
"step": 1802
|
|
},
|
|
{
|
|
"epoch": 0.917090539165819,
|
|
"grad_norm": 1.040196418762207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5065,
|
|
"mean_token_accuracy": 0.8405765295028687,
|
|
"num_tokens": 287253165.0,
|
|
"step": 1803
|
|
},
|
|
{
|
|
"epoch": 0.9175991861648016,
|
|
"grad_norm": 0.997842013835907,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.8338080048561096,
|
|
"num_tokens": 287405700.0,
|
|
"step": 1804
|
|
},
|
|
{
|
|
"epoch": 0.9181078331637843,
|
|
"grad_norm": 0.9390808343887329,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4715,
|
|
"mean_token_accuracy": 0.8508753180503845,
|
|
"num_tokens": 287551897.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.9186164801627671,
|
|
"grad_norm": 1.057931900024414,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5091,
|
|
"mean_token_accuracy": 0.839033842086792,
|
|
"num_tokens": 287716716.0,
|
|
"step": 1806
|
|
},
|
|
{
|
|
"epoch": 0.9191251271617498,
|
|
"grad_norm": 1.0124062299728394,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5435,
|
|
"mean_token_accuracy": 0.8313062787055969,
|
|
"num_tokens": 287885440.0,
|
|
"step": 1807
|
|
},
|
|
{
|
|
"epoch": 0.9196337741607324,
|
|
"grad_norm": 0.9651282429695129,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4837,
|
|
"mean_token_accuracy": 0.846926212310791,
|
|
"num_tokens": 288040043.0,
|
|
"step": 1808
|
|
},
|
|
{
|
|
"epoch": 0.9201424211597151,
|
|
"grad_norm": 1.0314844846725464,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4862,
|
|
"mean_token_accuracy": 0.847362220287323,
|
|
"num_tokens": 288190884.0,
|
|
"step": 1809
|
|
},
|
|
{
|
|
"epoch": 0.9206510681586979,
|
|
"grad_norm": 1.0677003860473633,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5356,
|
|
"mean_token_accuracy": 0.832841157913208,
|
|
"num_tokens": 288352383.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.9211597151576806,
|
|
"grad_norm": 0.9072279930114746,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4695,
|
|
"mean_token_accuracy": 0.8505159616470337,
|
|
"num_tokens": 288513226.0,
|
|
"step": 1811
|
|
},
|
|
{
|
|
"epoch": 0.9216683621566633,
|
|
"grad_norm": 1.0185366868972778,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.836707353591919,
|
|
"num_tokens": 288681122.0,
|
|
"step": 1812
|
|
},
|
|
{
|
|
"epoch": 0.9221770091556459,
|
|
"grad_norm": 0.9919898509979248,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8353002071380615,
|
|
"num_tokens": 288840955.0,
|
|
"step": 1813
|
|
},
|
|
{
|
|
"epoch": 0.9226856561546287,
|
|
"grad_norm": 0.9551877975463867,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5247,
|
|
"mean_token_accuracy": 0.8349590301513672,
|
|
"num_tokens": 289007588.0,
|
|
"step": 1814
|
|
},
|
|
{
|
|
"epoch": 0.9231943031536114,
|
|
"grad_norm": 1.023133635520935,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.506,
|
|
"mean_token_accuracy": 0.8420091867446899,
|
|
"num_tokens": 289171963.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.9237029501525941,
|
|
"grad_norm": 0.9281512498855591,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4984,
|
|
"mean_token_accuracy": 0.8434433937072754,
|
|
"num_tokens": 289338174.0,
|
|
"step": 1816
|
|
},
|
|
{
|
|
"epoch": 0.9242115971515769,
|
|
"grad_norm": 0.9637218117713928,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4985,
|
|
"mean_token_accuracy": 0.8432784080505371,
|
|
"num_tokens": 289498369.0,
|
|
"step": 1817
|
|
},
|
|
{
|
|
"epoch": 0.9247202441505595,
|
|
"grad_norm": 1.0020900964736938,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4979,
|
|
"mean_token_accuracy": 0.8426657915115356,
|
|
"num_tokens": 289657315.0,
|
|
"step": 1818
|
|
},
|
|
{
|
|
"epoch": 0.9252288911495422,
|
|
"grad_norm": 0.9804989695549011,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.494,
|
|
"mean_token_accuracy": 0.8450539708137512,
|
|
"num_tokens": 289824552.0,
|
|
"step": 1819
|
|
},
|
|
{
|
|
"epoch": 0.9257375381485249,
|
|
"grad_norm": 0.9363784790039062,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.511,
|
|
"mean_token_accuracy": 0.8384186029434204,
|
|
"num_tokens": 289982984.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.9262461851475077,
|
|
"grad_norm": 0.9209647178649902,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4967,
|
|
"mean_token_accuracy": 0.8442416787147522,
|
|
"num_tokens": 290127147.0,
|
|
"step": 1821
|
|
},
|
|
{
|
|
"epoch": 0.9267548321464903,
|
|
"grad_norm": 0.9212963581085205,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5008,
|
|
"mean_token_accuracy": 0.8438435792922974,
|
|
"num_tokens": 290298649.0,
|
|
"step": 1822
|
|
},
|
|
{
|
|
"epoch": 0.927263479145473,
|
|
"grad_norm": 0.9627429842948914,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5277,
|
|
"mean_token_accuracy": 0.8364180326461792,
|
|
"num_tokens": 290460355.0,
|
|
"step": 1823
|
|
},
|
|
{
|
|
"epoch": 0.9277721261444557,
|
|
"grad_norm": 0.890675961971283,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4794,
|
|
"mean_token_accuracy": 0.8487001657485962,
|
|
"num_tokens": 290617255.0,
|
|
"step": 1824
|
|
},
|
|
{
|
|
"epoch": 0.9282807731434385,
|
|
"grad_norm": 1.059567928314209,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5145,
|
|
"mean_token_accuracy": 0.8380982875823975,
|
|
"num_tokens": 290773839.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.9287894201424212,
|
|
"grad_norm": 0.9610816836357117,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4843,
|
|
"mean_token_accuracy": 0.8474520444869995,
|
|
"num_tokens": 290937706.0,
|
|
"step": 1826
|
|
},
|
|
{
|
|
"epoch": 0.9292980671414038,
|
|
"grad_norm": 0.9560855627059937,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5022,
|
|
"mean_token_accuracy": 0.8428947329521179,
|
|
"num_tokens": 291095257.0,
|
|
"step": 1827
|
|
},
|
|
{
|
|
"epoch": 0.9298067141403866,
|
|
"grad_norm": 1.0478545427322388,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8372861742973328,
|
|
"num_tokens": 291248241.0,
|
|
"step": 1828
|
|
},
|
|
{
|
|
"epoch": 0.9303153611393693,
|
|
"grad_norm": 0.9691060185432434,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.539,
|
|
"mean_token_accuracy": 0.8321580290794373,
|
|
"num_tokens": 291408732.0,
|
|
"step": 1829
|
|
},
|
|
{
|
|
"epoch": 0.930824008138352,
|
|
"grad_norm": 0.9107898473739624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4966,
|
|
"mean_token_accuracy": 0.8426162600517273,
|
|
"num_tokens": 291586275.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.9313326551373347,
|
|
"grad_norm": 0.9577358365058899,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4676,
|
|
"mean_token_accuracy": 0.8520687818527222,
|
|
"num_tokens": 291748574.0,
|
|
"step": 1831
|
|
},
|
|
{
|
|
"epoch": 0.9318413021363174,
|
|
"grad_norm": 1.018463134765625,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5156,
|
|
"mean_token_accuracy": 0.8389250040054321,
|
|
"num_tokens": 291904561.0,
|
|
"step": 1832
|
|
},
|
|
{
|
|
"epoch": 0.9323499491353001,
|
|
"grad_norm": 1.0342974662780762,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5027,
|
|
"mean_token_accuracy": 0.8422161340713501,
|
|
"num_tokens": 292063897.0,
|
|
"step": 1833
|
|
},
|
|
{
|
|
"epoch": 0.9328585961342828,
|
|
"grad_norm": 0.9696241021156311,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.839712381362915,
|
|
"num_tokens": 292214056.0,
|
|
"step": 1834
|
|
},
|
|
{
|
|
"epoch": 0.9333672431332655,
|
|
"grad_norm": 1.0362638235092163,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5214,
|
|
"mean_token_accuracy": 0.8358685970306396,
|
|
"num_tokens": 292349124.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.9338758901322483,
|
|
"grad_norm": 0.9768627882003784,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4951,
|
|
"mean_token_accuracy": 0.8442312479019165,
|
|
"num_tokens": 292505385.0,
|
|
"step": 1836
|
|
},
|
|
{
|
|
"epoch": 0.9343845371312309,
|
|
"grad_norm": 0.9081904292106628,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5137,
|
|
"mean_token_accuracy": 0.8416123390197754,
|
|
"num_tokens": 292665526.0,
|
|
"step": 1837
|
|
},
|
|
{
|
|
"epoch": 0.9348931841302136,
|
|
"grad_norm": 0.9557290077209473,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5307,
|
|
"mean_token_accuracy": 0.8357499241828918,
|
|
"num_tokens": 292829795.0,
|
|
"step": 1838
|
|
},
|
|
{
|
|
"epoch": 0.9354018311291964,
|
|
"grad_norm": 1.004783034324646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4956,
|
|
"mean_token_accuracy": 0.8466652035713196,
|
|
"num_tokens": 292987496.0,
|
|
"step": 1839
|
|
},
|
|
{
|
|
"epoch": 0.9359104781281791,
|
|
"grad_norm": 0.9559733271598816,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4946,
|
|
"mean_token_accuracy": 0.8437668681144714,
|
|
"num_tokens": 293144553.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.9364191251271617,
|
|
"grad_norm": 0.9844328165054321,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5388,
|
|
"mean_token_accuracy": 0.8340772390365601,
|
|
"num_tokens": 293302965.0,
|
|
"step": 1841
|
|
},
|
|
{
|
|
"epoch": 0.9369277721261444,
|
|
"grad_norm": 0.9541322588920593,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4913,
|
|
"mean_token_accuracy": 0.8453675508499146,
|
|
"num_tokens": 293463953.0,
|
|
"step": 1842
|
|
},
|
|
{
|
|
"epoch": 0.9374364191251272,
|
|
"grad_norm": 0.9606726169586182,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4968,
|
|
"mean_token_accuracy": 0.8437399864196777,
|
|
"num_tokens": 293629039.0,
|
|
"step": 1843
|
|
},
|
|
{
|
|
"epoch": 0.9379450661241099,
|
|
"grad_norm": 0.9770108461380005,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4542,
|
|
"mean_token_accuracy": 0.8554556369781494,
|
|
"num_tokens": 293773864.0,
|
|
"step": 1844
|
|
},
|
|
{
|
|
"epoch": 0.9384537131230926,
|
|
"grad_norm": 0.980654776096344,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5014,
|
|
"mean_token_accuracy": 0.8446040153503418,
|
|
"num_tokens": 293937000.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.9389623601220752,
|
|
"grad_norm": 0.9239630103111267,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.478,
|
|
"mean_token_accuracy": 0.8489571213722229,
|
|
"num_tokens": 294103688.0,
|
|
"step": 1846
|
|
},
|
|
{
|
|
"epoch": 0.939471007121058,
|
|
"grad_norm": 0.8894424438476562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4892,
|
|
"mean_token_accuracy": 0.8482279181480408,
|
|
"num_tokens": 294266886.0,
|
|
"step": 1847
|
|
},
|
|
{
|
|
"epoch": 0.9399796541200407,
|
|
"grad_norm": 0.985583484172821,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4722,
|
|
"mean_token_accuracy": 0.8499586582183838,
|
|
"num_tokens": 294425536.0,
|
|
"step": 1848
|
|
},
|
|
{
|
|
"epoch": 0.9404883011190234,
|
|
"grad_norm": 0.9692992568016052,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5176,
|
|
"mean_token_accuracy": 0.8396903276443481,
|
|
"num_tokens": 294590856.0,
|
|
"step": 1849
|
|
},
|
|
{
|
|
"epoch": 0.940996948118006,
|
|
"grad_norm": 0.994042158126831,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5046,
|
|
"mean_token_accuracy": 0.8418453931808472,
|
|
"num_tokens": 294743889.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.9415055951169888,
|
|
"grad_norm": 0.9837060570716858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4981,
|
|
"mean_token_accuracy": 0.8423642516136169,
|
|
"num_tokens": 294899666.0,
|
|
"step": 1851
|
|
},
|
|
{
|
|
"epoch": 0.9420142421159715,
|
|
"grad_norm": 0.9281747341156006,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5526,
|
|
"mean_token_accuracy": 0.8312216997146606,
|
|
"num_tokens": 295078964.0,
|
|
"step": 1852
|
|
},
|
|
{
|
|
"epoch": 0.9425228891149542,
|
|
"grad_norm": 0.9872815608978271,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4768,
|
|
"mean_token_accuracy": 0.8486718535423279,
|
|
"num_tokens": 295221135.0,
|
|
"step": 1853
|
|
},
|
|
{
|
|
"epoch": 0.943031536113937,
|
|
"grad_norm": 0.9974026679992676,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4907,
|
|
"mean_token_accuracy": 0.8463758230209351,
|
|
"num_tokens": 295382594.0,
|
|
"step": 1854
|
|
},
|
|
{
|
|
"epoch": 0.9435401831129197,
|
|
"grad_norm": 0.9670667052268982,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8365948796272278,
|
|
"num_tokens": 295541772.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.9440488301119023,
|
|
"grad_norm": 0.97458815574646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4863,
|
|
"mean_token_accuracy": 0.8459551334381104,
|
|
"num_tokens": 295694723.0,
|
|
"step": 1856
|
|
},
|
|
{
|
|
"epoch": 0.944557477110885,
|
|
"grad_norm": 0.9388545155525208,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4818,
|
|
"mean_token_accuracy": 0.8488112688064575,
|
|
"num_tokens": 295848132.0,
|
|
"step": 1857
|
|
},
|
|
{
|
|
"epoch": 0.9450661241098678,
|
|
"grad_norm": 1.0486011505126953,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4996,
|
|
"mean_token_accuracy": 0.8439739942550659,
|
|
"num_tokens": 295998034.0,
|
|
"step": 1858
|
|
},
|
|
{
|
|
"epoch": 0.9455747711088505,
|
|
"grad_norm": 0.9702494740486145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5149,
|
|
"mean_token_accuracy": 0.8386168479919434,
|
|
"num_tokens": 296142665.0,
|
|
"step": 1859
|
|
},
|
|
{
|
|
"epoch": 0.9460834181078331,
|
|
"grad_norm": 0.9410427212715149,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4939,
|
|
"mean_token_accuracy": 0.8444942235946655,
|
|
"num_tokens": 296290081.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.9465920651068158,
|
|
"grad_norm": 0.9935798645019531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5125,
|
|
"mean_token_accuracy": 0.8418990969657898,
|
|
"num_tokens": 296439741.0,
|
|
"step": 1861
|
|
},
|
|
{
|
|
"epoch": 0.9471007121057986,
|
|
"grad_norm": 0.9800233244895935,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4851,
|
|
"mean_token_accuracy": 0.8472626209259033,
|
|
"num_tokens": 296598043.0,
|
|
"step": 1862
|
|
},
|
|
{
|
|
"epoch": 0.9476093591047813,
|
|
"grad_norm": 0.9883062839508057,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4907,
|
|
"mean_token_accuracy": 0.8471593260765076,
|
|
"num_tokens": 296748422.0,
|
|
"step": 1863
|
|
},
|
|
{
|
|
"epoch": 0.948118006103764,
|
|
"grad_norm": 0.9847372174263,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5052,
|
|
"mean_token_accuracy": 0.8409173488616943,
|
|
"num_tokens": 296907598.0,
|
|
"step": 1864
|
|
},
|
|
{
|
|
"epoch": 0.9486266531027467,
|
|
"grad_norm": 1.044905662536621,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5184,
|
|
"mean_token_accuracy": 0.8379148244857788,
|
|
"num_tokens": 297068522.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.9491353001017294,
|
|
"grad_norm": 0.9221831560134888,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4885,
|
|
"mean_token_accuracy": 0.8469454646110535,
|
|
"num_tokens": 297229538.0,
|
|
"step": 1866
|
|
},
|
|
{
|
|
"epoch": 0.9496439471007121,
|
|
"grad_norm": 1.0027371644973755,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5063,
|
|
"mean_token_accuracy": 0.8409550189971924,
|
|
"num_tokens": 297390837.0,
|
|
"step": 1867
|
|
},
|
|
{
|
|
"epoch": 0.9501525940996948,
|
|
"grad_norm": 1.0401782989501953,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5495,
|
|
"mean_token_accuracy": 0.8319090604782104,
|
|
"num_tokens": 297545849.0,
|
|
"step": 1868
|
|
},
|
|
{
|
|
"epoch": 0.9506612410986776,
|
|
"grad_norm": 1.0064892768859863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.494,
|
|
"mean_token_accuracy": 0.8429477214813232,
|
|
"num_tokens": 297702474.0,
|
|
"step": 1869
|
|
},
|
|
{
|
|
"epoch": 0.9511698880976602,
|
|
"grad_norm": 1.0638560056686401,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5411,
|
|
"mean_token_accuracy": 0.8319836854934692,
|
|
"num_tokens": 297857385.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.9516785350966429,
|
|
"grad_norm": 1.0173994302749634,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4934,
|
|
"mean_token_accuracy": 0.8425914645195007,
|
|
"num_tokens": 298007755.0,
|
|
"step": 1871
|
|
},
|
|
{
|
|
"epoch": 0.9521871820956256,
|
|
"grad_norm": 1.0122215747833252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5091,
|
|
"mean_token_accuracy": 0.8413197994232178,
|
|
"num_tokens": 298178437.0,
|
|
"step": 1872
|
|
},
|
|
{
|
|
"epoch": 0.9526958290946084,
|
|
"grad_norm": 1.0996077060699463,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5105,
|
|
"mean_token_accuracy": 0.8416764736175537,
|
|
"num_tokens": 298335117.0,
|
|
"step": 1873
|
|
},
|
|
{
|
|
"epoch": 0.953204476093591,
|
|
"grad_norm": 1.0579203367233276,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4829,
|
|
"mean_token_accuracy": 0.8460375666618347,
|
|
"num_tokens": 298502544.0,
|
|
"step": 1874
|
|
},
|
|
{
|
|
"epoch": 0.9537131230925737,
|
|
"grad_norm": 1.020734429359436,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5059,
|
|
"mean_token_accuracy": 0.8390011787414551,
|
|
"num_tokens": 298660517.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.9542217700915565,
|
|
"grad_norm": 0.9793227314949036,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.52,
|
|
"mean_token_accuracy": 0.8381893038749695,
|
|
"num_tokens": 298813928.0,
|
|
"step": 1876
|
|
},
|
|
{
|
|
"epoch": 0.9547304170905392,
|
|
"grad_norm": 0.9196975827217102,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4868,
|
|
"mean_token_accuracy": 0.845372200012207,
|
|
"num_tokens": 298982892.0,
|
|
"step": 1877
|
|
},
|
|
{
|
|
"epoch": 0.9552390640895219,
|
|
"grad_norm": 0.9534643888473511,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4918,
|
|
"mean_token_accuracy": 0.8440676927566528,
|
|
"num_tokens": 299145054.0,
|
|
"step": 1878
|
|
},
|
|
{
|
|
"epoch": 0.9557477110885045,
|
|
"grad_norm": 0.924911379814148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4938,
|
|
"mean_token_accuracy": 0.8451396226882935,
|
|
"num_tokens": 299310640.0,
|
|
"step": 1879
|
|
},
|
|
{
|
|
"epoch": 0.9562563580874873,
|
|
"grad_norm": 0.9954731464385986,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5094,
|
|
"mean_token_accuracy": 0.8396790027618408,
|
|
"num_tokens": 299484449.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.95676500508647,
|
|
"grad_norm": 1.0057673454284668,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.8367470502853394,
|
|
"num_tokens": 299657556.0,
|
|
"step": 1881
|
|
},
|
|
{
|
|
"epoch": 0.9572736520854527,
|
|
"grad_norm": 0.9887730479240417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5136,
|
|
"mean_token_accuracy": 0.8388510942459106,
|
|
"num_tokens": 299821866.0,
|
|
"step": 1882
|
|
},
|
|
{
|
|
"epoch": 0.9577822990844354,
|
|
"grad_norm": 1.045063853263855,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5199,
|
|
"mean_token_accuracy": 0.8389185070991516,
|
|
"num_tokens": 299975575.0,
|
|
"step": 1883
|
|
},
|
|
{
|
|
"epoch": 0.9582909460834181,
|
|
"grad_norm": 1.1116621494293213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5085,
|
|
"mean_token_accuracy": 0.8415688276290894,
|
|
"num_tokens": 300122149.0,
|
|
"step": 1884
|
|
},
|
|
{
|
|
"epoch": 0.9587995930824008,
|
|
"grad_norm": 0.9680479168891907,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5193,
|
|
"mean_token_accuracy": 0.8371805548667908,
|
|
"num_tokens": 300275629.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.9593082400813835,
|
|
"grad_norm": 0.9897911548614502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5094,
|
|
"mean_token_accuracy": 0.8400824069976807,
|
|
"num_tokens": 300434761.0,
|
|
"step": 1886
|
|
},
|
|
{
|
|
"epoch": 0.9598168870803663,
|
|
"grad_norm": 0.9866281747817993,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5078,
|
|
"mean_token_accuracy": 0.8412807583808899,
|
|
"num_tokens": 300595753.0,
|
|
"step": 1887
|
|
},
|
|
{
|
|
"epoch": 0.960325534079349,
|
|
"grad_norm": 0.9881137013435364,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5187,
|
|
"mean_token_accuracy": 0.8375228047370911,
|
|
"num_tokens": 300761476.0,
|
|
"step": 1888
|
|
},
|
|
{
|
|
"epoch": 0.9608341810783316,
|
|
"grad_norm": 1.011414885520935,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.474,
|
|
"mean_token_accuracy": 0.8508404493331909,
|
|
"num_tokens": 300907255.0,
|
|
"step": 1889
|
|
},
|
|
{
|
|
"epoch": 0.9613428280773143,
|
|
"grad_norm": 0.9575902819633484,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5026,
|
|
"mean_token_accuracy": 0.8420031666755676,
|
|
"num_tokens": 301078714.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.9618514750762971,
|
|
"grad_norm": 0.9903496503829956,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5071,
|
|
"mean_token_accuracy": 0.8426606059074402,
|
|
"num_tokens": 301235054.0,
|
|
"step": 1891
|
|
},
|
|
{
|
|
"epoch": 0.9623601220752798,
|
|
"grad_norm": 0.9883329272270203,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5086,
|
|
"mean_token_accuracy": 0.8412028551101685,
|
|
"num_tokens": 301390887.0,
|
|
"step": 1892
|
|
},
|
|
{
|
|
"epoch": 0.9628687690742624,
|
|
"grad_norm": 0.9743595123291016,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5201,
|
|
"mean_token_accuracy": 0.8380074501037598,
|
|
"num_tokens": 301553051.0,
|
|
"step": 1893
|
|
},
|
|
{
|
|
"epoch": 0.9633774160732451,
|
|
"grad_norm": 0.9916619658470154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4949,
|
|
"mean_token_accuracy": 0.8453688621520996,
|
|
"num_tokens": 301709240.0,
|
|
"step": 1894
|
|
},
|
|
{
|
|
"epoch": 0.9638860630722279,
|
|
"grad_norm": 0.9561258554458618,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5097,
|
|
"mean_token_accuracy": 0.8401749134063721,
|
|
"num_tokens": 301873555.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.9643947100712106,
|
|
"grad_norm": 0.9598016738891602,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5135,
|
|
"mean_token_accuracy": 0.8393949270248413,
|
|
"num_tokens": 302020968.0,
|
|
"step": 1896
|
|
},
|
|
{
|
|
"epoch": 0.9649033570701933,
|
|
"grad_norm": 0.9727898240089417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5428,
|
|
"mean_token_accuracy": 0.8316559791564941,
|
|
"num_tokens": 302178645.0,
|
|
"step": 1897
|
|
},
|
|
{
|
|
"epoch": 0.965412004069176,
|
|
"grad_norm": 0.9923699498176575,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5475,
|
|
"mean_token_accuracy": 0.8295910358428955,
|
|
"num_tokens": 302349196.0,
|
|
"step": 1898
|
|
},
|
|
{
|
|
"epoch": 0.9659206510681587,
|
|
"grad_norm": 0.9970629811286926,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5224,
|
|
"mean_token_accuracy": 0.8365216851234436,
|
|
"num_tokens": 302506292.0,
|
|
"step": 1899
|
|
},
|
|
{
|
|
"epoch": 0.9664292980671414,
|
|
"grad_norm": 1.000089406967163,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5264,
|
|
"mean_token_accuracy": 0.835106372833252,
|
|
"num_tokens": 302661848.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.9669379450661241,
|
|
"grad_norm": 0.961031973361969,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8303220272064209,
|
|
"num_tokens": 302822413.0,
|
|
"step": 1901
|
|
},
|
|
{
|
|
"epoch": 0.9674465920651069,
|
|
"grad_norm": 0.9586144089698792,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.493,
|
|
"mean_token_accuracy": 0.8451566696166992,
|
|
"num_tokens": 302985335.0,
|
|
"step": 1902
|
|
},
|
|
{
|
|
"epoch": 0.9679552390640895,
|
|
"grad_norm": 1.0209102630615234,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4917,
|
|
"mean_token_accuracy": 0.8447892069816589,
|
|
"num_tokens": 303149395.0,
|
|
"step": 1903
|
|
},
|
|
{
|
|
"epoch": 0.9684638860630722,
|
|
"grad_norm": 0.9929437041282654,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4818,
|
|
"mean_token_accuracy": 0.8488729596138,
|
|
"num_tokens": 303312885.0,
|
|
"step": 1904
|
|
},
|
|
{
|
|
"epoch": 0.9689725330620549,
|
|
"grad_norm": 0.9224243760108948,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5192,
|
|
"mean_token_accuracy": 0.8377442955970764,
|
|
"num_tokens": 303478385.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.9694811800610377,
|
|
"grad_norm": 1.0138916969299316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5091,
|
|
"mean_token_accuracy": 0.8416192531585693,
|
|
"num_tokens": 303632397.0,
|
|
"step": 1906
|
|
},
|
|
{
|
|
"epoch": 0.9699898270600203,
|
|
"grad_norm": 0.973189651966095,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5104,
|
|
"mean_token_accuracy": 0.8388887643814087,
|
|
"num_tokens": 303793145.0,
|
|
"step": 1907
|
|
},
|
|
{
|
|
"epoch": 0.970498474059003,
|
|
"grad_norm": 0.9834722280502319,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5302,
|
|
"mean_token_accuracy": 0.8342853784561157,
|
|
"num_tokens": 303954402.0,
|
|
"step": 1908
|
|
},
|
|
{
|
|
"epoch": 0.9710071210579858,
|
|
"grad_norm": 0.998568058013916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8387386202812195,
|
|
"num_tokens": 304115485.0,
|
|
"step": 1909
|
|
},
|
|
{
|
|
"epoch": 0.9715157680569685,
|
|
"grad_norm": 0.9369135499000549,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.8402111530303955,
|
|
"num_tokens": 304285608.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.9720244150559512,
|
|
"grad_norm": 0.9555466771125793,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4608,
|
|
"mean_token_accuracy": 0.8527745008468628,
|
|
"num_tokens": 304444429.0,
|
|
"step": 1911
|
|
},
|
|
{
|
|
"epoch": 0.9725330620549338,
|
|
"grad_norm": 1.0225001573562622,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4925,
|
|
"mean_token_accuracy": 0.8453694581985474,
|
|
"num_tokens": 304604968.0,
|
|
"step": 1912
|
|
},
|
|
{
|
|
"epoch": 0.9730417090539166,
|
|
"grad_norm": 1.0820183753967285,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4846,
|
|
"mean_token_accuracy": 0.8482344746589661,
|
|
"num_tokens": 304762895.0,
|
|
"step": 1913
|
|
},
|
|
{
|
|
"epoch": 0.9735503560528993,
|
|
"grad_norm": 0.9791913628578186,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5053,
|
|
"mean_token_accuracy": 0.8402152061462402,
|
|
"num_tokens": 304919739.0,
|
|
"step": 1914
|
|
},
|
|
{
|
|
"epoch": 0.974059003051882,
|
|
"grad_norm": 0.9272238612174988,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.518,
|
|
"mean_token_accuracy": 0.8391333818435669,
|
|
"num_tokens": 305084650.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.9745676500508647,
|
|
"grad_norm": 0.9650485515594482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4985,
|
|
"mean_token_accuracy": 0.8421435952186584,
|
|
"num_tokens": 305245025.0,
|
|
"step": 1916
|
|
},
|
|
{
|
|
"epoch": 0.9750762970498474,
|
|
"grad_norm": 1.0105266571044922,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4951,
|
|
"mean_token_accuracy": 0.8445926308631897,
|
|
"num_tokens": 305403475.0,
|
|
"step": 1917
|
|
},
|
|
{
|
|
"epoch": 0.9755849440488301,
|
|
"grad_norm": 0.9218766093254089,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.51,
|
|
"mean_token_accuracy": 0.8400081396102905,
|
|
"num_tokens": 305554045.0,
|
|
"step": 1918
|
|
},
|
|
{
|
|
"epoch": 0.9760935910478128,
|
|
"grad_norm": 0.9602965116500854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4853,
|
|
"mean_token_accuracy": 0.8474862575531006,
|
|
"num_tokens": 305712579.0,
|
|
"step": 1919
|
|
},
|
|
{
|
|
"epoch": 0.9766022380467956,
|
|
"grad_norm": 0.9874665141105652,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4895,
|
|
"mean_token_accuracy": 0.8474003076553345,
|
|
"num_tokens": 305880937.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.9771108850457783,
|
|
"grad_norm": 0.9894477725028992,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4939,
|
|
"mean_token_accuracy": 0.8452675342559814,
|
|
"num_tokens": 306037487.0,
|
|
"step": 1921
|
|
},
|
|
{
|
|
"epoch": 0.9776195320447609,
|
|
"grad_norm": 0.9343340396881104,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5048,
|
|
"mean_token_accuracy": 0.842698335647583,
|
|
"num_tokens": 306205585.0,
|
|
"step": 1922
|
|
},
|
|
{
|
|
"epoch": 0.9781281790437436,
|
|
"grad_norm": 1.034570336341858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5167,
|
|
"mean_token_accuracy": 0.8392354249954224,
|
|
"num_tokens": 306363764.0,
|
|
"step": 1923
|
|
},
|
|
{
|
|
"epoch": 0.9786368260427264,
|
|
"grad_norm": 0.9801474213600159,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.496,
|
|
"mean_token_accuracy": 0.8447216749191284,
|
|
"num_tokens": 306521564.0,
|
|
"step": 1924
|
|
},
|
|
{
|
|
"epoch": 0.9791454730417091,
|
|
"grad_norm": 1.0061413049697876,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4963,
|
|
"mean_token_accuracy": 0.8442873954772949,
|
|
"num_tokens": 306674429.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.9796541200406917,
|
|
"grad_norm": 0.9756056070327759,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5349,
|
|
"mean_token_accuracy": 0.8323445320129395,
|
|
"num_tokens": 306832682.0,
|
|
"step": 1926
|
|
},
|
|
{
|
|
"epoch": 0.9801627670396744,
|
|
"grad_norm": 1.1082823276519775,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4761,
|
|
"mean_token_accuracy": 0.8504911065101624,
|
|
"num_tokens": 306990015.0,
|
|
"step": 1927
|
|
},
|
|
{
|
|
"epoch": 0.9806714140386572,
|
|
"grad_norm": 0.9246140122413635,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4983,
|
|
"mean_token_accuracy": 0.8428771495819092,
|
|
"num_tokens": 307155638.0,
|
|
"step": 1928
|
|
},
|
|
{
|
|
"epoch": 0.9811800610376399,
|
|
"grad_norm": 1.0045851469039917,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4776,
|
|
"mean_token_accuracy": 0.8490939140319824,
|
|
"num_tokens": 307317052.0,
|
|
"step": 1929
|
|
},
|
|
{
|
|
"epoch": 0.9816887080366226,
|
|
"grad_norm": 0.9832070469856262,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5067,
|
|
"mean_token_accuracy": 0.8416174054145813,
|
|
"num_tokens": 307461975.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.9821973550356052,
|
|
"grad_norm": 1.0693434476852417,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5424,
|
|
"mean_token_accuracy": 0.8304628729820251,
|
|
"num_tokens": 307618995.0,
|
|
"step": 1931
|
|
},
|
|
{
|
|
"epoch": 0.982706002034588,
|
|
"grad_norm": 0.981465220451355,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4994,
|
|
"mean_token_accuracy": 0.8437796235084534,
|
|
"num_tokens": 307787106.0,
|
|
"step": 1932
|
|
},
|
|
{
|
|
"epoch": 0.9832146490335707,
|
|
"grad_norm": 0.91338050365448,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5084,
|
|
"mean_token_accuracy": 0.841780960559845,
|
|
"num_tokens": 307957991.0,
|
|
"step": 1933
|
|
},
|
|
{
|
|
"epoch": 0.9837232960325534,
|
|
"grad_norm": 1.0736961364746094,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5252,
|
|
"mean_token_accuracy": 0.8364447355270386,
|
|
"num_tokens": 308114670.0,
|
|
"step": 1934
|
|
},
|
|
{
|
|
"epoch": 0.9842319430315362,
|
|
"grad_norm": 0.9931672811508179,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5294,
|
|
"mean_token_accuracy": 0.834403395652771,
|
|
"num_tokens": 308273482.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.9847405900305188,
|
|
"grad_norm": 0.8900210857391357,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5053,
|
|
"mean_token_accuracy": 0.8416975140571594,
|
|
"num_tokens": 308438717.0,
|
|
"step": 1936
|
|
},
|
|
{
|
|
"epoch": 0.9852492370295015,
|
|
"grad_norm": 0.9809801578521729,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4656,
|
|
"mean_token_accuracy": 0.8523440957069397,
|
|
"num_tokens": 308606239.0,
|
|
"step": 1937
|
|
},
|
|
{
|
|
"epoch": 0.9857578840284842,
|
|
"grad_norm": 0.9752235412597656,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.508,
|
|
"mean_token_accuracy": 0.8410700559616089,
|
|
"num_tokens": 308770389.0,
|
|
"step": 1938
|
|
},
|
|
{
|
|
"epoch": 0.986266531027467,
|
|
"grad_norm": 1.01345956325531,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5072,
|
|
"mean_token_accuracy": 0.841428279876709,
|
|
"num_tokens": 308938986.0,
|
|
"step": 1939
|
|
},
|
|
{
|
|
"epoch": 0.9867751780264497,
|
|
"grad_norm": 0.8987012505531311,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4844,
|
|
"mean_token_accuracy": 0.8478468656539917,
|
|
"num_tokens": 309098467.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.9872838250254323,
|
|
"grad_norm": 1.0079503059387207,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5107,
|
|
"mean_token_accuracy": 0.8383113145828247,
|
|
"num_tokens": 309271767.0,
|
|
"step": 1941
|
|
},
|
|
{
|
|
"epoch": 0.987792472024415,
|
|
"grad_norm": 0.9667823314666748,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5268,
|
|
"mean_token_accuracy": 0.8350872993469238,
|
|
"num_tokens": 309430767.0,
|
|
"step": 1942
|
|
},
|
|
{
|
|
"epoch": 0.9883011190233978,
|
|
"grad_norm": 0.9856606125831604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5076,
|
|
"mean_token_accuracy": 0.8395977020263672,
|
|
"num_tokens": 309588008.0,
|
|
"step": 1943
|
|
},
|
|
{
|
|
"epoch": 0.9888097660223805,
|
|
"grad_norm": 0.9772642254829407,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5115,
|
|
"mean_token_accuracy": 0.8399066925048828,
|
|
"num_tokens": 309743337.0,
|
|
"step": 1944
|
|
},
|
|
{
|
|
"epoch": 0.9893184130213631,
|
|
"grad_norm": 1.0005584955215454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5132,
|
|
"mean_token_accuracy": 0.8383961319923401,
|
|
"num_tokens": 309895383.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.9898270600203459,
|
|
"grad_norm": 1.0755404233932495,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8351097106933594,
|
|
"num_tokens": 310051593.0,
|
|
"step": 1946
|
|
},
|
|
{
|
|
"epoch": 0.9903357070193286,
|
|
"grad_norm": 1.0297179222106934,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5187,
|
|
"mean_token_accuracy": 0.8374543786048889,
|
|
"num_tokens": 310222144.0,
|
|
"step": 1947
|
|
},
|
|
{
|
|
"epoch": 0.9908443540183113,
|
|
"grad_norm": 1.331903100013733,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.481,
|
|
"mean_token_accuracy": 0.8481841087341309,
|
|
"num_tokens": 310376369.0,
|
|
"step": 1948
|
|
},
|
|
{
|
|
"epoch": 0.991353001017294,
|
|
"grad_norm": 0.9783390164375305,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5276,
|
|
"mean_token_accuracy": 0.8363245725631714,
|
|
"num_tokens": 310540773.0,
|
|
"step": 1949
|
|
},
|
|
{
|
|
"epoch": 0.9918616480162767,
|
|
"grad_norm": 1.0117331743240356,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5133,
|
|
"mean_token_accuracy": 0.8386774063110352,
|
|
"num_tokens": 310705795.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.9923702950152594,
|
|
"grad_norm": 0.8965991735458374,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4924,
|
|
"mean_token_accuracy": 0.8455886244773865,
|
|
"num_tokens": 310870435.0,
|
|
"step": 1951
|
|
},
|
|
{
|
|
"epoch": 0.9928789420142421,
|
|
"grad_norm": 0.979218065738678,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5277,
|
|
"mean_token_accuracy": 0.8348729610443115,
|
|
"num_tokens": 311023220.0,
|
|
"step": 1952
|
|
},
|
|
{
|
|
"epoch": 0.9933875890132248,
|
|
"grad_norm": 0.9134143590927124,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5089,
|
|
"mean_token_accuracy": 0.8415424823760986,
|
|
"num_tokens": 311192559.0,
|
|
"step": 1953
|
|
},
|
|
{
|
|
"epoch": 0.9938962360122076,
|
|
"grad_norm": 1.0180108547210693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5309,
|
|
"mean_token_accuracy": 0.8341297507286072,
|
|
"num_tokens": 311361294.0,
|
|
"step": 1954
|
|
},
|
|
{
|
|
"epoch": 0.9944048830111902,
|
|
"grad_norm": 0.993411660194397,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5372,
|
|
"mean_token_accuracy": 0.8315683603286743,
|
|
"num_tokens": 311513624.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.9949135300101729,
|
|
"grad_norm": 1.0283094644546509,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.548,
|
|
"mean_token_accuracy": 0.8294710516929626,
|
|
"num_tokens": 311682933.0,
|
|
"step": 1956
|
|
},
|
|
{
|
|
"epoch": 0.9954221770091557,
|
|
"grad_norm": 0.9628696441650391,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5232,
|
|
"mean_token_accuracy": 0.8364568948745728,
|
|
"num_tokens": 311836701.0,
|
|
"step": 1957
|
|
},
|
|
{
|
|
"epoch": 0.9959308240081384,
|
|
"grad_norm": 0.9204580783843994,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4931,
|
|
"mean_token_accuracy": 0.846779465675354,
|
|
"num_tokens": 311997757.0,
|
|
"step": 1958
|
|
},
|
|
{
|
|
"epoch": 0.996439471007121,
|
|
"grad_norm": 0.9796516299247742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5053,
|
|
"mean_token_accuracy": 0.8413169384002686,
|
|
"num_tokens": 312141716.0,
|
|
"step": 1959
|
|
},
|
|
{
|
|
"epoch": 0.9969481180061037,
|
|
"grad_norm": 0.9200966954231262,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5064,
|
|
"mean_token_accuracy": 0.8408148288726807,
|
|
"num_tokens": 312300232.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.9974567650050865,
|
|
"grad_norm": 0.9429312348365784,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5184,
|
|
"mean_token_accuracy": 0.8386069536209106,
|
|
"num_tokens": 312460325.0,
|
|
"step": 1961
|
|
},
|
|
{
|
|
"epoch": 0.9979654120040692,
|
|
"grad_norm": 0.9559454917907715,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8460768461227417,
|
|
"num_tokens": 312617272.0,
|
|
"step": 1962
|
|
},
|
|
{
|
|
"epoch": 0.9984740590030519,
|
|
"grad_norm": 0.8990178108215332,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4999,
|
|
"mean_token_accuracy": 0.8424972891807556,
|
|
"num_tokens": 312782970.0,
|
|
"step": 1963
|
|
},
|
|
{
|
|
"epoch": 0.9989827060020345,
|
|
"grad_norm": 0.9896379709243774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5175,
|
|
"mean_token_accuracy": 0.839756429195404,
|
|
"num_tokens": 312940535.0,
|
|
"step": 1964
|
|
},
|
|
{
|
|
"epoch": 0.9994913530010173,
|
|
"grad_norm": 0.9886254072189331,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.4982,
|
|
"mean_token_accuracy": 0.8438684940338135,
|
|
"num_tokens": 313094817.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.9970649480819702,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.5205,
|
|
"mean_token_accuracy": 0.836739718914032,
|
|
"num_tokens": 313255150.0,
|
|
"step": 1966
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 1966,
|
|
"total_flos": 2.0963795699960381e+18,
|
|
"train_loss": 0.560804831153985,
|
|
"train_runtime": 2871.9626,
|
|
"train_samples_per_second": 43.792,
|
|
"train_steps_per_second": 0.685
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 1966,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 983,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.0963795699960381e+18,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|