5714 lines
148 KiB
JSON
5714 lines
148 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 630,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.004761904761904762,
|
|
"grad_norm": 8.288029670715332,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.7656,
|
|
"mean_token_accuracy": 0.5768666863441467,
|
|
"num_tokens": 582781.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.009523809523809525,
|
|
"grad_norm": 8.260492324829102,
|
|
"learning_rate": 1.5873015873015874e-07,
|
|
"loss": 1.7728,
|
|
"mean_token_accuracy": 0.5752322673797607,
|
|
"num_tokens": 1163696.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.014285714285714285,
|
|
"grad_norm": 8.188252449035645,
|
|
"learning_rate": 3.174603174603175e-07,
|
|
"loss": 1.776,
|
|
"mean_token_accuracy": 0.5746057033538818,
|
|
"num_tokens": 1762000.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.01904761904761905,
|
|
"grad_norm": 8.122298240661621,
|
|
"learning_rate": 4.7619047619047623e-07,
|
|
"loss": 1.7765,
|
|
"mean_token_accuracy": 0.5741599798202515,
|
|
"num_tokens": 2363228.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.023809523809523808,
|
|
"grad_norm": 7.91809606552124,
|
|
"learning_rate": 6.34920634920635e-07,
|
|
"loss": 1.7924,
|
|
"mean_token_accuracy": 0.5723700523376465,
|
|
"num_tokens": 2968748.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.02857142857142857,
|
|
"grad_norm": 7.924537181854248,
|
|
"learning_rate": 7.936507936507937e-07,
|
|
"loss": 1.7649,
|
|
"mean_token_accuracy": 0.5754636526107788,
|
|
"num_tokens": 3564062.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.03333333333333333,
|
|
"grad_norm": 7.629780292510986,
|
|
"learning_rate": 9.523809523809525e-07,
|
|
"loss": 1.7769,
|
|
"mean_token_accuracy": 0.5719509124755859,
|
|
"num_tokens": 4140352.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.0380952380952381,
|
|
"grad_norm": 7.133674621582031,
|
|
"learning_rate": 1.111111111111111e-06,
|
|
"loss": 1.7748,
|
|
"mean_token_accuracy": 0.5719484090805054,
|
|
"num_tokens": 4748067.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.04285714285714286,
|
|
"grad_norm": 6.150221347808838,
|
|
"learning_rate": 1.26984126984127e-06,
|
|
"loss": 1.7288,
|
|
"mean_token_accuracy": 0.5776432156562805,
|
|
"num_tokens": 5333791.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.047619047619047616,
|
|
"grad_norm": 6.026834964752197,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 1.7401,
|
|
"mean_token_accuracy": 0.5752577781677246,
|
|
"num_tokens": 5923564.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.05238095238095238,
|
|
"grad_norm": 5.608363151550293,
|
|
"learning_rate": 1.5873015873015873e-06,
|
|
"loss": 1.7097,
|
|
"mean_token_accuracy": 0.5797863602638245,
|
|
"num_tokens": 6528559.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.05714285714285714,
|
|
"grad_norm": 4.234569072723389,
|
|
"learning_rate": 1.746031746031746e-06,
|
|
"loss": 1.6598,
|
|
"mean_token_accuracy": 0.5850973725318909,
|
|
"num_tokens": 7118765.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.06190476190476191,
|
|
"grad_norm": 4.145053386688232,
|
|
"learning_rate": 1.904761904761905e-06,
|
|
"loss": 1.6597,
|
|
"mean_token_accuracy": 0.5842898488044739,
|
|
"num_tokens": 7709226.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.06666666666666667,
|
|
"grad_norm": 3.9073646068573,
|
|
"learning_rate": 2.0634920634920634e-06,
|
|
"loss": 1.6303,
|
|
"mean_token_accuracy": 0.5906457901000977,
|
|
"num_tokens": 8298984.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.07142857142857142,
|
|
"grad_norm": 3.8127150535583496,
|
|
"learning_rate": 2.222222222222222e-06,
|
|
"loss": 1.6281,
|
|
"mean_token_accuracy": 0.5896565914154053,
|
|
"num_tokens": 8875624.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.0761904761904762,
|
|
"grad_norm": 3.0899341106414795,
|
|
"learning_rate": 2.380952380952381e-06,
|
|
"loss": 1.5687,
|
|
"mean_token_accuracy": 0.5990549325942993,
|
|
"num_tokens": 9448671.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.08095238095238096,
|
|
"grad_norm": 2.755232334136963,
|
|
"learning_rate": 2.53968253968254e-06,
|
|
"loss": 1.5548,
|
|
"mean_token_accuracy": 0.6021129488945007,
|
|
"num_tokens": 10049546.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.08571428571428572,
|
|
"grad_norm": 2.589613914489746,
|
|
"learning_rate": 2.6984126984126986e-06,
|
|
"loss": 1.5609,
|
|
"mean_token_accuracy": 0.5993459820747375,
|
|
"num_tokens": 10644905.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.09047619047619047,
|
|
"grad_norm": 2.2161478996276855,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 1.5541,
|
|
"mean_token_accuracy": 0.6018418073654175,
|
|
"num_tokens": 11239583.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.09523809523809523,
|
|
"grad_norm": 1.9722470045089722,
|
|
"learning_rate": 3.015873015873016e-06,
|
|
"loss": 1.5295,
|
|
"mean_token_accuracy": 0.6070071458816528,
|
|
"num_tokens": 11827320.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 1.8827704191207886,
|
|
"learning_rate": 3.1746031746031746e-06,
|
|
"loss": 1.4814,
|
|
"mean_token_accuracy": 0.6151003837585449,
|
|
"num_tokens": 12425511.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.10476190476190476,
|
|
"grad_norm": 2.351033926010132,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 1.4865,
|
|
"mean_token_accuracy": 0.6138286590576172,
|
|
"num_tokens": 13015708.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.10952380952380952,
|
|
"grad_norm": 2.134150981903076,
|
|
"learning_rate": 3.492063492063492e-06,
|
|
"loss": 1.469,
|
|
"mean_token_accuracy": 0.6165286302566528,
|
|
"num_tokens": 13608875.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.11428571428571428,
|
|
"grad_norm": 1.9380258321762085,
|
|
"learning_rate": 3.6507936507936507e-06,
|
|
"loss": 1.476,
|
|
"mean_token_accuracy": 0.6141604781150818,
|
|
"num_tokens": 14204297.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.11904761904761904,
|
|
"grad_norm": 1.656062364578247,
|
|
"learning_rate": 3.80952380952381e-06,
|
|
"loss": 1.461,
|
|
"mean_token_accuracy": 0.6166412830352783,
|
|
"num_tokens": 14782206.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.12380952380952381,
|
|
"grad_norm": 1.3905470371246338,
|
|
"learning_rate": 3.968253968253968e-06,
|
|
"loss": 1.4382,
|
|
"mean_token_accuracy": 0.6210923194885254,
|
|
"num_tokens": 15377829.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.12857142857142856,
|
|
"grad_norm": 1.1439160108566284,
|
|
"learning_rate": 4.126984126984127e-06,
|
|
"loss": 1.4318,
|
|
"mean_token_accuracy": 0.6224101781845093,
|
|
"num_tokens": 15975819.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.13333333333333333,
|
|
"grad_norm": 1.0443707704544067,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 1.4182,
|
|
"mean_token_accuracy": 0.6251046061515808,
|
|
"num_tokens": 16577839.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.1380952380952381,
|
|
"grad_norm": 1.0729820728302002,
|
|
"learning_rate": 4.444444444444444e-06,
|
|
"loss": 1.4116,
|
|
"mean_token_accuracy": 0.6257858276367188,
|
|
"num_tokens": 17164831.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.14285714285714285,
|
|
"grad_norm": 1.1262085437774658,
|
|
"learning_rate": 4.603174603174604e-06,
|
|
"loss": 1.3974,
|
|
"mean_token_accuracy": 0.6290417909622192,
|
|
"num_tokens": 17770476.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.14761904761904762,
|
|
"grad_norm": 1.1004436016082764,
|
|
"learning_rate": 4.761904761904762e-06,
|
|
"loss": 1.383,
|
|
"mean_token_accuracy": 0.6305603981018066,
|
|
"num_tokens": 18360862.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.1523809523809524,
|
|
"grad_norm": 0.9822593927383423,
|
|
"learning_rate": 4.920634920634921e-06,
|
|
"loss": 1.3981,
|
|
"mean_token_accuracy": 0.6271172761917114,
|
|
"num_tokens": 18944338.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.15714285714285714,
|
|
"grad_norm": 0.8572197556495667,
|
|
"learning_rate": 5.07936507936508e-06,
|
|
"loss": 1.3721,
|
|
"mean_token_accuracy": 0.6327400207519531,
|
|
"num_tokens": 19540189.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.1619047619047619,
|
|
"grad_norm": 0.9113824963569641,
|
|
"learning_rate": 5.2380952380952384e-06,
|
|
"loss": 1.3689,
|
|
"mean_token_accuracy": 0.6341559290885925,
|
|
"num_tokens": 20138131.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 0.8736249208450317,
|
|
"learning_rate": 5.396825396825397e-06,
|
|
"loss": 1.3855,
|
|
"mean_token_accuracy": 0.6294394731521606,
|
|
"num_tokens": 20735187.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.17142857142857143,
|
|
"grad_norm": 0.8438997268676758,
|
|
"learning_rate": 5.555555555555557e-06,
|
|
"loss": 1.3614,
|
|
"mean_token_accuracy": 0.6335337162017822,
|
|
"num_tokens": 21316383.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.1761904761904762,
|
|
"grad_norm": 0.7541394233703613,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 1.3378,
|
|
"mean_token_accuracy": 0.6401833295822144,
|
|
"num_tokens": 21910626.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.18095238095238095,
|
|
"grad_norm": 0.697533130645752,
|
|
"learning_rate": 5.873015873015874e-06,
|
|
"loss": 1.3591,
|
|
"mean_token_accuracy": 0.6341187357902527,
|
|
"num_tokens": 22503955.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.18571428571428572,
|
|
"grad_norm": 0.677990734577179,
|
|
"learning_rate": 6.031746031746032e-06,
|
|
"loss": 1.3543,
|
|
"mean_token_accuracy": 0.6353764533996582,
|
|
"num_tokens": 23093310.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.19047619047619047,
|
|
"grad_norm": 0.677953839302063,
|
|
"learning_rate": 6.1904761904761914e-06,
|
|
"loss": 1.3249,
|
|
"mean_token_accuracy": 0.641827404499054,
|
|
"num_tokens": 23681028.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.19523809523809524,
|
|
"grad_norm": 0.6177698969841003,
|
|
"learning_rate": 6.349206349206349e-06,
|
|
"loss": 1.3271,
|
|
"mean_token_accuracy": 0.6412782669067383,
|
|
"num_tokens": 24275532.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 0.6382781267166138,
|
|
"learning_rate": 6.507936507936509e-06,
|
|
"loss": 1.3309,
|
|
"mean_token_accuracy": 0.6407559514045715,
|
|
"num_tokens": 24868054.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.20476190476190476,
|
|
"grad_norm": 0.5981337428092957,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 1.3323,
|
|
"mean_token_accuracy": 0.6397281885147095,
|
|
"num_tokens": 25459842.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.20952380952380953,
|
|
"grad_norm": 0.5885143876075745,
|
|
"learning_rate": 6.825396825396826e-06,
|
|
"loss": 1.339,
|
|
"mean_token_accuracy": 0.6373006105422974,
|
|
"num_tokens": 26051340.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.21428571428571427,
|
|
"grad_norm": 0.5942175984382629,
|
|
"learning_rate": 6.984126984126984e-06,
|
|
"loss": 1.3188,
|
|
"mean_token_accuracy": 0.6426886320114136,
|
|
"num_tokens": 26635240.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.21904761904761905,
|
|
"grad_norm": 0.6174569129943848,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 1.3198,
|
|
"mean_token_accuracy": 0.6419786214828491,
|
|
"num_tokens": 27228570.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.22380952380952382,
|
|
"grad_norm": 0.6012991070747375,
|
|
"learning_rate": 7.301587301587301e-06,
|
|
"loss": 1.3139,
|
|
"mean_token_accuracy": 0.6440544128417969,
|
|
"num_tokens": 27825958.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.22857142857142856,
|
|
"grad_norm": 0.6103922128677368,
|
|
"learning_rate": 7.460317460317461e-06,
|
|
"loss": 1.3076,
|
|
"mean_token_accuracy": 0.6433683037757874,
|
|
"num_tokens": 28418470.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.23333333333333334,
|
|
"grad_norm": 0.6127147674560547,
|
|
"learning_rate": 7.61904761904762e-06,
|
|
"loss": 1.3044,
|
|
"mean_token_accuracy": 0.6449373364448547,
|
|
"num_tokens": 29013060.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.23809523809523808,
|
|
"grad_norm": 0.5933082103729248,
|
|
"learning_rate": 7.77777777777778e-06,
|
|
"loss": 1.3131,
|
|
"mean_token_accuracy": 0.6417987942695618,
|
|
"num_tokens": 29624709.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.24285714285714285,
|
|
"grad_norm": 0.6003814339637756,
|
|
"learning_rate": 7.936507936507936e-06,
|
|
"loss": 1.3056,
|
|
"mean_token_accuracy": 0.6438874006271362,
|
|
"num_tokens": 30227928.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.24761904761904763,
|
|
"grad_norm": 0.5546218156814575,
|
|
"learning_rate": 8.095238095238097e-06,
|
|
"loss": 1.3073,
|
|
"mean_token_accuracy": 0.6426275968551636,
|
|
"num_tokens": 30823383.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.2523809523809524,
|
|
"grad_norm": 0.5813356637954712,
|
|
"learning_rate": 8.253968253968254e-06,
|
|
"loss": 1.2887,
|
|
"mean_token_accuracy": 0.6480042338371277,
|
|
"num_tokens": 31418593.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.2571428571428571,
|
|
"grad_norm": 0.6125403046607971,
|
|
"learning_rate": 8.412698412698414e-06,
|
|
"loss": 1.2812,
|
|
"mean_token_accuracy": 0.6492801904678345,
|
|
"num_tokens": 32008377.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.2619047619047619,
|
|
"grad_norm": 0.6021028757095337,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 1.2881,
|
|
"mean_token_accuracy": 0.6466339230537415,
|
|
"num_tokens": 32600302.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.26666666666666666,
|
|
"grad_norm": 0.5916977524757385,
|
|
"learning_rate": 8.730158730158731e-06,
|
|
"loss": 1.2896,
|
|
"mean_token_accuracy": 0.6466712951660156,
|
|
"num_tokens": 33201147.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.2714285714285714,
|
|
"grad_norm": 0.5573871731758118,
|
|
"learning_rate": 8.888888888888888e-06,
|
|
"loss": 1.269,
|
|
"mean_token_accuracy": 0.6514161229133606,
|
|
"num_tokens": 33790565.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.2761904761904762,
|
|
"grad_norm": 0.6427719593048096,
|
|
"learning_rate": 9.047619047619049e-06,
|
|
"loss": 1.2747,
|
|
"mean_token_accuracy": 0.6507048606872559,
|
|
"num_tokens": 34387187.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.28095238095238095,
|
|
"grad_norm": 0.5992103219032288,
|
|
"learning_rate": 9.206349206349207e-06,
|
|
"loss": 1.2832,
|
|
"mean_token_accuracy": 0.6487317085266113,
|
|
"num_tokens": 35000480.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.2857142857142857,
|
|
"grad_norm": 0.6176905632019043,
|
|
"learning_rate": 9.365079365079366e-06,
|
|
"loss": 1.266,
|
|
"mean_token_accuracy": 0.6526767611503601,
|
|
"num_tokens": 35588577.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.2904761904761905,
|
|
"grad_norm": 0.6162196397781372,
|
|
"learning_rate": 9.523809523809525e-06,
|
|
"loss": 1.2696,
|
|
"mean_token_accuracy": 0.6507794857025146,
|
|
"num_tokens": 36179186.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.29523809523809524,
|
|
"grad_norm": 0.5662937760353088,
|
|
"learning_rate": 9.682539682539683e-06,
|
|
"loss": 1.2769,
|
|
"mean_token_accuracy": 0.6498540639877319,
|
|
"num_tokens": 36787338.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 0.6263328790664673,
|
|
"learning_rate": 9.841269841269842e-06,
|
|
"loss": 1.2659,
|
|
"mean_token_accuracy": 0.6512309908866882,
|
|
"num_tokens": 37376232.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.3047619047619048,
|
|
"grad_norm": 0.5712647438049316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2575,
|
|
"mean_token_accuracy": 0.65373295545578,
|
|
"num_tokens": 37965066.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.30952380952380953,
|
|
"grad_norm": 0.6364603042602539,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2707,
|
|
"mean_token_accuracy": 0.6504393219947815,
|
|
"num_tokens": 38556474.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.3142857142857143,
|
|
"grad_norm": 0.5501719117164612,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2817,
|
|
"mean_token_accuracy": 0.6485756039619446,
|
|
"num_tokens": 39153957.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.319047619047619,
|
|
"grad_norm": 0.6252837777137756,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.269,
|
|
"mean_token_accuracy": 0.6509230136871338,
|
|
"num_tokens": 39743079.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.3238095238095238,
|
|
"grad_norm": 0.635744035243988,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2538,
|
|
"mean_token_accuracy": 0.6549092531204224,
|
|
"num_tokens": 40341422.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.32857142857142857,
|
|
"grad_norm": 0.602989137172699,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2522,
|
|
"mean_token_accuracy": 0.6547552347183228,
|
|
"num_tokens": 40930579.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 0.6224581003189087,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2475,
|
|
"mean_token_accuracy": 0.6561790704727173,
|
|
"num_tokens": 41521392.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.3380952380952381,
|
|
"grad_norm": 0.6388071179389954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2652,
|
|
"mean_token_accuracy": 0.6521209478378296,
|
|
"num_tokens": 42126117.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.34285714285714286,
|
|
"grad_norm": 0.6036304235458374,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2435,
|
|
"mean_token_accuracy": 0.6566687822341919,
|
|
"num_tokens": 42717085.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.3476190476190476,
|
|
"grad_norm": 0.6735650300979614,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2474,
|
|
"mean_token_accuracy": 0.6550711989402771,
|
|
"num_tokens": 43300932.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.3523809523809524,
|
|
"grad_norm": 0.6821399927139282,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2612,
|
|
"mean_token_accuracy": 0.6513347625732422,
|
|
"num_tokens": 43885512.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.35714285714285715,
|
|
"grad_norm": 0.5906922221183777,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2462,
|
|
"mean_token_accuracy": 0.6552602052688599,
|
|
"num_tokens": 44482626.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.3619047619047619,
|
|
"grad_norm": 0.6703640222549438,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2555,
|
|
"mean_token_accuracy": 0.6526749134063721,
|
|
"num_tokens": 45073331.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.36666666666666664,
|
|
"grad_norm": 0.6432617902755737,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2536,
|
|
"mean_token_accuracy": 0.654289186000824,
|
|
"num_tokens": 45683001.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.37142857142857144,
|
|
"grad_norm": 0.5765655040740967,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2571,
|
|
"mean_token_accuracy": 0.6539218425750732,
|
|
"num_tokens": 46280871.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.3761904761904762,
|
|
"grad_norm": 0.6340111494064331,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2372,
|
|
"mean_token_accuracy": 0.6561391353607178,
|
|
"num_tokens": 46860927.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.38095238095238093,
|
|
"grad_norm": 0.6405033469200134,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2526,
|
|
"mean_token_accuracy": 0.6536115407943726,
|
|
"num_tokens": 47450747.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.38571428571428573,
|
|
"grad_norm": 0.5792959332466125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.25,
|
|
"mean_token_accuracy": 0.6553176641464233,
|
|
"num_tokens": 48053355.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.3904761904761905,
|
|
"grad_norm": 0.686775267124176,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2208,
|
|
"mean_token_accuracy": 0.659858226776123,
|
|
"num_tokens": 48654406.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.3952380952380952,
|
|
"grad_norm": 0.6492419838905334,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2283,
|
|
"mean_token_accuracy": 0.6583410501480103,
|
|
"num_tokens": 49253902.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 0.5871007442474365,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2452,
|
|
"mean_token_accuracy": 0.6552358269691467,
|
|
"num_tokens": 49851728.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.40476190476190477,
|
|
"grad_norm": 0.5860946774482727,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2512,
|
|
"mean_token_accuracy": 0.6536369919776917,
|
|
"num_tokens": 50456288.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.4095238095238095,
|
|
"grad_norm": 0.6220575571060181,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2576,
|
|
"mean_token_accuracy": 0.6526967883110046,
|
|
"num_tokens": 51058176.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.4142857142857143,
|
|
"grad_norm": 0.6111760139465332,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2426,
|
|
"mean_token_accuracy": 0.6556516885757446,
|
|
"num_tokens": 51665178.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.41904761904761906,
|
|
"grad_norm": 0.7028889060020447,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2275,
|
|
"mean_token_accuracy": 0.658629298210144,
|
|
"num_tokens": 52237427.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.4238095238095238,
|
|
"grad_norm": 0.6114148497581482,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2493,
|
|
"mean_token_accuracy": 0.6530453562736511,
|
|
"num_tokens": 52850605.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.42857142857142855,
|
|
"grad_norm": 0.6214424967765808,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2107,
|
|
"mean_token_accuracy": 0.6619127988815308,
|
|
"num_tokens": 53435907.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.43333333333333335,
|
|
"grad_norm": 0.6224313378334045,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2479,
|
|
"mean_token_accuracy": 0.6531662344932556,
|
|
"num_tokens": 54032690.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.4380952380952381,
|
|
"grad_norm": 0.5745725035667419,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2339,
|
|
"mean_token_accuracy": 0.6577485799789429,
|
|
"num_tokens": 54631908.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.44285714285714284,
|
|
"grad_norm": 0.6754887104034424,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2274,
|
|
"mean_token_accuracy": 0.6584199666976929,
|
|
"num_tokens": 55218598.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.44761904761904764,
|
|
"grad_norm": 0.6922246813774109,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2513,
|
|
"mean_token_accuracy": 0.6527312397956848,
|
|
"num_tokens": 55814642.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.4523809523809524,
|
|
"grad_norm": 0.5802931189537048,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2231,
|
|
"mean_token_accuracy": 0.6605392694473267,
|
|
"num_tokens": 56410000.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.45714285714285713,
|
|
"grad_norm": 0.7186371088027954,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2325,
|
|
"mean_token_accuracy": 0.6574358940124512,
|
|
"num_tokens": 57001902.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.46190476190476193,
|
|
"grad_norm": 0.5912067294120789,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2413,
|
|
"mean_token_accuracy": 0.6551775336265564,
|
|
"num_tokens": 57612227.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.4666666666666667,
|
|
"grad_norm": 0.7110946774482727,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2272,
|
|
"mean_token_accuracy": 0.6573148965835571,
|
|
"num_tokens": 58198983.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.4714285714285714,
|
|
"grad_norm": 0.703130841255188,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2488,
|
|
"mean_token_accuracy": 0.6536985039710999,
|
|
"num_tokens": 58805739.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.47619047619047616,
|
|
"grad_norm": 0.6474947333335876,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.216,
|
|
"mean_token_accuracy": 0.6609683036804199,
|
|
"num_tokens": 59386596.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.48095238095238096,
|
|
"grad_norm": 0.7493091225624084,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2239,
|
|
"mean_token_accuracy": 0.6587037444114685,
|
|
"num_tokens": 59976677.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.4857142857142857,
|
|
"grad_norm": 0.6101422905921936,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2366,
|
|
"mean_token_accuracy": 0.6560448408126831,
|
|
"num_tokens": 60581023.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.49047619047619045,
|
|
"grad_norm": 0.7304781079292297,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2269,
|
|
"mean_token_accuracy": 0.6589258909225464,
|
|
"num_tokens": 61177587.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.49523809523809526,
|
|
"grad_norm": 0.618215024471283,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2207,
|
|
"mean_token_accuracy": 0.6586862802505493,
|
|
"num_tokens": 61759739.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 0.6789980530738831,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2283,
|
|
"mean_token_accuracy": 0.6580797433853149,
|
|
"num_tokens": 62343623.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.5047619047619047,
|
|
"grad_norm": 0.6834375858306885,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2226,
|
|
"mean_token_accuracy": 0.6588083505630493,
|
|
"num_tokens": 62936609.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.5095238095238095,
|
|
"grad_norm": 0.6128349304199219,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.219,
|
|
"mean_token_accuracy": 0.6602170467376709,
|
|
"num_tokens": 63540035.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.5142857142857142,
|
|
"grad_norm": 0.6424954533576965,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.6583743691444397,
|
|
"num_tokens": 64137406.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.5190476190476191,
|
|
"grad_norm": 0.566566526889801,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2104,
|
|
"mean_token_accuracy": 0.6621809005737305,
|
|
"num_tokens": 64747343.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.5238095238095238,
|
|
"grad_norm": 0.5913292169570923,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.21,
|
|
"mean_token_accuracy": 0.6611165404319763,
|
|
"num_tokens": 65340433.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.5285714285714286,
|
|
"grad_norm": 0.5560601353645325,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2029,
|
|
"mean_token_accuracy": 0.6629985570907593,
|
|
"num_tokens": 65928375.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.5333333333333333,
|
|
"grad_norm": 0.5711589455604553,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2285,
|
|
"mean_token_accuracy": 0.6574028134346008,
|
|
"num_tokens": 66527455.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.5380952380952381,
|
|
"grad_norm": 0.5675383806228638,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2001,
|
|
"mean_token_accuracy": 0.6645528674125671,
|
|
"num_tokens": 67120147.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.5428571428571428,
|
|
"grad_norm": 0.5860258340835571,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2182,
|
|
"mean_token_accuracy": 0.6599565744400024,
|
|
"num_tokens": 67726850.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.5476190476190477,
|
|
"grad_norm": 0.5209094285964966,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2126,
|
|
"mean_token_accuracy": 0.6609143018722534,
|
|
"num_tokens": 68316713.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.5523809523809524,
|
|
"grad_norm": 0.6333171725273132,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2156,
|
|
"mean_token_accuracy": 0.6600525379180908,
|
|
"num_tokens": 68892365.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.5571428571428572,
|
|
"grad_norm": 0.5704973340034485,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2211,
|
|
"mean_token_accuracy": 0.6591875553131104,
|
|
"num_tokens": 69505524.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.5619047619047619,
|
|
"grad_norm": 0.7181419134140015,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2036,
|
|
"mean_token_accuracy": 0.6623135805130005,
|
|
"num_tokens": 70095302.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.5666666666666667,
|
|
"grad_norm": 0.5681948661804199,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.216,
|
|
"mean_token_accuracy": 0.6598063707351685,
|
|
"num_tokens": 70694971.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.5714285714285714,
|
|
"grad_norm": 0.7001712918281555,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2146,
|
|
"mean_token_accuracy": 0.6608985662460327,
|
|
"num_tokens": 71279415.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5761904761904761,
|
|
"grad_norm": 0.6377084255218506,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.209,
|
|
"mean_token_accuracy": 0.6621115207672119,
|
|
"num_tokens": 71869014.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.580952380952381,
|
|
"grad_norm": 0.6364737153053284,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2171,
|
|
"mean_token_accuracy": 0.6591671705245972,
|
|
"num_tokens": 72472715.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.5857142857142857,
|
|
"grad_norm": 0.6466585397720337,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2089,
|
|
"mean_token_accuracy": 0.661442756652832,
|
|
"num_tokens": 73055740.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.5904761904761905,
|
|
"grad_norm": 0.5920109152793884,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1924,
|
|
"mean_token_accuracy": 0.6659133434295654,
|
|
"num_tokens": 73639151.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.5952380952380952,
|
|
"grad_norm": 0.6872738599777222,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2113,
|
|
"mean_token_accuracy": 0.6628360152244568,
|
|
"num_tokens": 74216756.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 0.5881339907646179,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2062,
|
|
"mean_token_accuracy": 0.662140965461731,
|
|
"num_tokens": 74813953.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.6047619047619047,
|
|
"grad_norm": 0.6483287215232849,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2065,
|
|
"mean_token_accuracy": 0.6624675989151001,
|
|
"num_tokens": 75410691.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.6095238095238096,
|
|
"grad_norm": 0.5890834331512451,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2235,
|
|
"mean_token_accuracy": 0.6575560569763184,
|
|
"num_tokens": 75996496.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.6142857142857143,
|
|
"grad_norm": 0.6782101988792419,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.6648662090301514,
|
|
"num_tokens": 76585198.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.6190476190476191,
|
|
"grad_norm": 0.6252265572547913,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1872,
|
|
"mean_token_accuracy": 0.6665824055671692,
|
|
"num_tokens": 77191596.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.6238095238095238,
|
|
"grad_norm": 0.6833210587501526,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2048,
|
|
"mean_token_accuracy": 0.6622829437255859,
|
|
"num_tokens": 77796998.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.6285714285714286,
|
|
"grad_norm": 0.6870852708816528,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2171,
|
|
"mean_token_accuracy": 0.6590390801429749,
|
|
"num_tokens": 78395104.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.6333333333333333,
|
|
"grad_norm": 0.7417638897895813,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2036,
|
|
"mean_token_accuracy": 0.66297847032547,
|
|
"num_tokens": 78988563.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.638095238095238,
|
|
"grad_norm": 0.569595456123352,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2234,
|
|
"mean_token_accuracy": 0.6573336124420166,
|
|
"num_tokens": 79599633.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.6428571428571429,
|
|
"grad_norm": 0.8054560422897339,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2149,
|
|
"mean_token_accuracy": 0.6601018905639648,
|
|
"num_tokens": 80196954.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.6476190476190476,
|
|
"grad_norm": 0.6360299587249756,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2141,
|
|
"mean_token_accuracy": 0.6599046587944031,
|
|
"num_tokens": 80790959.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.6523809523809524,
|
|
"grad_norm": 0.7952516078948975,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2004,
|
|
"mean_token_accuracy": 0.6641189455986023,
|
|
"num_tokens": 81363350.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.6571428571428571,
|
|
"grad_norm": 0.7050403356552124,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2017,
|
|
"mean_token_accuracy": 0.6631975173950195,
|
|
"num_tokens": 81960356.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.6619047619047619,
|
|
"grad_norm": 0.809806227684021,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2119,
|
|
"mean_token_accuracy": 0.6605896353721619,
|
|
"num_tokens": 82573967.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 0.7040579915046692,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1997,
|
|
"mean_token_accuracy": 0.6634917259216309,
|
|
"num_tokens": 83170751.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.6714285714285714,
|
|
"grad_norm": 0.7381901144981384,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1815,
|
|
"mean_token_accuracy": 0.6675858497619629,
|
|
"num_tokens": 83744022.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.6761904761904762,
|
|
"grad_norm": 0.6610327959060669,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2172,
|
|
"mean_token_accuracy": 0.659174382686615,
|
|
"num_tokens": 84336667.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.680952380952381,
|
|
"grad_norm": 0.8185865879058838,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.6634737253189087,
|
|
"num_tokens": 84927599.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.6857142857142857,
|
|
"grad_norm": 0.6603442430496216,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1976,
|
|
"mean_token_accuracy": 0.6643534898757935,
|
|
"num_tokens": 85516121.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.6904761904761905,
|
|
"grad_norm": 0.7519460320472717,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1926,
|
|
"mean_token_accuracy": 0.664984941482544,
|
|
"num_tokens": 86106161.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.6952380952380952,
|
|
"grad_norm": 0.7080089449882507,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2086,
|
|
"mean_token_accuracy": 0.6621935963630676,
|
|
"num_tokens": 86723880.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 0.7303557395935059,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2033,
|
|
"mean_token_accuracy": 0.6634014248847961,
|
|
"num_tokens": 87327542.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.7047619047619048,
|
|
"grad_norm": 0.6376964449882507,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1977,
|
|
"mean_token_accuracy": 0.6633247137069702,
|
|
"num_tokens": 87912557.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.7095238095238096,
|
|
"grad_norm": 0.6810888051986694,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2087,
|
|
"mean_token_accuracy": 0.6617689728736877,
|
|
"num_tokens": 88514499.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.7142857142857143,
|
|
"grad_norm": 0.6272366046905518,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1879,
|
|
"mean_token_accuracy": 0.6662660837173462,
|
|
"num_tokens": 89090412.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.719047619047619,
|
|
"grad_norm": 0.6499550938606262,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1978,
|
|
"mean_token_accuracy": 0.6638685464859009,
|
|
"num_tokens": 89689944.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.7238095238095238,
|
|
"grad_norm": 0.6450507640838623,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2088,
|
|
"mean_token_accuracy": 0.6614329218864441,
|
|
"num_tokens": 90281605.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.7285714285714285,
|
|
"grad_norm": 0.6113287806510925,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2095,
|
|
"mean_token_accuracy": 0.6616454124450684,
|
|
"num_tokens": 90877169.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.7333333333333333,
|
|
"grad_norm": 0.6421619653701782,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2141,
|
|
"mean_token_accuracy": 0.6598343253135681,
|
|
"num_tokens": 91473587.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.7380952380952381,
|
|
"grad_norm": 0.5994828939437866,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2069,
|
|
"mean_token_accuracy": 0.6615887880325317,
|
|
"num_tokens": 92066142.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.7428571428571429,
|
|
"grad_norm": 0.5635871887207031,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1885,
|
|
"mean_token_accuracy": 0.6657248735427856,
|
|
"num_tokens": 92671294.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.7476190476190476,
|
|
"grad_norm": 0.5961142778396606,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1915,
|
|
"mean_token_accuracy": 0.6649054884910583,
|
|
"num_tokens": 93267004.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.7523809523809524,
|
|
"grad_norm": 0.5518187284469604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2093,
|
|
"mean_token_accuracy": 0.6612235307693481,
|
|
"num_tokens": 93865099.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.7571428571428571,
|
|
"grad_norm": 0.6183374524116516,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1825,
|
|
"mean_token_accuracy": 0.6676396131515503,
|
|
"num_tokens": 94449283.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.7619047619047619,
|
|
"grad_norm": 0.5925056338310242,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1927,
|
|
"mean_token_accuracy": 0.6643291711807251,
|
|
"num_tokens": 95037160.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.7666666666666667,
|
|
"grad_norm": 0.6148018836975098,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1761,
|
|
"mean_token_accuracy": 0.6689929962158203,
|
|
"num_tokens": 95620329.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.7714285714285715,
|
|
"grad_norm": 0.6416387557983398,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1978,
|
|
"mean_token_accuracy": 0.6625751256942749,
|
|
"num_tokens": 96202979.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.7761904761904762,
|
|
"grad_norm": 0.5393695831298828,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1918,
|
|
"mean_token_accuracy": 0.665260910987854,
|
|
"num_tokens": 96794135.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.780952380952381,
|
|
"grad_norm": 0.6334103941917419,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1821,
|
|
"mean_token_accuracy": 0.6664952635765076,
|
|
"num_tokens": 97380180.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.7857142857142857,
|
|
"grad_norm": 0.6443802118301392,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2005,
|
|
"mean_token_accuracy": 0.663545548915863,
|
|
"num_tokens": 97979583.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.7904761904761904,
|
|
"grad_norm": 0.6070786714553833,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1818,
|
|
"mean_token_accuracy": 0.6681106686592102,
|
|
"num_tokens": 98573453.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.7952380952380952,
|
|
"grad_norm": 0.5983892679214478,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.189,
|
|
"mean_token_accuracy": 0.6651272177696228,
|
|
"num_tokens": 99162518.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.5511825084686279,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1859,
|
|
"mean_token_accuracy": 0.6656243801116943,
|
|
"num_tokens": 99755688.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.8047619047619048,
|
|
"grad_norm": 0.5612326264381409,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1923,
|
|
"mean_token_accuracy": 0.6645892858505249,
|
|
"num_tokens": 100367122.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.8095238095238095,
|
|
"grad_norm": 0.6149346232414246,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1866,
|
|
"mean_token_accuracy": 0.665177583694458,
|
|
"num_tokens": 100966663.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.8142857142857143,
|
|
"grad_norm": 0.5557584166526794,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1993,
|
|
"mean_token_accuracy": 0.6638921499252319,
|
|
"num_tokens": 101561561.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.819047619047619,
|
|
"grad_norm": 0.6174666285514832,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2058,
|
|
"mean_token_accuracy": 0.6619209051132202,
|
|
"num_tokens": 102150367.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.8238095238095238,
|
|
"grad_norm": 0.6149846911430359,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1956,
|
|
"mean_token_accuracy": 0.6646385788917542,
|
|
"num_tokens": 102744438.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.8285714285714286,
|
|
"grad_norm": 0.6205980777740479,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1944,
|
|
"mean_token_accuracy": 0.6641254425048828,
|
|
"num_tokens": 103336159.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 0.6782044172286987,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1993,
|
|
"mean_token_accuracy": 0.6630405187606812,
|
|
"num_tokens": 103933457.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.8380952380952381,
|
|
"grad_norm": 0.6339226961135864,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.6652607917785645,
|
|
"num_tokens": 104528020.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.8428571428571429,
|
|
"grad_norm": 0.604350209236145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.2142,
|
|
"mean_token_accuracy": 0.6597182750701904,
|
|
"num_tokens": 105126562.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.8476190476190476,
|
|
"grad_norm": 0.5730092525482178,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.6674203872680664,
|
|
"num_tokens": 105730229.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.8523809523809524,
|
|
"grad_norm": 0.6724650263786316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.201,
|
|
"mean_token_accuracy": 0.6622498035430908,
|
|
"num_tokens": 106338239.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.8571428571428571,
|
|
"grad_norm": 0.5882953405380249,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1982,
|
|
"mean_token_accuracy": 0.6630674600601196,
|
|
"num_tokens": 106929782.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.861904761904762,
|
|
"grad_norm": 0.6305244565010071,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1932,
|
|
"mean_token_accuracy": 0.6646133661270142,
|
|
"num_tokens": 107516950.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.8666666666666667,
|
|
"grad_norm": 0.6297836899757385,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1825,
|
|
"mean_token_accuracy": 0.6660134792327881,
|
|
"num_tokens": 108104046.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.8714285714285714,
|
|
"grad_norm": 0.5446469783782959,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1992,
|
|
"mean_token_accuracy": 0.6630533933639526,
|
|
"num_tokens": 108711068.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.8761904761904762,
|
|
"grad_norm": 0.5844411253929138,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1687,
|
|
"mean_token_accuracy": 0.669592022895813,
|
|
"num_tokens": 109294847.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.8809523809523809,
|
|
"grad_norm": 0.6065420508384705,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1886,
|
|
"mean_token_accuracy": 0.664987325668335,
|
|
"num_tokens": 109903424.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.8857142857142857,
|
|
"grad_norm": 0.6002596616744995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1894,
|
|
"mean_token_accuracy": 0.66484135389328,
|
|
"num_tokens": 110515082.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.8904761904761904,
|
|
"grad_norm": 0.5755858421325684,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1887,
|
|
"mean_token_accuracy": 0.6651521325111389,
|
|
"num_tokens": 111105456.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.8952380952380953,
|
|
"grad_norm": 0.6171888709068298,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1893,
|
|
"mean_token_accuracy": 0.6657494306564331,
|
|
"num_tokens": 111699029.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 0.579205334186554,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1659,
|
|
"mean_token_accuracy": 0.6696426272392273,
|
|
"num_tokens": 112280321.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.9047619047619048,
|
|
"grad_norm": 0.6712483167648315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1677,
|
|
"mean_token_accuracy": 0.6694087982177734,
|
|
"num_tokens": 112860009.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.9095238095238095,
|
|
"grad_norm": 0.6215792894363403,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1872,
|
|
"mean_token_accuracy": 0.6649343967437744,
|
|
"num_tokens": 113457303.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.9142857142857143,
|
|
"grad_norm": 0.5627334117889404,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.181,
|
|
"mean_token_accuracy": 0.6672377586364746,
|
|
"num_tokens": 114054977.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.919047619047619,
|
|
"grad_norm": 0.5678215622901917,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.6673398613929749,
|
|
"num_tokens": 114641555.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.9238095238095239,
|
|
"grad_norm": 0.5933332443237305,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1939,
|
|
"mean_token_accuracy": 0.6647536754608154,
|
|
"num_tokens": 115241437.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.9285714285714286,
|
|
"grad_norm": 0.5732199549674988,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1714,
|
|
"mean_token_accuracy": 0.6686159372329712,
|
|
"num_tokens": 115845775.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.9333333333333333,
|
|
"grad_norm": 0.6514256596565247,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1782,
|
|
"mean_token_accuracy": 0.6679466962814331,
|
|
"num_tokens": 116452623.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.9380952380952381,
|
|
"grad_norm": 0.5765755772590637,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1861,
|
|
"mean_token_accuracy": 0.6654437780380249,
|
|
"num_tokens": 117045570.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.9428571428571428,
|
|
"grad_norm": 0.7004836797714233,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1638,
|
|
"mean_token_accuracy": 0.6707776784896851,
|
|
"num_tokens": 117654535.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.9476190476190476,
|
|
"grad_norm": 0.5966997146606445,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1772,
|
|
"mean_token_accuracy": 0.6684892177581787,
|
|
"num_tokens": 118247244.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.9523809523809523,
|
|
"grad_norm": 0.6460300087928772,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1713,
|
|
"mean_token_accuracy": 0.6694802045822144,
|
|
"num_tokens": 118843074.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.9571428571428572,
|
|
"grad_norm": 0.599161684513092,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1712,
|
|
"mean_token_accuracy": 0.6690815687179565,
|
|
"num_tokens": 119445023.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.9619047619047619,
|
|
"grad_norm": 0.6229502558708191,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1864,
|
|
"mean_token_accuracy": 0.6660387516021729,
|
|
"num_tokens": 120045748.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.9666666666666667,
|
|
"grad_norm": 0.6429843306541443,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1785,
|
|
"mean_token_accuracy": 0.6669691205024719,
|
|
"num_tokens": 120635079.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.9714285714285714,
|
|
"grad_norm": 0.6153910756111145,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1791,
|
|
"mean_token_accuracy": 0.66630619764328,
|
|
"num_tokens": 121220486.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.9761904761904762,
|
|
"grad_norm": 0.6496953368186951,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1804,
|
|
"mean_token_accuracy": 0.6666555404663086,
|
|
"num_tokens": 121800676.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.9809523809523809,
|
|
"grad_norm": 0.6011868119239807,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1842,
|
|
"mean_token_accuracy": 0.6658217906951904,
|
|
"num_tokens": 122409399.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.9857142857142858,
|
|
"grad_norm": 0.857315182685852,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1652,
|
|
"mean_token_accuracy": 0.6701173186302185,
|
|
"num_tokens": 123003502.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.9904761904761905,
|
|
"grad_norm": 0.6711968183517456,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1821,
|
|
"mean_token_accuracy": 0.6669960021972656,
|
|
"num_tokens": 123595838.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.9952380952380953,
|
|
"grad_norm": 0.8044399619102478,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1797,
|
|
"mean_token_accuracy": 0.6671728491783142,
|
|
"num_tokens": 124166476.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.724872887134552,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1689,
|
|
"mean_token_accuracy": 0.66896653175354,
|
|
"num_tokens": 124761423.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 1.0047619047619047,
|
|
"grad_norm": 0.7732614278793335,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.176,
|
|
"mean_token_accuracy": 0.6668572425842285,
|
|
"num_tokens": 125364371.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 1.0095238095238095,
|
|
"grad_norm": 0.6983124017715454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1342,
|
|
"mean_token_accuracy": 0.6760746240615845,
|
|
"num_tokens": 125954118.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 1.0142857142857142,
|
|
"grad_norm": 0.6097580790519714,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1398,
|
|
"mean_token_accuracy": 0.6745401620864868,
|
|
"num_tokens": 126544991.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 1.019047619047619,
|
|
"grad_norm": 0.6844852566719055,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1425,
|
|
"mean_token_accuracy": 0.6751389503479004,
|
|
"num_tokens": 127151772.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 1.0238095238095237,
|
|
"grad_norm": 0.7108845114707947,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1472,
|
|
"mean_token_accuracy": 0.6734536290168762,
|
|
"num_tokens": 127762517.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 1.0285714285714285,
|
|
"grad_norm": 0.7051171660423279,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.672438383102417,
|
|
"num_tokens": 128358892.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 1.0333333333333334,
|
|
"grad_norm": 0.742440938949585,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1486,
|
|
"mean_token_accuracy": 0.6727321743965149,
|
|
"num_tokens": 128930309.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 1.0380952380952382,
|
|
"grad_norm": 0.6921288371086121,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1336,
|
|
"mean_token_accuracy": 0.6769453883171082,
|
|
"num_tokens": 129537678.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 1.042857142857143,
|
|
"grad_norm": 0.6531715989112854,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1486,
|
|
"mean_token_accuracy": 0.6732891201972961,
|
|
"num_tokens": 130113717.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 1.0476190476190477,
|
|
"grad_norm": 0.8497748970985413,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1554,
|
|
"mean_token_accuracy": 0.6714987754821777,
|
|
"num_tokens": 130724521.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.0523809523809524,
|
|
"grad_norm": 0.6819850206375122,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1407,
|
|
"mean_token_accuracy": 0.6752928495407104,
|
|
"num_tokens": 131298037.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 1.0571428571428572,
|
|
"grad_norm": 0.785930335521698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1486,
|
|
"mean_token_accuracy": 0.6729685068130493,
|
|
"num_tokens": 131909779.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 1.061904761904762,
|
|
"grad_norm": 0.6023511290550232,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1458,
|
|
"mean_token_accuracy": 0.6734186410903931,
|
|
"num_tokens": 132506621.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 1.0666666666666667,
|
|
"grad_norm": 0.8720818758010864,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1498,
|
|
"mean_token_accuracy": 0.6726520657539368,
|
|
"num_tokens": 133124443.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 1.0714285714285714,
|
|
"grad_norm": 0.6429004073143005,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1608,
|
|
"mean_token_accuracy": 0.6698133945465088,
|
|
"num_tokens": 133719672.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 1.0761904761904761,
|
|
"grad_norm": 0.7744424343109131,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1357,
|
|
"mean_token_accuracy": 0.6747680306434631,
|
|
"num_tokens": 134309852.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 1.0809523809523809,
|
|
"grad_norm": 0.7106124758720398,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1472,
|
|
"mean_token_accuracy": 0.6723679900169373,
|
|
"num_tokens": 134890952.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 1.0857142857142856,
|
|
"grad_norm": 0.8420917987823486,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1144,
|
|
"mean_token_accuracy": 0.6813350915908813,
|
|
"num_tokens": 135479588.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 1.0904761904761904,
|
|
"grad_norm": 0.7307847738265991,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.14,
|
|
"mean_token_accuracy": 0.6748834848403931,
|
|
"num_tokens": 136065836.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 1.0952380952380953,
|
|
"grad_norm": 0.6740959882736206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1377,
|
|
"mean_token_accuracy": 0.6761696934700012,
|
|
"num_tokens": 136668062.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.1,
|
|
"grad_norm": 0.6920994520187378,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1398,
|
|
"mean_token_accuracy": 0.6743276715278625,
|
|
"num_tokens": 137256533.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 1.1047619047619048,
|
|
"grad_norm": 0.6870349645614624,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1459,
|
|
"mean_token_accuracy": 0.6732701063156128,
|
|
"num_tokens": 137848246.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 1.1095238095238096,
|
|
"grad_norm": 0.6535449028015137,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1494,
|
|
"mean_token_accuracy": 0.6729423999786377,
|
|
"num_tokens": 138450870.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 1.1142857142857143,
|
|
"grad_norm": 0.6108024716377258,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1498,
|
|
"mean_token_accuracy": 0.671773374080658,
|
|
"num_tokens": 139048178.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 1.119047619047619,
|
|
"grad_norm": 0.618743360042572,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1394,
|
|
"mean_token_accuracy": 0.6749163866043091,
|
|
"num_tokens": 139647536.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 1.1238095238095238,
|
|
"grad_norm": 0.5873496532440186,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1428,
|
|
"mean_token_accuracy": 0.6742033958435059,
|
|
"num_tokens": 140237569.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 1.1285714285714286,
|
|
"grad_norm": 0.6749809980392456,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1462,
|
|
"mean_token_accuracy": 0.67291659116745,
|
|
"num_tokens": 140808948.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 1.1333333333333333,
|
|
"grad_norm": 0.5988799333572388,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1417,
|
|
"mean_token_accuracy": 0.6734879016876221,
|
|
"num_tokens": 141387906.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 1.138095238095238,
|
|
"grad_norm": 0.7041788697242737,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1421,
|
|
"mean_token_accuracy": 0.6749635934829712,
|
|
"num_tokens": 141991024.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 1.1428571428571428,
|
|
"grad_norm": 0.677106499671936,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.122,
|
|
"mean_token_accuracy": 0.6789741516113281,
|
|
"num_tokens": 142585170.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.1476190476190475,
|
|
"grad_norm": 0.6422439217567444,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1509,
|
|
"mean_token_accuracy": 0.6719658374786377,
|
|
"num_tokens": 143178473.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 1.1523809523809523,
|
|
"grad_norm": 0.6920860409736633,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1511,
|
|
"mean_token_accuracy": 0.6708908677101135,
|
|
"num_tokens": 143782184.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 1.157142857142857,
|
|
"grad_norm": 0.5582302212715149,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1331,
|
|
"mean_token_accuracy": 0.6759682297706604,
|
|
"num_tokens": 144383051.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 1.161904761904762,
|
|
"grad_norm": 0.6627556085586548,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1432,
|
|
"mean_token_accuracy": 0.6744831204414368,
|
|
"num_tokens": 144977872.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 0.5956741571426392,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1456,
|
|
"mean_token_accuracy": 0.6733117699623108,
|
|
"num_tokens": 145573077.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 1.1714285714285715,
|
|
"grad_norm": 0.7862910628318787,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1464,
|
|
"mean_token_accuracy": 0.6739993691444397,
|
|
"num_tokens": 146165107.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 1.1761904761904762,
|
|
"grad_norm": 0.6099702715873718,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1393,
|
|
"mean_token_accuracy": 0.6740779876708984,
|
|
"num_tokens": 146763356.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 1.180952380952381,
|
|
"grad_norm": 0.7584065198898315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.136,
|
|
"mean_token_accuracy": 0.6759775280952454,
|
|
"num_tokens": 147358035.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 1.1857142857142857,
|
|
"grad_norm": 0.6754823327064514,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1523,
|
|
"mean_token_accuracy": 0.6710008978843689,
|
|
"num_tokens": 147955530.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 1.1904761904761905,
|
|
"grad_norm": 0.6045711636543274,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.6728271245956421,
|
|
"num_tokens": 148547950.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.1952380952380952,
|
|
"grad_norm": 0.6770275235176086,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1309,
|
|
"mean_token_accuracy": 0.6762286424636841,
|
|
"num_tokens": 149127280.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 1.2,
|
|
"grad_norm": 0.5667791366577148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1389,
|
|
"mean_token_accuracy": 0.6750789284706116,
|
|
"num_tokens": 149735096.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 1.2047619047619047,
|
|
"grad_norm": 0.6122450232505798,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1423,
|
|
"mean_token_accuracy": 0.6746940612792969,
|
|
"num_tokens": 150338864.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 1.2095238095238094,
|
|
"grad_norm": 0.6596109867095947,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1234,
|
|
"mean_token_accuracy": 0.6786649227142334,
|
|
"num_tokens": 150940365.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 1.2142857142857142,
|
|
"grad_norm": 0.6414262652397156,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1454,
|
|
"mean_token_accuracy": 0.6734991073608398,
|
|
"num_tokens": 151517902.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 1.2190476190476192,
|
|
"grad_norm": 0.7465854287147522,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1225,
|
|
"mean_token_accuracy": 0.6790366172790527,
|
|
"num_tokens": 152093932.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 1.223809523809524,
|
|
"grad_norm": 0.6045883297920227,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1281,
|
|
"mean_token_accuracy": 0.6779497861862183,
|
|
"num_tokens": 152690003.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 1.2285714285714286,
|
|
"grad_norm": 0.7717053890228271,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1305,
|
|
"mean_token_accuracy": 0.6769629716873169,
|
|
"num_tokens": 153278014.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 1.2333333333333334,
|
|
"grad_norm": 0.6217109560966492,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1377,
|
|
"mean_token_accuracy": 0.6756360530853271,
|
|
"num_tokens": 153871781.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 1.2380952380952381,
|
|
"grad_norm": 0.7101379632949829,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1396,
|
|
"mean_token_accuracy": 0.6745343208312988,
|
|
"num_tokens": 154466124.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.2428571428571429,
|
|
"grad_norm": 0.6611591577529907,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1342,
|
|
"mean_token_accuracy": 0.675082802772522,
|
|
"num_tokens": 155073053.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 1.2476190476190476,
|
|
"grad_norm": 0.7041805386543274,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1612,
|
|
"mean_token_accuracy": 0.6703898906707764,
|
|
"num_tokens": 155680694.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 1.2523809523809524,
|
|
"grad_norm": 0.6518973708152771,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1492,
|
|
"mean_token_accuracy": 0.6719495058059692,
|
|
"num_tokens": 156279612.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 1.2571428571428571,
|
|
"grad_norm": 0.6293846368789673,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1381,
|
|
"mean_token_accuracy": 0.6761749982833862,
|
|
"num_tokens": 156898086.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 1.2619047619047619,
|
|
"grad_norm": 0.5713494420051575,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1527,
|
|
"mean_token_accuracy": 0.6716663837432861,
|
|
"num_tokens": 157502996.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 1.2666666666666666,
|
|
"grad_norm": 0.6561734676361084,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1544,
|
|
"mean_token_accuracy": 0.6708611845970154,
|
|
"num_tokens": 158107778.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 1.2714285714285714,
|
|
"grad_norm": 0.5799586772918701,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1177,
|
|
"mean_token_accuracy": 0.6797953844070435,
|
|
"num_tokens": 158713147.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 1.276190476190476,
|
|
"grad_norm": 0.5941030979156494,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1255,
|
|
"mean_token_accuracy": 0.6776763200759888,
|
|
"num_tokens": 159292006.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 1.2809523809523808,
|
|
"grad_norm": 0.6683588624000549,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1234,
|
|
"mean_token_accuracy": 0.6778484582901001,
|
|
"num_tokens": 159889197.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 1.2857142857142856,
|
|
"grad_norm": 0.6561569571495056,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1378,
|
|
"mean_token_accuracy": 0.6750425696372986,
|
|
"num_tokens": 160485304.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.2904761904761906,
|
|
"grad_norm": 0.5719537138938904,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1404,
|
|
"mean_token_accuracy": 0.6747204065322876,
|
|
"num_tokens": 161092433.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 1.2952380952380953,
|
|
"grad_norm": 0.6006868481636047,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1396,
|
|
"mean_token_accuracy": 0.6749382019042969,
|
|
"num_tokens": 161683555.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 1.3,
|
|
"grad_norm": 0.6102608442306519,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1293,
|
|
"mean_token_accuracy": 0.6775893568992615,
|
|
"num_tokens": 162278973.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 1.3047619047619048,
|
|
"grad_norm": 0.6217197179794312,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1366,
|
|
"mean_token_accuracy": 0.6764044165611267,
|
|
"num_tokens": 162885270.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 1.3095238095238095,
|
|
"grad_norm": 0.6187546253204346,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.6765252351760864,
|
|
"num_tokens": 163476311.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 1.3142857142857143,
|
|
"grad_norm": 0.5942601561546326,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1455,
|
|
"mean_token_accuracy": 0.6730477213859558,
|
|
"num_tokens": 164071314.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 1.319047619047619,
|
|
"grad_norm": 0.5942831635475159,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1321,
|
|
"mean_token_accuracy": 0.6764451861381531,
|
|
"num_tokens": 164663415.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 1.3238095238095238,
|
|
"grad_norm": 0.6232311129570007,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1269,
|
|
"mean_token_accuracy": 0.6775949597358704,
|
|
"num_tokens": 165256997.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 1.3285714285714285,
|
|
"grad_norm": 0.6126914024353027,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1317,
|
|
"mean_token_accuracy": 0.676669716835022,
|
|
"num_tokens": 165847922.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 0.6624312400817871,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1337,
|
|
"mean_token_accuracy": 0.6758729815483093,
|
|
"num_tokens": 166444541.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.3380952380952382,
|
|
"grad_norm": 0.6634590029716492,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.6781991124153137,
|
|
"num_tokens": 167028591.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 1.342857142857143,
|
|
"grad_norm": 0.7142046093940735,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1473,
|
|
"mean_token_accuracy": 0.6724534034729004,
|
|
"num_tokens": 167627132.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 1.3476190476190477,
|
|
"grad_norm": 0.5835825800895691,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.119,
|
|
"mean_token_accuracy": 0.6801720857620239,
|
|
"num_tokens": 168226854.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 1.3523809523809525,
|
|
"grad_norm": 0.7441895008087158,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1508,
|
|
"mean_token_accuracy": 0.6721788048744202,
|
|
"num_tokens": 168833732.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 1.3571428571428572,
|
|
"grad_norm": 0.613866925239563,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1263,
|
|
"mean_token_accuracy": 0.6785060167312622,
|
|
"num_tokens": 169426970.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 1.361904761904762,
|
|
"grad_norm": 0.7395045161247253,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1504,
|
|
"mean_token_accuracy": 0.6733224391937256,
|
|
"num_tokens": 170025787.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 1.3666666666666667,
|
|
"grad_norm": 0.7011858224868774,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1457,
|
|
"mean_token_accuracy": 0.6723621487617493,
|
|
"num_tokens": 170621857.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 1.3714285714285714,
|
|
"grad_norm": 0.6301146149635315,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1428,
|
|
"mean_token_accuracy": 0.6736270189285278,
|
|
"num_tokens": 171220708.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 1.3761904761904762,
|
|
"grad_norm": 0.6546505093574524,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1527,
|
|
"mean_token_accuracy": 0.6708388924598694,
|
|
"num_tokens": 171812508.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 1.380952380952381,
|
|
"grad_norm": 0.665846049785614,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1239,
|
|
"mean_token_accuracy": 0.6771635413169861,
|
|
"num_tokens": 172401090.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.3857142857142857,
|
|
"grad_norm": 0.6951489448547363,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1303,
|
|
"mean_token_accuracy": 0.6767557263374329,
|
|
"num_tokens": 172989201.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 1.3904761904761904,
|
|
"grad_norm": 0.6228903532028198,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1316,
|
|
"mean_token_accuracy": 0.6754661798477173,
|
|
"num_tokens": 173563807.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 1.3952380952380952,
|
|
"grad_norm": 0.7011890411376953,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1303,
|
|
"mean_token_accuracy": 0.6768910884857178,
|
|
"num_tokens": 174159574.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 1.4,
|
|
"grad_norm": 0.6298404932022095,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.672224223613739,
|
|
"num_tokens": 174744244.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 1.4047619047619047,
|
|
"grad_norm": 0.6158511638641357,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.6756511926651001,
|
|
"num_tokens": 175341946.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 1.4095238095238094,
|
|
"grad_norm": 0.6887179613113403,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1019,
|
|
"mean_token_accuracy": 0.6828951239585876,
|
|
"num_tokens": 175904117.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 1.4142857142857144,
|
|
"grad_norm": 0.64696204662323,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1307,
|
|
"mean_token_accuracy": 0.6764581799507141,
|
|
"num_tokens": 176493621.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 1.4190476190476191,
|
|
"grad_norm": 0.5804628133773804,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1316,
|
|
"mean_token_accuracy": 0.6758260726928711,
|
|
"num_tokens": 177082157.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 1.4238095238095239,
|
|
"grad_norm": 0.6294459104537964,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1325,
|
|
"mean_token_accuracy": 0.6751164197921753,
|
|
"num_tokens": 177668681.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 1.4285714285714286,
|
|
"grad_norm": 0.617782711982727,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1352,
|
|
"mean_token_accuracy": 0.6748452186584473,
|
|
"num_tokens": 178256283.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.4333333333333333,
|
|
"grad_norm": 0.6512781977653503,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.6721617579460144,
|
|
"num_tokens": 178850673.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 1.438095238095238,
|
|
"grad_norm": 0.5774661898612976,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.6787533760070801,
|
|
"num_tokens": 179457871.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 1.4428571428571428,
|
|
"grad_norm": 0.5992771983146667,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.6706414818763733,
|
|
"num_tokens": 180064071.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 1.4476190476190476,
|
|
"grad_norm": 0.5943005681037903,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1106,
|
|
"mean_token_accuracy": 0.6806790828704834,
|
|
"num_tokens": 180650796.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 1.4523809523809523,
|
|
"grad_norm": 0.6455477476119995,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1409,
|
|
"mean_token_accuracy": 0.6753484606742859,
|
|
"num_tokens": 181246825.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 1.457142857142857,
|
|
"grad_norm": 0.5515779852867126,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1429,
|
|
"mean_token_accuracy": 0.6737354397773743,
|
|
"num_tokens": 181855567.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 1.461904761904762,
|
|
"grad_norm": 0.6088519096374512,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1095,
|
|
"mean_token_accuracy": 0.680343508720398,
|
|
"num_tokens": 182433911.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 1.4666666666666668,
|
|
"grad_norm": 0.6310312747955322,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1307,
|
|
"mean_token_accuracy": 0.676478385925293,
|
|
"num_tokens": 183023144.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 1.4714285714285715,
|
|
"grad_norm": 0.6333861947059631,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1225,
|
|
"mean_token_accuracy": 0.6778949499130249,
|
|
"num_tokens": 183626514.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 1.4761904761904763,
|
|
"grad_norm": 0.6410499811172485,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1284,
|
|
"mean_token_accuracy": 0.6767443418502808,
|
|
"num_tokens": 184221439.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.480952380952381,
|
|
"grad_norm": 0.6700615882873535,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.134,
|
|
"mean_token_accuracy": 0.6758592128753662,
|
|
"num_tokens": 184819506.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 1.4857142857142858,
|
|
"grad_norm": 0.5785894989967346,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1338,
|
|
"mean_token_accuracy": 0.6757279634475708,
|
|
"num_tokens": 185419019.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 1.4904761904761905,
|
|
"grad_norm": 0.6253511309623718,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1212,
|
|
"mean_token_accuracy": 0.6801990270614624,
|
|
"num_tokens": 186010772.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 1.4952380952380953,
|
|
"grad_norm": 0.6034374237060547,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1178,
|
|
"mean_token_accuracy": 0.6792829036712646,
|
|
"num_tokens": 186589243.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 0.6875804662704468,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1165,
|
|
"mean_token_accuracy": 0.6799081563949585,
|
|
"num_tokens": 187182368.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 1.5047619047619047,
|
|
"grad_norm": 0.5927019119262695,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1179,
|
|
"mean_token_accuracy": 0.6792271733283997,
|
|
"num_tokens": 187763428.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 1.5095238095238095,
|
|
"grad_norm": 0.5725839734077454,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1129,
|
|
"mean_token_accuracy": 0.6808658838272095,
|
|
"num_tokens": 188359395.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 1.5142857142857142,
|
|
"grad_norm": 0.6134579181671143,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1329,
|
|
"mean_token_accuracy": 0.6752611398696899,
|
|
"num_tokens": 188952450.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 1.519047619047619,
|
|
"grad_norm": 0.5980193018913269,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1282,
|
|
"mean_token_accuracy": 0.6765316128730774,
|
|
"num_tokens": 189535853.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 1.5238095238095237,
|
|
"grad_norm": 0.6418870091438293,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1113,
|
|
"mean_token_accuracy": 0.6808905601501465,
|
|
"num_tokens": 190127386.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.5285714285714285,
|
|
"grad_norm": 0.5932308435440063,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1282,
|
|
"mean_token_accuracy": 0.6762252449989319,
|
|
"num_tokens": 190718877.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 1.5333333333333332,
|
|
"grad_norm": 0.6508740782737732,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1504,
|
|
"mean_token_accuracy": 0.6717185974121094,
|
|
"num_tokens": 191320553.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 1.538095238095238,
|
|
"grad_norm": 0.6029355525970459,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1219,
|
|
"mean_token_accuracy": 0.6790941953659058,
|
|
"num_tokens": 191911786.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 1.5428571428571427,
|
|
"grad_norm": 0.5820804834365845,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1483,
|
|
"mean_token_accuracy": 0.6729787588119507,
|
|
"num_tokens": 192517254.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 1.5476190476190477,
|
|
"grad_norm": 0.6086446642875671,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1438,
|
|
"mean_token_accuracy": 0.6730492115020752,
|
|
"num_tokens": 193113713.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 1.5523809523809524,
|
|
"grad_norm": 0.6287596821784973,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1255,
|
|
"mean_token_accuracy": 0.6779239177703857,
|
|
"num_tokens": 193718335.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 1.5571428571428572,
|
|
"grad_norm": 0.6495358347892761,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1267,
|
|
"mean_token_accuracy": 0.6764586567878723,
|
|
"num_tokens": 194303328.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 1.561904761904762,
|
|
"grad_norm": 0.6034678816795349,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1204,
|
|
"mean_token_accuracy": 0.6789346933364868,
|
|
"num_tokens": 194886509.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 1.5666666666666667,
|
|
"grad_norm": 0.6537843346595764,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1269,
|
|
"mean_token_accuracy": 0.678215742111206,
|
|
"num_tokens": 195456896.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 1.5714285714285714,
|
|
"grad_norm": 0.5981965661048889,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1237,
|
|
"mean_token_accuracy": 0.6771047115325928,
|
|
"num_tokens": 196053871.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.5761904761904761,
|
|
"grad_norm": 0.7181389331817627,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1236,
|
|
"mean_token_accuracy": 0.6774399280548096,
|
|
"num_tokens": 196654732.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 1.580952380952381,
|
|
"grad_norm": 0.6066569089889526,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1124,
|
|
"mean_token_accuracy": 0.6811067461967468,
|
|
"num_tokens": 197242864.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 1.5857142857142859,
|
|
"grad_norm": 0.7779151797294617,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1153,
|
|
"mean_token_accuracy": 0.6798511743545532,
|
|
"num_tokens": 197840214.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 1.5904761904761906,
|
|
"grad_norm": 0.5971040725708008,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1177,
|
|
"mean_token_accuracy": 0.6795299649238586,
|
|
"num_tokens": 198440572.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 1.5952380952380953,
|
|
"grad_norm": 0.6526306867599487,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1134,
|
|
"mean_token_accuracy": 0.6805366277694702,
|
|
"num_tokens": 199039184.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 1.6,
|
|
"grad_norm": 0.622909426689148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1139,
|
|
"mean_token_accuracy": 0.6792494058609009,
|
|
"num_tokens": 199626548.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 1.6047619047619048,
|
|
"grad_norm": 0.6684408187866211,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.128,
|
|
"mean_token_accuracy": 0.6774076819419861,
|
|
"num_tokens": 200222258.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 1.6095238095238096,
|
|
"grad_norm": 0.5934977531433105,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1203,
|
|
"mean_token_accuracy": 0.6792654991149902,
|
|
"num_tokens": 200819172.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 1.6142857142857143,
|
|
"grad_norm": 0.6164219975471497,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1314,
|
|
"mean_token_accuracy": 0.6759560704231262,
|
|
"num_tokens": 201413549.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 1.619047619047619,
|
|
"grad_norm": 0.6061872839927673,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1162,
|
|
"mean_token_accuracy": 0.6795899868011475,
|
|
"num_tokens": 202014069.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.6238095238095238,
|
|
"grad_norm": 0.6192796230316162,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1476,
|
|
"mean_token_accuracy": 0.6721718311309814,
|
|
"num_tokens": 202600379.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 1.6285714285714286,
|
|
"grad_norm": 0.6233608722686768,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1226,
|
|
"mean_token_accuracy": 0.6779032945632935,
|
|
"num_tokens": 203203233.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 1.6333333333333333,
|
|
"grad_norm": 0.5831724405288696,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1159,
|
|
"mean_token_accuracy": 0.6793074607849121,
|
|
"num_tokens": 203802554.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 1.638095238095238,
|
|
"grad_norm": 0.6623408794403076,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1296,
|
|
"mean_token_accuracy": 0.677479088306427,
|
|
"num_tokens": 204395560.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 1.6428571428571428,
|
|
"grad_norm": 0.5827105045318604,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.113,
|
|
"mean_token_accuracy": 0.6808111071586609,
|
|
"num_tokens": 205001404.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 1.6476190476190475,
|
|
"grad_norm": 0.5602775812149048,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1066,
|
|
"mean_token_accuracy": 0.6823267936706543,
|
|
"num_tokens": 205599855.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 1.6523809523809523,
|
|
"grad_norm": 0.6435489654541016,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1124,
|
|
"mean_token_accuracy": 0.6803141832351685,
|
|
"num_tokens": 206163338.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 1.657142857142857,
|
|
"grad_norm": 0.5933458209037781,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.137,
|
|
"mean_token_accuracy": 0.6748683452606201,
|
|
"num_tokens": 206741397.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 1.6619047619047618,
|
|
"grad_norm": 0.5775367021560669,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1298,
|
|
"mean_token_accuracy": 0.6758445501327515,
|
|
"num_tokens": 207323297.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.5773342251777649,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1294,
|
|
"mean_token_accuracy": 0.6764418482780457,
|
|
"num_tokens": 207908589.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.6714285714285713,
|
|
"grad_norm": 0.6353156566619873,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1403,
|
|
"mean_token_accuracy": 0.6732203960418701,
|
|
"num_tokens": 208500281.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 1.6761904761904762,
|
|
"grad_norm": 0.5841516852378845,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1201,
|
|
"mean_token_accuracy": 0.6789692640304565,
|
|
"num_tokens": 209097427.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 1.680952380952381,
|
|
"grad_norm": 0.5935720205307007,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1217,
|
|
"mean_token_accuracy": 0.6778074502944946,
|
|
"num_tokens": 209704225.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 1.6857142857142857,
|
|
"grad_norm": 0.6088152527809143,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1177,
|
|
"mean_token_accuracy": 0.6796123385429382,
|
|
"num_tokens": 210313267.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 1.6904761904761905,
|
|
"grad_norm": 0.5818439722061157,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1273,
|
|
"mean_token_accuracy": 0.6770058870315552,
|
|
"num_tokens": 210918588.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 1.6952380952380952,
|
|
"grad_norm": 0.6217803955078125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1268,
|
|
"mean_token_accuracy": 0.6772897243499756,
|
|
"num_tokens": 211508221.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 1.7,
|
|
"grad_norm": 0.5793229937553406,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1359,
|
|
"mean_token_accuracy": 0.6748672127723694,
|
|
"num_tokens": 212108728.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 1.704761904761905,
|
|
"grad_norm": 0.5839233994483948,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1226,
|
|
"mean_token_accuracy": 0.6776269674301147,
|
|
"num_tokens": 212705437.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 1.7095238095238097,
|
|
"grad_norm": 0.6158073544502258,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1324,
|
|
"mean_token_accuracy": 0.6745504140853882,
|
|
"num_tokens": 213300176.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 1.7142857142857144,
|
|
"grad_norm": 0.6093515753746033,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.132,
|
|
"mean_token_accuracy": 0.6751940250396729,
|
|
"num_tokens": 213890731.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.7190476190476192,
|
|
"grad_norm": 0.629436194896698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1147,
|
|
"mean_token_accuracy": 0.6785677075386047,
|
|
"num_tokens": 214471137.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 1.723809523809524,
|
|
"grad_norm": 0.6373199820518494,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1169,
|
|
"mean_token_accuracy": 0.6792606115341187,
|
|
"num_tokens": 215062165.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 1.7285714285714286,
|
|
"grad_norm": 0.5850217938423157,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1264,
|
|
"mean_token_accuracy": 0.6766684055328369,
|
|
"num_tokens": 215662977.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 1.7333333333333334,
|
|
"grad_norm": 0.676506757736206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1328,
|
|
"mean_token_accuracy": 0.6750730276107788,
|
|
"num_tokens": 216253942.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 1.7380952380952381,
|
|
"grad_norm": 0.5996358394622803,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1234,
|
|
"mean_token_accuracy": 0.6771166920661926,
|
|
"num_tokens": 216847247.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 1.7428571428571429,
|
|
"grad_norm": 0.604375422000885,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1111,
|
|
"mean_token_accuracy": 0.6807925701141357,
|
|
"num_tokens": 217427979.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 1.7476190476190476,
|
|
"grad_norm": 0.6484256386756897,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1149,
|
|
"mean_token_accuracy": 0.67896568775177,
|
|
"num_tokens": 218020622.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 1.7523809523809524,
|
|
"grad_norm": 0.5445154905319214,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1238,
|
|
"mean_token_accuracy": 0.677640438079834,
|
|
"num_tokens": 218613768.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 1.7571428571428571,
|
|
"grad_norm": 0.5835940837860107,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1352,
|
|
"mean_token_accuracy": 0.6746830940246582,
|
|
"num_tokens": 219217863.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 1.7619047619047619,
|
|
"grad_norm": 0.6108807325363159,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1245,
|
|
"mean_token_accuracy": 0.6771240234375,
|
|
"num_tokens": 219826128.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.7666666666666666,
|
|
"grad_norm": 0.5301618576049805,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1193,
|
|
"mean_token_accuracy": 0.6791725158691406,
|
|
"num_tokens": 220424737.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 1.7714285714285714,
|
|
"grad_norm": 0.567722737789154,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1241,
|
|
"mean_token_accuracy": 0.6771541833877563,
|
|
"num_tokens": 221010073.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 1.776190476190476,
|
|
"grad_norm": 0.6946297883987427,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1205,
|
|
"mean_token_accuracy": 0.678805410861969,
|
|
"num_tokens": 221614799.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 1.7809523809523808,
|
|
"grad_norm": 0.5566631555557251,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1185,
|
|
"mean_token_accuracy": 0.6782611012458801,
|
|
"num_tokens": 222215943.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 1.7857142857142856,
|
|
"grad_norm": 0.5999249219894409,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.114,
|
|
"mean_token_accuracy": 0.6802798509597778,
|
|
"num_tokens": 222803822.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 1.7904761904761903,
|
|
"grad_norm": 0.5825783014297485,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1314,
|
|
"mean_token_accuracy": 0.6754652261734009,
|
|
"num_tokens": 223409541.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 1.795238095238095,
|
|
"grad_norm": 0.5893160700798035,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1183,
|
|
"mean_token_accuracy": 0.6782077550888062,
|
|
"num_tokens": 223996446.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 1.8,
|
|
"grad_norm": 0.5960800051689148,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1203,
|
|
"mean_token_accuracy": 0.678328275680542,
|
|
"num_tokens": 224599074.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 1.8047619047619048,
|
|
"grad_norm": 0.5972325205802917,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1122,
|
|
"mean_token_accuracy": 0.6802579760551453,
|
|
"num_tokens": 225184557.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 1.8095238095238095,
|
|
"grad_norm": 0.597683310508728,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1185,
|
|
"mean_token_accuracy": 0.6798061728477478,
|
|
"num_tokens": 225774197.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.8142857142857143,
|
|
"grad_norm": 0.575453519821167,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1072,
|
|
"mean_token_accuracy": 0.6810543537139893,
|
|
"num_tokens": 226359063.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 1.819047619047619,
|
|
"grad_norm": 0.5560538172721863,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1237,
|
|
"mean_token_accuracy": 0.6774187088012695,
|
|
"num_tokens": 226962202.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 1.8238095238095238,
|
|
"grad_norm": 0.6427722573280334,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1218,
|
|
"mean_token_accuracy": 0.677949070930481,
|
|
"num_tokens": 227541002.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 1.8285714285714287,
|
|
"grad_norm": 0.6143935322761536,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1221,
|
|
"mean_token_accuracy": 0.6778963804244995,
|
|
"num_tokens": 228134124.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 0.6365751624107361,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.112,
|
|
"mean_token_accuracy": 0.6797761917114258,
|
|
"num_tokens": 228729717.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 1.8380952380952382,
|
|
"grad_norm": 0.719041109085083,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1178,
|
|
"mean_token_accuracy": 0.6780564785003662,
|
|
"num_tokens": 229318931.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 1.842857142857143,
|
|
"grad_norm": 0.6031278967857361,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.6776800155639648,
|
|
"num_tokens": 229923675.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 1.8476190476190477,
|
|
"grad_norm": 0.6627750396728516,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1149,
|
|
"mean_token_accuracy": 0.6797564029693604,
|
|
"num_tokens": 230514254.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 1.8523809523809525,
|
|
"grad_norm": 0.576654314994812,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1228,
|
|
"mean_token_accuracy": 0.6780418157577515,
|
|
"num_tokens": 231113801.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 1.8571428571428572,
|
|
"grad_norm": 0.6316273212432861,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1119,
|
|
"mean_token_accuracy": 0.6792047023773193,
|
|
"num_tokens": 231709098.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.861904761904762,
|
|
"grad_norm": 0.5546997785568237,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1247,
|
|
"mean_token_accuracy": 0.6769775748252869,
|
|
"num_tokens": 232311276.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 1.8666666666666667,
|
|
"grad_norm": 0.617088794708252,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.113,
|
|
"mean_token_accuracy": 0.6795423030853271,
|
|
"num_tokens": 232904607.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 1.8714285714285714,
|
|
"grad_norm": 0.611702561378479,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1057,
|
|
"mean_token_accuracy": 0.6821488738059998,
|
|
"num_tokens": 233493254.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 1.8761904761904762,
|
|
"grad_norm": 0.6276193261146545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1154,
|
|
"mean_token_accuracy": 0.6800172328948975,
|
|
"num_tokens": 234085431.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 1.880952380952381,
|
|
"grad_norm": 0.6570289731025696,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1245,
|
|
"mean_token_accuracy": 0.676892876625061,
|
|
"num_tokens": 234685396.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 1.8857142857142857,
|
|
"grad_norm": 0.6350821256637573,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1253,
|
|
"mean_token_accuracy": 0.6770513653755188,
|
|
"num_tokens": 235280049.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 1.8904761904761904,
|
|
"grad_norm": 0.6419028639793396,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1258,
|
|
"mean_token_accuracy": 0.6772979497909546,
|
|
"num_tokens": 235867276.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 1.8952380952380952,
|
|
"grad_norm": 0.6098426580429077,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1126,
|
|
"mean_token_accuracy": 0.6804271936416626,
|
|
"num_tokens": 236448647.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 1.9,
|
|
"grad_norm": 0.5854616165161133,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1256,
|
|
"mean_token_accuracy": 0.6755622625350952,
|
|
"num_tokens": 237054180.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 1.9047619047619047,
|
|
"grad_norm": 0.6416271328926086,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1394,
|
|
"mean_token_accuracy": 0.6737023591995239,
|
|
"num_tokens": 237658218.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.9095238095238094,
|
|
"grad_norm": 0.5833379626274109,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1223,
|
|
"mean_token_accuracy": 0.6782907247543335,
|
|
"num_tokens": 238248393.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 1.9142857142857141,
|
|
"grad_norm": 0.6798136830329895,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1096,
|
|
"mean_token_accuracy": 0.6816190481185913,
|
|
"num_tokens": 238838548.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 1.919047619047619,
|
|
"grad_norm": 0.5994821786880493,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1154,
|
|
"mean_token_accuracy": 0.6799057722091675,
|
|
"num_tokens": 239442502.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 1.9238095238095239,
|
|
"grad_norm": 0.6224843263626099,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1273,
|
|
"mean_token_accuracy": 0.6760965585708618,
|
|
"num_tokens": 240029019.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 1.9285714285714286,
|
|
"grad_norm": 0.6100861430168152,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1134,
|
|
"mean_token_accuracy": 0.6803538799285889,
|
|
"num_tokens": 240623504.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 1.9333333333333333,
|
|
"grad_norm": 0.6026962399482727,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1022,
|
|
"mean_token_accuracy": 0.6817559599876404,
|
|
"num_tokens": 241217102.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 1.938095238095238,
|
|
"grad_norm": 0.6529442667961121,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1141,
|
|
"mean_token_accuracy": 0.6804797649383545,
|
|
"num_tokens": 241812222.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 1.9428571428571428,
|
|
"grad_norm": 0.6519430875778198,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1085,
|
|
"mean_token_accuracy": 0.680460512638092,
|
|
"num_tokens": 242388669.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 1.9476190476190476,
|
|
"grad_norm": 0.7020300626754761,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1111,
|
|
"mean_token_accuracy": 0.6802721619606018,
|
|
"num_tokens": 242965686.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 1.9523809523809523,
|
|
"grad_norm": 0.6024628281593323,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1183,
|
|
"mean_token_accuracy": 0.6792590022087097,
|
|
"num_tokens": 243553853.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.9571428571428573,
|
|
"grad_norm": 0.7494162321090698,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1135,
|
|
"mean_token_accuracy": 0.6788877844810486,
|
|
"num_tokens": 244139754.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 1.961904761904762,
|
|
"grad_norm": 0.6602755188941956,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1176,
|
|
"mean_token_accuracy": 0.6777335405349731,
|
|
"num_tokens": 244736423.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 1.9666666666666668,
|
|
"grad_norm": 0.7016980051994324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1186,
|
|
"mean_token_accuracy": 0.6798810362815857,
|
|
"num_tokens": 245337682.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 1.9714285714285715,
|
|
"grad_norm": 0.6483145356178284,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1172,
|
|
"mean_token_accuracy": 0.67914879322052,
|
|
"num_tokens": 245952105.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 1.9761904761904763,
|
|
"grad_norm": 0.678092896938324,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1039,
|
|
"mean_token_accuracy": 0.6819472312927246,
|
|
"num_tokens": 246540999.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.980952380952381,
|
|
"grad_norm": 0.7507527470588684,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1103,
|
|
"mean_token_accuracy": 0.6814316511154175,
|
|
"num_tokens": 247142303.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 1.9857142857142858,
|
|
"grad_norm": 0.625765323638916,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.108,
|
|
"mean_token_accuracy": 0.6812607049942017,
|
|
"num_tokens": 247732988.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 1.9904761904761905,
|
|
"grad_norm": 0.6421918869018555,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1022,
|
|
"mean_token_accuracy": 0.681933581829071,
|
|
"num_tokens": 248334744.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 1.9952380952380953,
|
|
"grad_norm": 0.6160528659820557,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.1133,
|
|
"mean_token_accuracy": 0.6797480583190918,
|
|
"num_tokens": 248930347.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.703513503074646,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.129,
|
|
"mean_token_accuracy": 0.676598846912384,
|
|
"num_tokens": 249522093.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.0047619047619047,
|
|
"grad_norm": 0.7784668207168579,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0772,
|
|
"mean_token_accuracy": 0.687883734703064,
|
|
"num_tokens": 250112163.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 2.0095238095238095,
|
|
"grad_norm": 0.7685954570770264,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.065,
|
|
"mean_token_accuracy": 0.6909651160240173,
|
|
"num_tokens": 250690466.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 2.0142857142857142,
|
|
"grad_norm": 0.5822970867156982,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0678,
|
|
"mean_token_accuracy": 0.68952476978302,
|
|
"num_tokens": 251279220.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 2.019047619047619,
|
|
"grad_norm": 0.8003807663917542,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.071,
|
|
"mean_token_accuracy": 0.6891335248947144,
|
|
"num_tokens": 251871717.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 2.0238095238095237,
|
|
"grad_norm": 0.6656951904296875,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0907,
|
|
"mean_token_accuracy": 0.6843781471252441,
|
|
"num_tokens": 252474129.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 2.0285714285714285,
|
|
"grad_norm": 0.662339448928833,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0518,
|
|
"mean_token_accuracy": 0.693079948425293,
|
|
"num_tokens": 253069065.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 2.033333333333333,
|
|
"grad_norm": 0.6397184729576111,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.079,
|
|
"mean_token_accuracy": 0.6863641738891602,
|
|
"num_tokens": 253654392.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 2.038095238095238,
|
|
"grad_norm": 0.6415942907333374,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0688,
|
|
"mean_token_accuracy": 0.6888935565948486,
|
|
"num_tokens": 254245168.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 2.0428571428571427,
|
|
"grad_norm": 0.6560488939285278,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0824,
|
|
"mean_token_accuracy": 0.685175895690918,
|
|
"num_tokens": 254841132.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 2.0476190476190474,
|
|
"grad_norm": 0.5839130878448486,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0676,
|
|
"mean_token_accuracy": 0.6896021366119385,
|
|
"num_tokens": 255433793.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 2.052380952380952,
|
|
"grad_norm": 0.7360151410102844,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0658,
|
|
"mean_token_accuracy": 0.6903011798858643,
|
|
"num_tokens": 256015225.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 2.057142857142857,
|
|
"grad_norm": 0.633699893951416,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0638,
|
|
"mean_token_accuracy": 0.6905348300933838,
|
|
"num_tokens": 256617333.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 2.0619047619047617,
|
|
"grad_norm": 0.6784190535545349,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0913,
|
|
"mean_token_accuracy": 0.6846885681152344,
|
|
"num_tokens": 257214120.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 2.066666666666667,
|
|
"grad_norm": 0.6749794483184814,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0707,
|
|
"mean_token_accuracy": 0.6884802579879761,
|
|
"num_tokens": 257807506.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 2.0714285714285716,
|
|
"grad_norm": 0.6474000811576843,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0873,
|
|
"mean_token_accuracy": 0.6844640970230103,
|
|
"num_tokens": 258414225.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 2.0761904761904764,
|
|
"grad_norm": 0.6300811171531677,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0684,
|
|
"mean_token_accuracy": 0.688113808631897,
|
|
"num_tokens": 259009305.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 2.080952380952381,
|
|
"grad_norm": 0.6160655617713928,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0626,
|
|
"mean_token_accuracy": 0.690530002117157,
|
|
"num_tokens": 259600283.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 2.085714285714286,
|
|
"grad_norm": 0.5936851501464844,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0876,
|
|
"mean_token_accuracy": 0.6846874952316284,
|
|
"num_tokens": 260203907.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 2.0904761904761906,
|
|
"grad_norm": 0.6563723683357239,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0823,
|
|
"mean_token_accuracy": 0.6850008964538574,
|
|
"num_tokens": 260795356.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 2.0952380952380953,
|
|
"grad_norm": 0.6244327425956726,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0595,
|
|
"mean_token_accuracy": 0.6918923854827881,
|
|
"num_tokens": 261376414.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.1,
|
|
"grad_norm": 0.6768208146095276,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0854,
|
|
"mean_token_accuracy": 0.6857748031616211,
|
|
"num_tokens": 261965206.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 2.104761904761905,
|
|
"grad_norm": 0.6261032819747925,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0792,
|
|
"mean_token_accuracy": 0.6868791580200195,
|
|
"num_tokens": 262561371.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 2.1095238095238096,
|
|
"grad_norm": 0.6388991475105286,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.068,
|
|
"mean_token_accuracy": 0.689771294593811,
|
|
"num_tokens": 263159935.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 2.1142857142857143,
|
|
"grad_norm": 0.6453383564949036,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0803,
|
|
"mean_token_accuracy": 0.6851140260696411,
|
|
"num_tokens": 263754299.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 2.119047619047619,
|
|
"grad_norm": 0.6248214244842529,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0792,
|
|
"mean_token_accuracy": 0.686503529548645,
|
|
"num_tokens": 264354630.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 2.123809523809524,
|
|
"grad_norm": 0.6909031271934509,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0995,
|
|
"mean_token_accuracy": 0.681476891040802,
|
|
"num_tokens": 264964353.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 2.1285714285714286,
|
|
"grad_norm": 0.6381927132606506,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0816,
|
|
"mean_token_accuracy": 0.6861193180084229,
|
|
"num_tokens": 265561411.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 2.1333333333333333,
|
|
"grad_norm": 0.669456958770752,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.073,
|
|
"mean_token_accuracy": 0.6876237392425537,
|
|
"num_tokens": 266155790.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 2.138095238095238,
|
|
"grad_norm": 0.6266065239906311,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0788,
|
|
"mean_token_accuracy": 0.6870714426040649,
|
|
"num_tokens": 266757269.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 2.142857142857143,
|
|
"grad_norm": 0.6428273916244507,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0942,
|
|
"mean_token_accuracy": 0.6831685304641724,
|
|
"num_tokens": 267369143.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 2.1476190476190475,
|
|
"grad_norm": 0.6169470548629761,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0619,
|
|
"mean_token_accuracy": 0.6913155317306519,
|
|
"num_tokens": 267965437.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 2.1523809523809523,
|
|
"grad_norm": 0.6351789832115173,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0713,
|
|
"mean_token_accuracy": 0.6888561248779297,
|
|
"num_tokens": 268571463.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 2.157142857142857,
|
|
"grad_norm": 0.6532635688781738,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0698,
|
|
"mean_token_accuracy": 0.6889727115631104,
|
|
"num_tokens": 269157041.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 2.1619047619047618,
|
|
"grad_norm": 0.5989878177642822,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0682,
|
|
"mean_token_accuracy": 0.6890783309936523,
|
|
"num_tokens": 269758195.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 0.6337672472000122,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0969,
|
|
"mean_token_accuracy": 0.6822439432144165,
|
|
"num_tokens": 270349613.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 2.1714285714285713,
|
|
"grad_norm": 0.5972429513931274,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0831,
|
|
"mean_token_accuracy": 0.6840848326683044,
|
|
"num_tokens": 270947174.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 2.176190476190476,
|
|
"grad_norm": 0.6298529505729675,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0921,
|
|
"mean_token_accuracy": 0.6836713552474976,
|
|
"num_tokens": 271549889.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 2.1809523809523808,
|
|
"grad_norm": 0.574796199798584,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.064,
|
|
"mean_token_accuracy": 0.6906387805938721,
|
|
"num_tokens": 272139782.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 2.185714285714286,
|
|
"grad_norm": 0.6812316179275513,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0762,
|
|
"mean_token_accuracy": 0.687111496925354,
|
|
"num_tokens": 272746279.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 2.1904761904761907,
|
|
"grad_norm": 0.5981315970420837,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0626,
|
|
"mean_token_accuracy": 0.6907453536987305,
|
|
"num_tokens": 273348449.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.1952380952380954,
|
|
"grad_norm": 0.6438897252082825,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0853,
|
|
"mean_token_accuracy": 0.6858918070793152,
|
|
"num_tokens": 273949928.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 2.2,
|
|
"grad_norm": 0.6236709952354431,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0733,
|
|
"mean_token_accuracy": 0.6877059936523438,
|
|
"num_tokens": 274548070.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 2.204761904761905,
|
|
"grad_norm": 0.6749060153961182,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0758,
|
|
"mean_token_accuracy": 0.6867290735244751,
|
|
"num_tokens": 275135656.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 2.2095238095238097,
|
|
"grad_norm": 0.6628844738006592,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0765,
|
|
"mean_token_accuracy": 0.6874538660049438,
|
|
"num_tokens": 275740663.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 2.2142857142857144,
|
|
"grad_norm": 0.5728548169136047,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0754,
|
|
"mean_token_accuracy": 0.6882718205451965,
|
|
"num_tokens": 276346207.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 2.219047619047619,
|
|
"grad_norm": 0.6232889294624329,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0752,
|
|
"mean_token_accuracy": 0.6872685551643372,
|
|
"num_tokens": 276940208.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 2.223809523809524,
|
|
"grad_norm": 0.6447910070419312,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.091,
|
|
"mean_token_accuracy": 0.6836293339729309,
|
|
"num_tokens": 277539762.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 2.2285714285714286,
|
|
"grad_norm": 0.6113771796226501,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0865,
|
|
"mean_token_accuracy": 0.684094250202179,
|
|
"num_tokens": 278136526.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 2.2333333333333334,
|
|
"grad_norm": 0.6344524025917053,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0772,
|
|
"mean_token_accuracy": 0.6870338320732117,
|
|
"num_tokens": 278723575.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 2.238095238095238,
|
|
"grad_norm": 0.6180852055549622,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0544,
|
|
"mean_token_accuracy": 0.6927859783172607,
|
|
"num_tokens": 279313692.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 2.242857142857143,
|
|
"grad_norm": 0.6375457644462585,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0869,
|
|
"mean_token_accuracy": 0.6847492456436157,
|
|
"num_tokens": 279911596.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 2.2476190476190476,
|
|
"grad_norm": 0.6032583117485046,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.6893506050109863,
|
|
"num_tokens": 280516626.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 2.2523809523809524,
|
|
"grad_norm": 0.6571868062019348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0723,
|
|
"mean_token_accuracy": 0.6889193654060364,
|
|
"num_tokens": 281109826.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 2.257142857142857,
|
|
"grad_norm": 0.5816087126731873,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0783,
|
|
"mean_token_accuracy": 0.6873452663421631,
|
|
"num_tokens": 281705908.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 2.261904761904762,
|
|
"grad_norm": 0.6110855340957642,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0733,
|
|
"mean_token_accuracy": 0.6875293850898743,
|
|
"num_tokens": 282295646.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 2.2666666666666666,
|
|
"grad_norm": 0.5722987055778503,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.064,
|
|
"mean_token_accuracy": 0.6898794174194336,
|
|
"num_tokens": 282882984.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 2.2714285714285714,
|
|
"grad_norm": 0.5756980776786804,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0705,
|
|
"mean_token_accuracy": 0.6888871192932129,
|
|
"num_tokens": 283470314.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 2.276190476190476,
|
|
"grad_norm": 0.6090242862701416,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0729,
|
|
"mean_token_accuracy": 0.6876958012580872,
|
|
"num_tokens": 284064822.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 2.280952380952381,
|
|
"grad_norm": 0.551956295967102,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0666,
|
|
"mean_token_accuracy": 0.6899924278259277,
|
|
"num_tokens": 284659143.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 2.2857142857142856,
|
|
"grad_norm": 0.617386519908905,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0789,
|
|
"mean_token_accuracy": 0.6873286366462708,
|
|
"num_tokens": 285260603.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.2904761904761903,
|
|
"grad_norm": 0.5895305871963501,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0668,
|
|
"mean_token_accuracy": 0.6887931823730469,
|
|
"num_tokens": 285858617.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 2.295238095238095,
|
|
"grad_norm": 0.575018584728241,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0733,
|
|
"mean_token_accuracy": 0.6886229515075684,
|
|
"num_tokens": 286462909.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 2.3,
|
|
"grad_norm": 0.680483341217041,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0686,
|
|
"mean_token_accuracy": 0.6894232034683228,
|
|
"num_tokens": 287057508.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 2.3047619047619046,
|
|
"grad_norm": 0.6086472868919373,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0784,
|
|
"mean_token_accuracy": 0.6863738298416138,
|
|
"num_tokens": 287647864.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 2.3095238095238093,
|
|
"grad_norm": 0.6269891858100891,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0803,
|
|
"mean_token_accuracy": 0.6864203810691833,
|
|
"num_tokens": 288244654.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 2.314285714285714,
|
|
"grad_norm": 0.6842952370643616,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0897,
|
|
"mean_token_accuracy": 0.684012770652771,
|
|
"num_tokens": 288833805.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 2.319047619047619,
|
|
"grad_norm": 0.5772620439529419,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0728,
|
|
"mean_token_accuracy": 0.6879225969314575,
|
|
"num_tokens": 289430249.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 2.323809523809524,
|
|
"grad_norm": 0.6799498796463013,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0737,
|
|
"mean_token_accuracy": 0.6892322897911072,
|
|
"num_tokens": 290017640.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 2.3285714285714287,
|
|
"grad_norm": 0.63170325756073,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0694,
|
|
"mean_token_accuracy": 0.6884621381759644,
|
|
"num_tokens": 290598414.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.6786331534385681,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.061,
|
|
"mean_token_accuracy": 0.6906882524490356,
|
|
"num_tokens": 291181812.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 2.3380952380952382,
|
|
"grad_norm": 0.6489508748054504,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0747,
|
|
"mean_token_accuracy": 0.6877006888389587,
|
|
"num_tokens": 291764317.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 2.342857142857143,
|
|
"grad_norm": 0.6271830797195435,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0809,
|
|
"mean_token_accuracy": 0.6854414343833923,
|
|
"num_tokens": 292350504.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 2.3476190476190477,
|
|
"grad_norm": 0.6458184123039246,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0777,
|
|
"mean_token_accuracy": 0.6864031553268433,
|
|
"num_tokens": 292951776.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 2.3523809523809525,
|
|
"grad_norm": 0.6648980379104614,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0695,
|
|
"mean_token_accuracy": 0.6887059211730957,
|
|
"num_tokens": 293532488.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 2.357142857142857,
|
|
"grad_norm": 0.6425085067749023,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0575,
|
|
"mean_token_accuracy": 0.6918837428092957,
|
|
"num_tokens": 294121146.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 2.361904761904762,
|
|
"grad_norm": 0.6645520329475403,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0768,
|
|
"mean_token_accuracy": 0.6873211860656738,
|
|
"num_tokens": 294726732.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 2.3666666666666667,
|
|
"grad_norm": 0.6538220047950745,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0682,
|
|
"mean_token_accuracy": 0.6892035007476807,
|
|
"num_tokens": 295306697.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 2.3714285714285714,
|
|
"grad_norm": 0.7154629230499268,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0893,
|
|
"mean_token_accuracy": 0.6833094358444214,
|
|
"num_tokens": 295894972.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 2.376190476190476,
|
|
"grad_norm": 0.6492322087287903,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0831,
|
|
"mean_token_accuracy": 0.6853781938552856,
|
|
"num_tokens": 296505345.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 2.380952380952381,
|
|
"grad_norm": 0.7426714301109314,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0664,
|
|
"mean_token_accuracy": 0.6893447637557983,
|
|
"num_tokens": 297100909.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.3857142857142857,
|
|
"grad_norm": 0.6399804353713989,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0743,
|
|
"mean_token_accuracy": 0.688417375087738,
|
|
"num_tokens": 297690267.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 2.3904761904761904,
|
|
"grad_norm": 0.599839985370636,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0646,
|
|
"mean_token_accuracy": 0.6897764801979065,
|
|
"num_tokens": 298283484.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 2.395238095238095,
|
|
"grad_norm": 0.6296051740646362,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0814,
|
|
"mean_token_accuracy": 0.685540497303009,
|
|
"num_tokens": 298880193.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 2.4,
|
|
"grad_norm": 0.5922709107398987,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.058,
|
|
"mean_token_accuracy": 0.6912336349487305,
|
|
"num_tokens": 299479995.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 2.4047619047619047,
|
|
"grad_norm": 0.608103334903717,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0731,
|
|
"mean_token_accuracy": 0.6877481937408447,
|
|
"num_tokens": 300068384.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 2.4095238095238094,
|
|
"grad_norm": 0.6003749966621399,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.083,
|
|
"mean_token_accuracy": 0.6847676038742065,
|
|
"num_tokens": 300687274.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 2.414285714285714,
|
|
"grad_norm": 0.5747948884963989,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.075,
|
|
"mean_token_accuracy": 0.6867921352386475,
|
|
"num_tokens": 301288728.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 2.419047619047619,
|
|
"grad_norm": 0.6287463307380676,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0698,
|
|
"mean_token_accuracy": 0.6888238787651062,
|
|
"num_tokens": 301868926.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 2.4238095238095236,
|
|
"grad_norm": 0.5455256104469299,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0644,
|
|
"mean_token_accuracy": 0.690401017665863,
|
|
"num_tokens": 302467630.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 2.4285714285714284,
|
|
"grad_norm": 0.6476891040802002,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.09,
|
|
"mean_token_accuracy": 0.6842606663703918,
|
|
"num_tokens": 303058965.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 2.4333333333333336,
|
|
"grad_norm": 0.6696739792823792,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.091,
|
|
"mean_token_accuracy": 0.6829922795295715,
|
|
"num_tokens": 303639694.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 2.4380952380952383,
|
|
"grad_norm": 0.5850697159767151,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0651,
|
|
"mean_token_accuracy": 0.6903361082077026,
|
|
"num_tokens": 304234504.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 2.442857142857143,
|
|
"grad_norm": 0.6123826503753662,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0848,
|
|
"mean_token_accuracy": 0.6855412125587463,
|
|
"num_tokens": 304822484.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 2.447619047619048,
|
|
"grad_norm": 0.6242313981056213,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.069,
|
|
"mean_token_accuracy": 0.6895902156829834,
|
|
"num_tokens": 305405226.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 2.4523809523809526,
|
|
"grad_norm": 0.6153740286827087,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.6889458298683167,
|
|
"num_tokens": 306007153.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 2.4571428571428573,
|
|
"grad_norm": 0.6674852967262268,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.6897221803665161,
|
|
"num_tokens": 306588836.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 2.461904761904762,
|
|
"grad_norm": 0.6560084819793701,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0804,
|
|
"mean_token_accuracy": 0.6861131191253662,
|
|
"num_tokens": 307200955.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 2.466666666666667,
|
|
"grad_norm": 0.5911952257156372,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0669,
|
|
"mean_token_accuracy": 0.6889474987983704,
|
|
"num_tokens": 307799028.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 2.4714285714285715,
|
|
"grad_norm": 0.6963088512420654,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0965,
|
|
"mean_token_accuracy": 0.6822454929351807,
|
|
"num_tokens": 308389748.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 2.4761904761904763,
|
|
"grad_norm": 0.7166724801063538,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0773,
|
|
"mean_token_accuracy": 0.6871429681777954,
|
|
"num_tokens": 308978715.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.480952380952381,
|
|
"grad_norm": 0.598521888256073,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0756,
|
|
"mean_token_accuracy": 0.6871167421340942,
|
|
"num_tokens": 309587298.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 2.4857142857142858,
|
|
"grad_norm": 0.6383949518203735,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0643,
|
|
"mean_token_accuracy": 0.6895929574966431,
|
|
"num_tokens": 310173585.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 2.4904761904761905,
|
|
"grad_norm": 0.6667410731315613,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0736,
|
|
"mean_token_accuracy": 0.6880219578742981,
|
|
"num_tokens": 310760313.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 2.4952380952380953,
|
|
"grad_norm": 0.6218487620353699,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0764,
|
|
"mean_token_accuracy": 0.6872262358665466,
|
|
"num_tokens": 311374002.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.6058824062347412,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.6883900165557861,
|
|
"num_tokens": 311952533.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 2.5047619047619047,
|
|
"grad_norm": 0.6459484100341797,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.065,
|
|
"mean_token_accuracy": 0.6896857023239136,
|
|
"num_tokens": 312542383.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 2.5095238095238095,
|
|
"grad_norm": 0.6192833781242371,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0745,
|
|
"mean_token_accuracy": 0.686732828617096,
|
|
"num_tokens": 313136427.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 2.5142857142857142,
|
|
"grad_norm": 0.602884829044342,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0564,
|
|
"mean_token_accuracy": 0.6925665140151978,
|
|
"num_tokens": 313731115.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 2.519047619047619,
|
|
"grad_norm": 0.5805109143257141,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0644,
|
|
"mean_token_accuracy": 0.6895827651023865,
|
|
"num_tokens": 314316253.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 2.5238095238095237,
|
|
"grad_norm": 0.6484024524688721,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0634,
|
|
"mean_token_accuracy": 0.6902580857276917,
|
|
"num_tokens": 314906539.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 2.5285714285714285,
|
|
"grad_norm": 0.6236498355865479,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0611,
|
|
"mean_token_accuracy": 0.6907384991645813,
|
|
"num_tokens": 315491005.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 2.533333333333333,
|
|
"grad_norm": 0.68634432554245,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0759,
|
|
"mean_token_accuracy": 0.6861008405685425,
|
|
"num_tokens": 316086156.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 2.538095238095238,
|
|
"grad_norm": 0.6483022570610046,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0809,
|
|
"mean_token_accuracy": 0.6863186359405518,
|
|
"num_tokens": 316687284.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 2.5428571428571427,
|
|
"grad_norm": 0.6313026547431946,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.065,
|
|
"mean_token_accuracy": 0.6903449296951294,
|
|
"num_tokens": 317280976.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 2.5476190476190474,
|
|
"grad_norm": 0.7180777788162231,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.072,
|
|
"mean_token_accuracy": 0.6879873275756836,
|
|
"num_tokens": 317869704.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 2.552380952380952,
|
|
"grad_norm": 0.6203593611717224,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0754,
|
|
"mean_token_accuracy": 0.6873841285705566,
|
|
"num_tokens": 318453830.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 2.557142857142857,
|
|
"grad_norm": 0.7294032573699951,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0816,
|
|
"mean_token_accuracy": 0.6853822469711304,
|
|
"num_tokens": 319036628.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 2.5619047619047617,
|
|
"grad_norm": 0.6315251588821411,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0671,
|
|
"mean_token_accuracy": 0.6895589828491211,
|
|
"num_tokens": 319641680.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 2.5666666666666664,
|
|
"grad_norm": 0.6481133699417114,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0733,
|
|
"mean_token_accuracy": 0.6874011754989624,
|
|
"num_tokens": 320235018.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 2.571428571428571,
|
|
"grad_norm": 0.6537102460861206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0782,
|
|
"mean_token_accuracy": 0.6865108609199524,
|
|
"num_tokens": 320839523.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.576190476190476,
|
|
"grad_norm": 0.5990563631057739,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0692,
|
|
"mean_token_accuracy": 0.6882610321044922,
|
|
"num_tokens": 321445469.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 2.580952380952381,
|
|
"grad_norm": 0.7251924276351929,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0769,
|
|
"mean_token_accuracy": 0.685580849647522,
|
|
"num_tokens": 322040382.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 2.585714285714286,
|
|
"grad_norm": 0.5734168291091919,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0731,
|
|
"mean_token_accuracy": 0.6879425048828125,
|
|
"num_tokens": 322631980.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 2.5904761904761906,
|
|
"grad_norm": 0.6524589657783508,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0715,
|
|
"mean_token_accuracy": 0.6874139308929443,
|
|
"num_tokens": 323217003.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 2.5952380952380953,
|
|
"grad_norm": 0.6292608976364136,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0751,
|
|
"mean_token_accuracy": 0.6870990991592407,
|
|
"num_tokens": 323797882.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 2.6,
|
|
"grad_norm": 0.631439208984375,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0783,
|
|
"mean_token_accuracy": 0.685767412185669,
|
|
"num_tokens": 324381508.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 2.604761904761905,
|
|
"grad_norm": 0.621782124042511,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0954,
|
|
"mean_token_accuracy": 0.6821581125259399,
|
|
"num_tokens": 324979976.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 2.6095238095238096,
|
|
"grad_norm": 0.6306419372558594,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0677,
|
|
"mean_token_accuracy": 0.6885519623756409,
|
|
"num_tokens": 325579147.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 2.6142857142857143,
|
|
"grad_norm": 0.5700802206993103,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.6891588568687439,
|
|
"num_tokens": 326189724.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 2.619047619047619,
|
|
"grad_norm": 0.5674880146980286,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0723,
|
|
"mean_token_accuracy": 0.6874587535858154,
|
|
"num_tokens": 326781040.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 2.623809523809524,
|
|
"grad_norm": 0.6210941076278687,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.066,
|
|
"mean_token_accuracy": 0.6903613805770874,
|
|
"num_tokens": 327384993.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 2.6285714285714286,
|
|
"grad_norm": 0.5762701630592346,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0541,
|
|
"mean_token_accuracy": 0.6926007866859436,
|
|
"num_tokens": 327967527.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 2.6333333333333333,
|
|
"grad_norm": 0.5869442224502563,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0602,
|
|
"mean_token_accuracy": 0.6907045841217041,
|
|
"num_tokens": 328556111.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 2.638095238095238,
|
|
"grad_norm": 0.6561670303344727,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.067,
|
|
"mean_token_accuracy": 0.6888686418533325,
|
|
"num_tokens": 329156419.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 2.642857142857143,
|
|
"grad_norm": 0.5729210376739502,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0908,
|
|
"mean_token_accuracy": 0.6830568313598633,
|
|
"num_tokens": 329765795.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 2.6476190476190475,
|
|
"grad_norm": 0.5583658218383789,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0715,
|
|
"mean_token_accuracy": 0.6889873743057251,
|
|
"num_tokens": 330366805.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 2.6523809523809523,
|
|
"grad_norm": 0.6156875491142273,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0597,
|
|
"mean_token_accuracy": 0.6899721622467041,
|
|
"num_tokens": 330960683.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 2.657142857142857,
|
|
"grad_norm": 0.5830056667327881,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0766,
|
|
"mean_token_accuracy": 0.6871880292892456,
|
|
"num_tokens": 331566617.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 2.6619047619047618,
|
|
"grad_norm": 0.6878387928009033,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0606,
|
|
"mean_token_accuracy": 0.6908230781555176,
|
|
"num_tokens": 332146780.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.6010000705718994,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.073,
|
|
"mean_token_accuracy": 0.6884846091270447,
|
|
"num_tokens": 332744565.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.6714285714285713,
|
|
"grad_norm": 0.6257455348968506,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0718,
|
|
"mean_token_accuracy": 0.6879858374595642,
|
|
"num_tokens": 333327246.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 2.6761904761904765,
|
|
"grad_norm": 0.6111727356910706,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0604,
|
|
"mean_token_accuracy": 0.6906289458274841,
|
|
"num_tokens": 333924279.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 2.680952380952381,
|
|
"grad_norm": 0.6363468170166016,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0628,
|
|
"mean_token_accuracy": 0.6897515058517456,
|
|
"num_tokens": 334526451.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 2.685714285714286,
|
|
"grad_norm": 0.6247795820236206,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0726,
|
|
"mean_token_accuracy": 0.6881762742996216,
|
|
"num_tokens": 335121296.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 2.6904761904761907,
|
|
"grad_norm": 0.7256935238838196,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.069,
|
|
"mean_token_accuracy": 0.6878950595855713,
|
|
"num_tokens": 335705229.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 2.6952380952380954,
|
|
"grad_norm": 0.6218934655189514,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0745,
|
|
"mean_token_accuracy": 0.687312126159668,
|
|
"num_tokens": 336296515.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 2.7,
|
|
"grad_norm": 0.64492267370224,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0772,
|
|
"mean_token_accuracy": 0.6871779561042786,
|
|
"num_tokens": 336898581.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 2.704761904761905,
|
|
"grad_norm": 0.6439410448074341,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0849,
|
|
"mean_token_accuracy": 0.685498833656311,
|
|
"num_tokens": 337492720.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 2.7095238095238097,
|
|
"grad_norm": 0.5982577204704285,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0685,
|
|
"mean_token_accuracy": 0.6890565752983093,
|
|
"num_tokens": 338088080.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 2.7142857142857144,
|
|
"grad_norm": 0.6382868885993958,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0678,
|
|
"mean_token_accuracy": 0.6893150806427002,
|
|
"num_tokens": 338682863.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 2.719047619047619,
|
|
"grad_norm": 0.5995696187019348,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0737,
|
|
"mean_token_accuracy": 0.6885708570480347,
|
|
"num_tokens": 339274595.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 2.723809523809524,
|
|
"grad_norm": 0.6478890180587769,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0736,
|
|
"mean_token_accuracy": 0.687543511390686,
|
|
"num_tokens": 339857633.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 2.7285714285714286,
|
|
"grad_norm": 0.6489014625549316,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0564,
|
|
"mean_token_accuracy": 0.6918776035308838,
|
|
"num_tokens": 340451043.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 2.7333333333333334,
|
|
"grad_norm": 0.6406450271606445,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0801,
|
|
"mean_token_accuracy": 0.6862790584564209,
|
|
"num_tokens": 341042238.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 2.738095238095238,
|
|
"grad_norm": 0.6261545419692993,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0766,
|
|
"mean_token_accuracy": 0.6869131922721863,
|
|
"num_tokens": 341644006.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 2.742857142857143,
|
|
"grad_norm": 0.5907791256904602,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0666,
|
|
"mean_token_accuracy": 0.6884465217590332,
|
|
"num_tokens": 342238235.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 2.7476190476190476,
|
|
"grad_norm": 0.638664186000824,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0609,
|
|
"mean_token_accuracy": 0.6908861994743347,
|
|
"num_tokens": 342831260.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 2.7523809523809524,
|
|
"grad_norm": 0.6344829797744751,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0762,
|
|
"mean_token_accuracy": 0.687269389629364,
|
|
"num_tokens": 343427629.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 2.757142857142857,
|
|
"grad_norm": 0.6150461435317993,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0761,
|
|
"mean_token_accuracy": 0.6873693466186523,
|
|
"num_tokens": 344021401.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 2.761904761904762,
|
|
"grad_norm": 0.6308332681655884,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0618,
|
|
"mean_token_accuracy": 0.6911748647689819,
|
|
"num_tokens": 344610108.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.7666666666666666,
|
|
"grad_norm": 0.55866539478302,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0725,
|
|
"mean_token_accuracy": 0.6869944334030151,
|
|
"num_tokens": 345217241.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 2.7714285714285714,
|
|
"grad_norm": 0.638909637928009,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0658,
|
|
"mean_token_accuracy": 0.6890157461166382,
|
|
"num_tokens": 345804258.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 2.776190476190476,
|
|
"grad_norm": 0.5688804984092712,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0689,
|
|
"mean_token_accuracy": 0.6887319087982178,
|
|
"num_tokens": 346399481.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 2.780952380952381,
|
|
"grad_norm": 0.6002762317657471,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0563,
|
|
"mean_token_accuracy": 0.6915134191513062,
|
|
"num_tokens": 346997922.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 2.7857142857142856,
|
|
"grad_norm": 0.6163663864135742,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.07,
|
|
"mean_token_accuracy": 0.6884576082229614,
|
|
"num_tokens": 347597863.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 2.7904761904761903,
|
|
"grad_norm": 0.580531656742096,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0638,
|
|
"mean_token_accuracy": 0.6888343095779419,
|
|
"num_tokens": 348201046.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 2.795238095238095,
|
|
"grad_norm": 0.5918668508529663,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0584,
|
|
"mean_token_accuracy": 0.6905962228775024,
|
|
"num_tokens": 348787326.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 2.8,
|
|
"grad_norm": 0.6383691430091858,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0689,
|
|
"mean_token_accuracy": 0.6883484125137329,
|
|
"num_tokens": 349380645.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 2.8047619047619046,
|
|
"grad_norm": 0.6115639805793762,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0654,
|
|
"mean_token_accuracy": 0.6897737979888916,
|
|
"num_tokens": 349983092.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 2.8095238095238093,
|
|
"grad_norm": 0.6397126317024231,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0617,
|
|
"mean_token_accuracy": 0.6903370022773743,
|
|
"num_tokens": 350576836.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 2.814285714285714,
|
|
"grad_norm": 0.6862447261810303,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0624,
|
|
"mean_token_accuracy": 0.691243588924408,
|
|
"num_tokens": 351172725.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 2.819047619047619,
|
|
"grad_norm": 0.6518527269363403,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0568,
|
|
"mean_token_accuracy": 0.6918639540672302,
|
|
"num_tokens": 351777142.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 2.8238095238095235,
|
|
"grad_norm": 0.7507683634757996,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0576,
|
|
"mean_token_accuracy": 0.6908581852912903,
|
|
"num_tokens": 352361834.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 2.8285714285714287,
|
|
"grad_norm": 0.6769391298294067,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0658,
|
|
"mean_token_accuracy": 0.6891970038414001,
|
|
"num_tokens": 352964892.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.7207344770431519,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0655,
|
|
"mean_token_accuracy": 0.688566267490387,
|
|
"num_tokens": 353563696.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 2.8380952380952382,
|
|
"grad_norm": 0.6687008142471313,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0663,
|
|
"mean_token_accuracy": 0.6889446377754211,
|
|
"num_tokens": 354162323.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 2.842857142857143,
|
|
"grad_norm": 0.6510334610939026,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0973,
|
|
"mean_token_accuracy": 0.6817850470542908,
|
|
"num_tokens": 354763224.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 2.8476190476190477,
|
|
"grad_norm": 0.6164536476135254,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0599,
|
|
"mean_token_accuracy": 0.6904336214065552,
|
|
"num_tokens": 355360011.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 2.8523809523809525,
|
|
"grad_norm": 0.6652323603630066,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0664,
|
|
"mean_token_accuracy": 0.6892472505569458,
|
|
"num_tokens": 355948770.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 2.857142857142857,
|
|
"grad_norm": 0.6170997619628906,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0749,
|
|
"mean_token_accuracy": 0.686385989189148,
|
|
"num_tokens": 356555915.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.861904761904762,
|
|
"grad_norm": 0.5823125839233398,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0762,
|
|
"mean_token_accuracy": 0.6866952180862427,
|
|
"num_tokens": 357168089.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 2.8666666666666667,
|
|
"grad_norm": 0.6084815859794617,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0589,
|
|
"mean_token_accuracy": 0.6905912160873413,
|
|
"num_tokens": 357762209.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 2.8714285714285714,
|
|
"grad_norm": 0.5347459316253662,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.065,
|
|
"mean_token_accuracy": 0.6898081302642822,
|
|
"num_tokens": 358365632.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 2.876190476190476,
|
|
"grad_norm": 0.6211216449737549,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0795,
|
|
"mean_token_accuracy": 0.6860474944114685,
|
|
"num_tokens": 358969038.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 2.880952380952381,
|
|
"grad_norm": 0.6298102736473083,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0616,
|
|
"mean_token_accuracy": 0.691013514995575,
|
|
"num_tokens": 359560638.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 2.8857142857142857,
|
|
"grad_norm": 0.6150857210159302,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0726,
|
|
"mean_token_accuracy": 0.6874991655349731,
|
|
"num_tokens": 360159047.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 2.8904761904761904,
|
|
"grad_norm": 0.6256808638572693,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0696,
|
|
"mean_token_accuracy": 0.6895867586135864,
|
|
"num_tokens": 360752550.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 2.895238095238095,
|
|
"grad_norm": 0.6338992714881897,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0706,
|
|
"mean_token_accuracy": 0.6878842115402222,
|
|
"num_tokens": 361348966.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 2.9,
|
|
"grad_norm": 0.6074673533439636,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0625,
|
|
"mean_token_accuracy": 0.690711498260498,
|
|
"num_tokens": 361933541.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 2.9047619047619047,
|
|
"grad_norm": 0.6169112324714661,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0679,
|
|
"mean_token_accuracy": 0.6893640756607056,
|
|
"num_tokens": 362522689.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.9095238095238094,
|
|
"grad_norm": 0.6712765097618103,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0481,
|
|
"mean_token_accuracy": 0.6935627460479736,
|
|
"num_tokens": 363107779.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 2.914285714285714,
|
|
"grad_norm": 0.6030009388923645,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0624,
|
|
"mean_token_accuracy": 0.6899582147598267,
|
|
"num_tokens": 363690809.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 2.919047619047619,
|
|
"grad_norm": 0.6335533261299133,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0632,
|
|
"mean_token_accuracy": 0.6892010569572449,
|
|
"num_tokens": 364279923.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 2.923809523809524,
|
|
"grad_norm": 0.6299601793289185,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0536,
|
|
"mean_token_accuracy": 0.6920279264450073,
|
|
"num_tokens": 364846929.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 2.928571428571429,
|
|
"grad_norm": 0.6494601964950562,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0797,
|
|
"mean_token_accuracy": 0.6871404051780701,
|
|
"num_tokens": 365427755.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 2.9333333333333336,
|
|
"grad_norm": 0.6412233710289001,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0618,
|
|
"mean_token_accuracy": 0.6902071833610535,
|
|
"num_tokens": 366022879.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 2.9380952380952383,
|
|
"grad_norm": 0.5901429653167725,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0639,
|
|
"mean_token_accuracy": 0.6891224384307861,
|
|
"num_tokens": 366608198.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 2.942857142857143,
|
|
"grad_norm": 0.6606128811836243,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0687,
|
|
"mean_token_accuracy": 0.6881773471832275,
|
|
"num_tokens": 367187170.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 2.947619047619048,
|
|
"grad_norm": 0.6021740436553955,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.062,
|
|
"mean_token_accuracy": 0.6895371675491333,
|
|
"num_tokens": 367778542.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 2.9523809523809526,
|
|
"grad_norm": 0.6304929852485657,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0685,
|
|
"mean_token_accuracy": 0.6876203417778015,
|
|
"num_tokens": 368374361.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.9571428571428573,
|
|
"grad_norm": 0.6775472164154053,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0693,
|
|
"mean_token_accuracy": 0.688637375831604,
|
|
"num_tokens": 368975961.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 2.961904761904762,
|
|
"grad_norm": 0.6188324689865112,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0446,
|
|
"mean_token_accuracy": 0.69502854347229,
|
|
"num_tokens": 369565801.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 2.966666666666667,
|
|
"grad_norm": 0.7237592339515686,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.069,
|
|
"mean_token_accuracy": 0.6882259845733643,
|
|
"num_tokens": 370147963.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 2.9714285714285715,
|
|
"grad_norm": 0.5706875920295715,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0661,
|
|
"mean_token_accuracy": 0.688866376876831,
|
|
"num_tokens": 370730337.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 2.9761904761904763,
|
|
"grad_norm": 0.6157565712928772,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0595,
|
|
"mean_token_accuracy": 0.6903921365737915,
|
|
"num_tokens": 371313464.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 2.980952380952381,
|
|
"grad_norm": 0.5899333953857422,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0634,
|
|
"mean_token_accuracy": 0.690090537071228,
|
|
"num_tokens": 371903211.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 2.9857142857142858,
|
|
"grad_norm": 0.6269708275794983,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0536,
|
|
"mean_token_accuracy": 0.6934218406677246,
|
|
"num_tokens": 372496314.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 2.9904761904761905,
|
|
"grad_norm": 0.6969268321990967,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0676,
|
|
"mean_token_accuracy": 0.688661515712738,
|
|
"num_tokens": 373096610.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 2.9952380952380953,
|
|
"grad_norm": 0.5695185661315918,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0693,
|
|
"mean_token_accuracy": 0.6875466108322144,
|
|
"num_tokens": 373694165.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.6636136174201965,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.0583,
|
|
"mean_token_accuracy": 0.6922066807746887,
|
|
"num_tokens": 374283247.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 630,
|
|
"total_flos": 2.1853937174671524e+18,
|
|
"train_loss": 1.1615400253780304,
|
|
"train_runtime": 1840.1375,
|
|
"train_samples_per_second": 175.286,
|
|
"train_steps_per_second": 0.342
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 630,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 315,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.1853937174671524e+18,
|
|
"train_batch_size": 128,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|