Model: Hyeongwon/P2-split2_prob_Qwen3-8B-Base_0325-04-bs128-lr1e-5-epoch6 Source: Original Platform
7604 lines
215 KiB
JSON
7604 lines
215 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.0,
|
|
"eval_steps": 500,
|
|
"global_step": 756,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5677337646484375,
|
|
"epoch": 0.007936507936507936,
|
|
"grad_norm": 5.825740150213408,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.3956,
|
|
"mean_token_accuracy": 0.6547382255084813,
|
|
"num_tokens": 849869.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.569549560546875,
|
|
"epoch": 0.015873015873015872,
|
|
"grad_norm": 5.801156934108041,
|
|
"learning_rate": 2.6315789473684213e-07,
|
|
"loss": 1.4001,
|
|
"mean_token_accuracy": 0.6515501267276704,
|
|
"num_tokens": 1710146.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.5733184814453125,
|
|
"epoch": 0.023809523809523808,
|
|
"grad_norm": 5.697225721311094,
|
|
"learning_rate": 5.263157894736843e-07,
|
|
"loss": 1.3825,
|
|
"mean_token_accuracy": 0.6571523365564644,
|
|
"num_tokens": 2560005.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.5648651123046875,
|
|
"epoch": 0.031746031746031744,
|
|
"grad_norm": 5.692098743617845,
|
|
"learning_rate": 7.894736842105263e-07,
|
|
"loss": 1.3997,
|
|
"mean_token_accuracy": 0.65298081189394,
|
|
"num_tokens": 3457966.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.57421875,
|
|
"epoch": 0.03968253968253968,
|
|
"grad_norm": 5.78106605094393,
|
|
"learning_rate": 1.0526315789473685e-06,
|
|
"loss": 1.4008,
|
|
"mean_token_accuracy": 0.6524212104268372,
|
|
"num_tokens": 4321827.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.5650482177734375,
|
|
"epoch": 0.047619047619047616,
|
|
"grad_norm": 5.616712544244806,
|
|
"learning_rate": 1.3157894736842106e-06,
|
|
"loss": 1.3776,
|
|
"mean_token_accuracy": 0.6610458297654986,
|
|
"num_tokens": 5188122.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5651702880859375,
|
|
"epoch": 0.05555555555555555,
|
|
"grad_norm": 5.431878258315051,
|
|
"learning_rate": 1.5789473684210526e-06,
|
|
"loss": 1.3756,
|
|
"mean_token_accuracy": 0.6562704290263355,
|
|
"num_tokens": 6042413.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.5759429931640625,
|
|
"epoch": 0.06349206349206349,
|
|
"grad_norm": 5.337410424762961,
|
|
"learning_rate": 1.8421052631578948e-06,
|
|
"loss": 1.3735,
|
|
"mean_token_accuracy": 0.6550004091113806,
|
|
"num_tokens": 6898441.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.562835693359375,
|
|
"epoch": 0.07142857142857142,
|
|
"grad_norm": 5.254246336716741,
|
|
"learning_rate": 2.105263157894737e-06,
|
|
"loss": 1.3621,
|
|
"mean_token_accuracy": 0.6594638815149665,
|
|
"num_tokens": 7794638.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.5644378662109375,
|
|
"epoch": 0.07936507936507936,
|
|
"grad_norm": 4.504374040638264,
|
|
"learning_rate": 2.368421052631579e-06,
|
|
"loss": 1.3207,
|
|
"mean_token_accuracy": 0.6636323751881719,
|
|
"num_tokens": 8673402.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.5662384033203125,
|
|
"epoch": 0.0873015873015873,
|
|
"grad_norm": 4.290465182305912,
|
|
"learning_rate": 2.631578947368421e-06,
|
|
"loss": 1.2869,
|
|
"mean_token_accuracy": 0.6718250620178878,
|
|
"num_tokens": 9525436.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.573974609375,
|
|
"epoch": 0.09523809523809523,
|
|
"grad_norm": 4.112783175310539,
|
|
"learning_rate": 2.8947368421052634e-06,
|
|
"loss": 1.2744,
|
|
"mean_token_accuracy": 0.6722556869499385,
|
|
"num_tokens": 10358777.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.5698089599609375,
|
|
"epoch": 0.10317460317460317,
|
|
"grad_norm": 3.231368850972016,
|
|
"learning_rate": 3.157894736842105e-06,
|
|
"loss": 1.1762,
|
|
"mean_token_accuracy": 0.6900928025133908,
|
|
"num_tokens": 11211677.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.568206787109375,
|
|
"epoch": 0.1111111111111111,
|
|
"grad_norm": 3.2472932637870775,
|
|
"learning_rate": 3.421052631578948e-06,
|
|
"loss": 1.1595,
|
|
"mean_token_accuracy": 0.6925145331770182,
|
|
"num_tokens": 12067363.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.5522003173828125,
|
|
"epoch": 0.11904761904761904,
|
|
"grad_norm": 2.955470813245027,
|
|
"learning_rate": 3.6842105263157896e-06,
|
|
"loss": 1.1484,
|
|
"mean_token_accuracy": 0.6949247056618333,
|
|
"num_tokens": 12945458.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.5524139404296875,
|
|
"epoch": 0.12698412698412698,
|
|
"grad_norm": 2.953534618001751,
|
|
"learning_rate": 3.947368421052632e-06,
|
|
"loss": 1.1231,
|
|
"mean_token_accuracy": 0.6996096298098564,
|
|
"num_tokens": 13815066.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.5273590087890625,
|
|
"epoch": 0.1349206349206349,
|
|
"grad_norm": 3.6738847590405457,
|
|
"learning_rate": 4.210526315789474e-06,
|
|
"loss": 1.0456,
|
|
"mean_token_accuracy": 0.715466492343694,
|
|
"num_tokens": 14685173.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.5328826904296875,
|
|
"epoch": 0.14285714285714285,
|
|
"grad_norm": 3.975231208187267,
|
|
"learning_rate": 4.473684210526316e-06,
|
|
"loss": 1.0218,
|
|
"mean_token_accuracy": 0.7184609142132103,
|
|
"num_tokens": 15522062.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5339813232421875,
|
|
"epoch": 0.15079365079365079,
|
|
"grad_norm": 3.7568291660582056,
|
|
"learning_rate": 4.736842105263158e-06,
|
|
"loss": 0.9846,
|
|
"mean_token_accuracy": 0.7254857295192778,
|
|
"num_tokens": 16388252.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.539520263671875,
|
|
"epoch": 0.15873015873015872,
|
|
"grad_norm": 3.256772081860255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.9572,
|
|
"mean_token_accuracy": 0.7309625665657222,
|
|
"num_tokens": 17235205.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.5419464111328125,
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 2.7287809182707607,
|
|
"learning_rate": 5.263157894736842e-06,
|
|
"loss": 0.9449,
|
|
"mean_token_accuracy": 0.734284377656877,
|
|
"num_tokens": 18090069.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.541290283203125,
|
|
"epoch": 0.1746031746031746,
|
|
"grad_norm": 2.661465266751511,
|
|
"learning_rate": 5.526315789473685e-06,
|
|
"loss": 0.9245,
|
|
"mean_token_accuracy": 0.7380136135034263,
|
|
"num_tokens": 18955962.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.5420989990234375,
|
|
"epoch": 0.18253968253968253,
|
|
"grad_norm": 2.1871180143795454,
|
|
"learning_rate": 5.789473684210527e-06,
|
|
"loss": 0.8806,
|
|
"mean_token_accuracy": 0.7492561861872673,
|
|
"num_tokens": 19812261.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.5357818603515625,
|
|
"epoch": 0.19047619047619047,
|
|
"grad_norm": 2.9467428670698266,
|
|
"learning_rate": 6.0526315789473685e-06,
|
|
"loss": 0.8643,
|
|
"mean_token_accuracy": 0.7490303930826485,
|
|
"num_tokens": 20647709.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.52435302734375,
|
|
"epoch": 0.1984126984126984,
|
|
"grad_norm": 2.907660231508903,
|
|
"learning_rate": 6.31578947368421e-06,
|
|
"loss": 0.861,
|
|
"mean_token_accuracy": 0.7502002012915909,
|
|
"num_tokens": 21486371.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.5368194580078125,
|
|
"epoch": 0.20634920634920634,
|
|
"grad_norm": 2.490760554791867,
|
|
"learning_rate": 6.578947368421054e-06,
|
|
"loss": 0.8441,
|
|
"mean_token_accuracy": 0.7527890643104911,
|
|
"num_tokens": 22318414.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.5297393798828125,
|
|
"epoch": 0.21428571428571427,
|
|
"grad_norm": 2.1289542070292082,
|
|
"learning_rate": 6.842105263157896e-06,
|
|
"loss": 0.8129,
|
|
"mean_token_accuracy": 0.7607404845766723,
|
|
"num_tokens": 23134604.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.51568603515625,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.8284519133414685,
|
|
"learning_rate": 7.1052631578947375e-06,
|
|
"loss": 0.8081,
|
|
"mean_token_accuracy": 0.7623137319460511,
|
|
"num_tokens": 24009770.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.52362060546875,
|
|
"epoch": 0.23015873015873015,
|
|
"grad_norm": 1.9471502088574602,
|
|
"learning_rate": 7.368421052631579e-06,
|
|
"loss": 0.8048,
|
|
"mean_token_accuracy": 0.762959006242454,
|
|
"num_tokens": 24869806.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.512420654296875,
|
|
"epoch": 0.23809523809523808,
|
|
"grad_norm": 1.745367819572919,
|
|
"learning_rate": 7.631578947368423e-06,
|
|
"loss": 0.7896,
|
|
"mean_token_accuracy": 0.7669625347480178,
|
|
"num_tokens": 25762999.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.5128631591796875,
|
|
"epoch": 0.24603174603174602,
|
|
"grad_norm": 2.0285955824692774,
|
|
"learning_rate": 7.894736842105265e-06,
|
|
"loss": 0.7605,
|
|
"mean_token_accuracy": 0.77361392788589,
|
|
"num_tokens": 26651412.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.51251220703125,
|
|
"epoch": 0.25396825396825395,
|
|
"grad_norm": 1.8504240732599995,
|
|
"learning_rate": 8.157894736842106e-06,
|
|
"loss": 0.7568,
|
|
"mean_token_accuracy": 0.7740332204848528,
|
|
"num_tokens": 27520069.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.5124359130859375,
|
|
"epoch": 0.2619047619047619,
|
|
"grad_norm": 1.5796846536434936,
|
|
"learning_rate": 8.421052631578948e-06,
|
|
"loss": 0.7509,
|
|
"mean_token_accuracy": 0.7743185707367957,
|
|
"num_tokens": 28367541.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.506439208984375,
|
|
"epoch": 0.2698412698412698,
|
|
"grad_norm": 1.3378034052723207,
|
|
"learning_rate": 8.68421052631579e-06,
|
|
"loss": 0.7161,
|
|
"mean_token_accuracy": 0.7827055719681084,
|
|
"num_tokens": 29217570.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.4996490478515625,
|
|
"epoch": 0.2777777777777778,
|
|
"grad_norm": 1.679208635550248,
|
|
"learning_rate": 8.947368421052632e-06,
|
|
"loss": 0.7187,
|
|
"mean_token_accuracy": 0.7833654824644327,
|
|
"num_tokens": 30088869.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.4967041015625,
|
|
"epoch": 0.2857142857142857,
|
|
"grad_norm": 1.6387552610606482,
|
|
"learning_rate": 9.210526315789474e-06,
|
|
"loss": 0.7145,
|
|
"mean_token_accuracy": 0.7830667225643992,
|
|
"num_tokens": 30959821.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.500274658203125,
|
|
"epoch": 0.29365079365079366,
|
|
"grad_norm": 1.371112117801989,
|
|
"learning_rate": 9.473684210526315e-06,
|
|
"loss": 0.7097,
|
|
"mean_token_accuracy": 0.7837298880331218,
|
|
"num_tokens": 31830767.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.5020751953125,
|
|
"epoch": 0.30158730158730157,
|
|
"grad_norm": 1.6810348517072142,
|
|
"learning_rate": 9.736842105263159e-06,
|
|
"loss": 0.6954,
|
|
"mean_token_accuracy": 0.7868552934378386,
|
|
"num_tokens": 32692726.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.5,
|
|
"epoch": 0.30952380952380953,
|
|
"grad_norm": 1.3054950683288973,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.669,
|
|
"mean_token_accuracy": 0.7914999099448323,
|
|
"num_tokens": 33543076.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.4976806640625,
|
|
"epoch": 0.31746031746031744,
|
|
"grad_norm": 1.5523188226474438,
|
|
"learning_rate": 9.99995213807381e-06,
|
|
"loss": 0.677,
|
|
"mean_token_accuracy": 0.7901066686026752,
|
|
"num_tokens": 34404352.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.5042724609375,
|
|
"epoch": 0.3253968253968254,
|
|
"grad_norm": 1.3329229488550147,
|
|
"learning_rate": 9.99980855321154e-06,
|
|
"loss": 0.6721,
|
|
"mean_token_accuracy": 0.7911685910075903,
|
|
"num_tokens": 35236933.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.491058349609375,
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 1.2836381282552631,
|
|
"learning_rate": 9.999569248162095e-06,
|
|
"loss": 0.662,
|
|
"mean_token_accuracy": 0.7938097110018134,
|
|
"num_tokens": 36088050.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.491973876953125,
|
|
"epoch": 0.3412698412698413,
|
|
"grad_norm": 1.5436048820238097,
|
|
"learning_rate": 9.999234227506912e-06,
|
|
"loss": 0.659,
|
|
"mean_token_accuracy": 0.7951631429605186,
|
|
"num_tokens": 36942407.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.488128662109375,
|
|
"epoch": 0.3492063492063492,
|
|
"grad_norm": 1.5701418827048832,
|
|
"learning_rate": 9.998803497659885e-06,
|
|
"loss": 0.6567,
|
|
"mean_token_accuracy": 0.7952465042471886,
|
|
"num_tokens": 37803882.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.4854278564453125,
|
|
"epoch": 0.35714285714285715,
|
|
"grad_norm": 1.453745925853672,
|
|
"learning_rate": 9.998277066867236e-06,
|
|
"loss": 0.6557,
|
|
"mean_token_accuracy": 0.7963202544488013,
|
|
"num_tokens": 38667329.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.4832763671875,
|
|
"epoch": 0.36507936507936506,
|
|
"grad_norm": 1.4543393351762242,
|
|
"learning_rate": 9.997654945207368e-06,
|
|
"loss": 0.6415,
|
|
"mean_token_accuracy": 0.7979013016447425,
|
|
"num_tokens": 39524166.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.47686767578125,
|
|
"epoch": 0.373015873015873,
|
|
"grad_norm": 1.2844075952499214,
|
|
"learning_rate": 9.99693714459065e-06,
|
|
"loss": 0.6314,
|
|
"mean_token_accuracy": 0.8014799957163632,
|
|
"num_tokens": 40389693.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.485137939453125,
|
|
"epoch": 0.38095238095238093,
|
|
"grad_norm": 1.2055402617655024,
|
|
"learning_rate": 9.996123678759214e-06,
|
|
"loss": 0.6304,
|
|
"mean_token_accuracy": 0.8023288743570447,
|
|
"num_tokens": 41231841.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.4888763427734375,
|
|
"epoch": 0.3888888888888889,
|
|
"grad_norm": 1.562622920851612,
|
|
"learning_rate": 9.995214563286677e-06,
|
|
"loss": 0.6315,
|
|
"mean_token_accuracy": 0.8009699960239232,
|
|
"num_tokens": 42082897.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.4813079833984375,
|
|
"epoch": 0.3968253968253968,
|
|
"grad_norm": 1.1138521900837535,
|
|
"learning_rate": 9.994209815577843e-06,
|
|
"loss": 0.6138,
|
|
"mean_token_accuracy": 0.8051781668327749,
|
|
"num_tokens": 42941458.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.4780731201171875,
|
|
"epoch": 0.40476190476190477,
|
|
"grad_norm": 1.4023404249187725,
|
|
"learning_rate": 9.993109454868379e-06,
|
|
"loss": 0.6123,
|
|
"mean_token_accuracy": 0.805035431869328,
|
|
"num_tokens": 43801380.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.4847564697265625,
|
|
"epoch": 0.4126984126984127,
|
|
"grad_norm": 1.1362760383931592,
|
|
"learning_rate": 9.991913502224438e-06,
|
|
"loss": 0.6252,
|
|
"mean_token_accuracy": 0.8026665896177292,
|
|
"num_tokens": 44671335.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.4954071044921875,
|
|
"epoch": 0.42063492063492064,
|
|
"grad_norm": 1.2459663164920876,
|
|
"learning_rate": 9.990621980542258e-06,
|
|
"loss": 0.6116,
|
|
"mean_token_accuracy": 0.8040637490339577,
|
|
"num_tokens": 45508166.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.479949951171875,
|
|
"epoch": 0.42857142857142855,
|
|
"grad_norm": 1.3916959345611162,
|
|
"learning_rate": 9.989234914547725e-06,
|
|
"loss": 0.6209,
|
|
"mean_token_accuracy": 0.803072199691087,
|
|
"num_tokens": 46398974.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.472991943359375,
|
|
"epoch": 0.4365079365079365,
|
|
"grad_norm": 1.2293838348325725,
|
|
"learning_rate": 9.9877523307959e-06,
|
|
"loss": 0.619,
|
|
"mean_token_accuracy": 0.8017513235099614,
|
|
"num_tokens": 47311746.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.477813720703125,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.3910584256010747,
|
|
"learning_rate": 9.986174257670509e-06,
|
|
"loss": 0.5928,
|
|
"mean_token_accuracy": 0.8104538763873279,
|
|
"num_tokens": 48177761.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.4864501953125,
|
|
"epoch": 0.4523809523809524,
|
|
"grad_norm": 1.398508282793997,
|
|
"learning_rate": 9.984500725383397e-06,
|
|
"loss": 0.5977,
|
|
"mean_token_accuracy": 0.8092257836833596,
|
|
"num_tokens": 49026375.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.4769744873046875,
|
|
"epoch": 0.4603174603174603,
|
|
"grad_norm": 1.5292432492685464,
|
|
"learning_rate": 9.98273176597396e-06,
|
|
"loss": 0.6023,
|
|
"mean_token_accuracy": 0.8064105985686183,
|
|
"num_tokens": 49889163.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.484161376953125,
|
|
"epoch": 0.46825396825396826,
|
|
"grad_norm": 1.2391735146187919,
|
|
"learning_rate": 9.980867413308516e-06,
|
|
"loss": 0.5945,
|
|
"mean_token_accuracy": 0.8099995800293982,
|
|
"num_tokens": 50735361.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.4808349609375,
|
|
"epoch": 0.47619047619047616,
|
|
"grad_norm": 1.1506873672230502,
|
|
"learning_rate": 9.978907703079672e-06,
|
|
"loss": 0.5839,
|
|
"mean_token_accuracy": 0.8109453665092587,
|
|
"num_tokens": 51597577.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.4707794189453125,
|
|
"epoch": 0.48412698412698413,
|
|
"grad_norm": 1.1563025667287559,
|
|
"learning_rate": 9.976852672805625e-06,
|
|
"loss": 0.5933,
|
|
"mean_token_accuracy": 0.8103347043506801,
|
|
"num_tokens": 52482382.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.47381591796875,
|
|
"epoch": 0.49206349206349204,
|
|
"grad_norm": 1.2114630802835358,
|
|
"learning_rate": 9.974702361829465e-06,
|
|
"loss": 0.587,
|
|
"mean_token_accuracy": 0.8098243419080973,
|
|
"num_tokens": 53370750.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.4756927490234375,
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.1306748677183436,
|
|
"learning_rate": 9.972456811318399e-06,
|
|
"loss": 0.5792,
|
|
"mean_token_accuracy": 0.8143061874434352,
|
|
"num_tokens": 54235189.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.47760009765625,
|
|
"epoch": 0.5079365079365079,
|
|
"grad_norm": 1.1104384579683657,
|
|
"learning_rate": 9.970116064262975e-06,
|
|
"loss": 0.5711,
|
|
"mean_token_accuracy": 0.814652734901756,
|
|
"num_tokens": 55066936.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.475128173828125,
|
|
"epoch": 0.5158730158730159,
|
|
"grad_norm": 1.130873737994674,
|
|
"learning_rate": 9.96768016547626e-06,
|
|
"loss": 0.5749,
|
|
"mean_token_accuracy": 0.8144003404304385,
|
|
"num_tokens": 55922405.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.4756317138671875,
|
|
"epoch": 0.5238095238095238,
|
|
"grad_norm": 1.2543626844594944,
|
|
"learning_rate": 9.965149161592973e-06,
|
|
"loss": 0.5717,
|
|
"mean_token_accuracy": 0.8129943162202835,
|
|
"num_tokens": 56770275.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.47332763671875,
|
|
"epoch": 0.5317460317460317,
|
|
"grad_norm": 1.164483305334561,
|
|
"learning_rate": 9.962523101068608e-06,
|
|
"loss": 0.5781,
|
|
"mean_token_accuracy": 0.8122508767992258,
|
|
"num_tokens": 57627870.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.475616455078125,
|
|
"epoch": 0.5396825396825397,
|
|
"grad_norm": 1.0960487524124647,
|
|
"learning_rate": 9.959802034178489e-06,
|
|
"loss": 0.5661,
|
|
"mean_token_accuracy": 0.817779887933284,
|
|
"num_tokens": 58469884.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.4727325439453125,
|
|
"epoch": 0.5476190476190477,
|
|
"grad_norm": 1.0760531755759473,
|
|
"learning_rate": 9.956986013016816e-06,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.815218704752624,
|
|
"num_tokens": 59323796.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.472900390625,
|
|
"epoch": 0.5555555555555556,
|
|
"grad_norm": 1.150431448189494,
|
|
"learning_rate": 9.954075091495669e-06,
|
|
"loss": 0.5564,
|
|
"mean_token_accuracy": 0.8179643992334604,
|
|
"num_tokens": 60183841.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.4703521728515625,
|
|
"epoch": 0.5634920634920635,
|
|
"grad_norm": 1.0833066927711814,
|
|
"learning_rate": 9.951069325343972e-06,
|
|
"loss": 0.5668,
|
|
"mean_token_accuracy": 0.8150419541634619,
|
|
"num_tokens": 61048601.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.472991943359375,
|
|
"epoch": 0.5714285714285714,
|
|
"grad_norm": 1.1267753740162227,
|
|
"learning_rate": 9.947968772106428e-06,
|
|
"loss": 0.5683,
|
|
"mean_token_accuracy": 0.8151015248149633,
|
|
"num_tokens": 61887912.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.468231201171875,
|
|
"epoch": 0.5793650793650794,
|
|
"grad_norm": 1.0041998917193997,
|
|
"learning_rate": 9.944773491142416e-06,
|
|
"loss": 0.561,
|
|
"mean_token_accuracy": 0.8171937335282564,
|
|
"num_tokens": 62741342.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.464263916015625,
|
|
"epoch": 0.5873015873015873,
|
|
"grad_norm": 1.0641654456452165,
|
|
"learning_rate": 9.94148354362486e-06,
|
|
"loss": 0.5591,
|
|
"mean_token_accuracy": 0.8203174020163715,
|
|
"num_tokens": 63594617.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.4626922607421875,
|
|
"epoch": 0.5952380952380952,
|
|
"grad_norm": 1.0150387789211928,
|
|
"learning_rate": 9.938098992539045e-06,
|
|
"loss": 0.5534,
|
|
"mean_token_accuracy": 0.8201709003187716,
|
|
"num_tokens": 64467695.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.4632568359375,
|
|
"epoch": 0.6031746031746031,
|
|
"grad_norm": 1.1279871018458427,
|
|
"learning_rate": 9.93461990268143e-06,
|
|
"loss": 0.5589,
|
|
"mean_token_accuracy": 0.8168505723588169,
|
|
"num_tokens": 65333788.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.4673309326171875,
|
|
"epoch": 0.6111111111111112,
|
|
"grad_norm": 1.1373063081267694,
|
|
"learning_rate": 9.931046340658387e-06,
|
|
"loss": 0.5494,
|
|
"mean_token_accuracy": 0.8186001246795058,
|
|
"num_tokens": 66178918.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.4654693603515625,
|
|
"epoch": 0.6190476190476191,
|
|
"grad_norm": 0.987490904917509,
|
|
"learning_rate": 9.927378374884947e-06,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8173182448372245,
|
|
"num_tokens": 67052342.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.4614715576171875,
|
|
"epoch": 0.626984126984127,
|
|
"grad_norm": 1.158480909536903,
|
|
"learning_rate": 9.923616075583465e-06,
|
|
"loss": 0.5521,
|
|
"mean_token_accuracy": 0.8191495719365776,
|
|
"num_tokens": 67913391.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.46258544921875,
|
|
"epoch": 0.6349206349206349,
|
|
"grad_norm": 1.0694507889690312,
|
|
"learning_rate": 9.919759514782304e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.8191684056073427,
|
|
"num_tokens": 68772115.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.4591064453125,
|
|
"epoch": 0.6428571428571429,
|
|
"grad_norm": 1.031285333594921,
|
|
"learning_rate": 9.91580876631443e-06,
|
|
"loss": 0.5395,
|
|
"mean_token_accuracy": 0.8231914453208447,
|
|
"num_tokens": 69611653.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.4618072509765625,
|
|
"epoch": 0.6507936507936508,
|
|
"grad_norm": 1.2191568231885717,
|
|
"learning_rate": 9.91176390581602e-06,
|
|
"loss": 0.5609,
|
|
"mean_token_accuracy": 0.8178833266720176,
|
|
"num_tokens": 70496952.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.4582672119140625,
|
|
"epoch": 0.6587301587301587,
|
|
"grad_norm": 1.098798237502869,
|
|
"learning_rate": 9.907625010724999e-06,
|
|
"loss": 0.5426,
|
|
"mean_token_accuracy": 0.8210734003223479,
|
|
"num_tokens": 71343921.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.45758056640625,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.0293307451137137,
|
|
"learning_rate": 9.903392160279564e-06,
|
|
"loss": 0.5547,
|
|
"mean_token_accuracy": 0.8177660717628896,
|
|
"num_tokens": 72240608.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.4568939208984375,
|
|
"epoch": 0.6746031746031746,
|
|
"grad_norm": 1.1591213276198025,
|
|
"learning_rate": 9.899065435516661e-06,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8213843265548348,
|
|
"num_tokens": 73118452.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.45391845703125,
|
|
"epoch": 0.6825396825396826,
|
|
"grad_norm": 1.0203407509108033,
|
|
"learning_rate": 9.894644919270448e-06,
|
|
"loss": 0.5482,
|
|
"mean_token_accuracy": 0.8173369145952165,
|
|
"num_tokens": 73991069.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.4616546630859375,
|
|
"epoch": 0.6904761904761905,
|
|
"grad_norm": 1.2761061587844562,
|
|
"learning_rate": 9.890130696170691e-06,
|
|
"loss": 0.5398,
|
|
"mean_token_accuracy": 0.8226454192772508,
|
|
"num_tokens": 74839901.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.4576416015625,
|
|
"epoch": 0.6984126984126984,
|
|
"grad_norm": 1.0577284033847283,
|
|
"learning_rate": 9.885522852641156e-06,
|
|
"loss": 0.5527,
|
|
"mean_token_accuracy": 0.8187054474838078,
|
|
"num_tokens": 75749725.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.465087890625,
|
|
"epoch": 0.7063492063492064,
|
|
"grad_norm": 1.1084468078084884,
|
|
"learning_rate": 9.880821476897948e-06,
|
|
"loss": 0.5456,
|
|
"mean_token_accuracy": 0.8205522131174803,
|
|
"num_tokens": 76593690.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.466217041015625,
|
|
"epoch": 0.7142857142857143,
|
|
"grad_norm": 1.1432698793337261,
|
|
"learning_rate": 9.87602665894783e-06,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8226723484694958,
|
|
"num_tokens": 77431030.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.4627227783203125,
|
|
"epoch": 0.7222222222222222,
|
|
"grad_norm": 1.2237349934344866,
|
|
"learning_rate": 9.871138490586489e-06,
|
|
"loss": 0.54,
|
|
"mean_token_accuracy": 0.8220892632380128,
|
|
"num_tokens": 78278605.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.457794189453125,
|
|
"epoch": 0.7301587301587301,
|
|
"grad_norm": 0.9735987604661573,
|
|
"learning_rate": 9.866157065396784e-06,
|
|
"loss": 0.5336,
|
|
"mean_token_accuracy": 0.8227689885534346,
|
|
"num_tokens": 79154216.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.4553680419921875,
|
|
"epoch": 0.7380952380952381,
|
|
"grad_norm": 1.2060097781830723,
|
|
"learning_rate": 9.861082478746962e-06,
|
|
"loss": 0.5453,
|
|
"mean_token_accuracy": 0.8193097808398306,
|
|
"num_tokens": 80039204.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.452606201171875,
|
|
"epoch": 0.746031746031746,
|
|
"grad_norm": 1.219116442820296,
|
|
"learning_rate": 9.855914827788814e-06,
|
|
"loss": 0.537,
|
|
"mean_token_accuracy": 0.8233561674132943,
|
|
"num_tokens": 80910348.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.454315185546875,
|
|
"epoch": 0.753968253968254,
|
|
"grad_norm": 1.1176715433890458,
|
|
"learning_rate": 9.850654211455837e-06,
|
|
"loss": 0.5371,
|
|
"mean_token_accuracy": 0.8221336985006928,
|
|
"num_tokens": 81765325.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.4631195068359375,
|
|
"epoch": 0.7619047619047619,
|
|
"grad_norm": 0.9620936989799753,
|
|
"learning_rate": 9.84530073046132e-06,
|
|
"loss": 0.5288,
|
|
"mean_token_accuracy": 0.8240236868150532,
|
|
"num_tokens": 82611125.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.454254150390625,
|
|
"epoch": 0.7698412698412699,
|
|
"grad_norm": 1.0990268102179221,
|
|
"learning_rate": 9.83985448729643e-06,
|
|
"loss": 0.5376,
|
|
"mean_token_accuracy": 0.8205642709508538,
|
|
"num_tokens": 83488021.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.45806884765625,
|
|
"epoch": 0.7777777777777778,
|
|
"grad_norm": 1.1036853861455271,
|
|
"learning_rate": 9.83431558622824e-06,
|
|
"loss": 0.533,
|
|
"mean_token_accuracy": 0.8217868432402611,
|
|
"num_tokens": 84321775.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.4533843994140625,
|
|
"epoch": 0.7857142857142857,
|
|
"grad_norm": 1.1621319158175323,
|
|
"learning_rate": 9.828684133297738e-06,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.8267802041955292,
|
|
"num_tokens": 85167087.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.453704833984375,
|
|
"epoch": 0.7936507936507936,
|
|
"grad_norm": 1.01710948689366,
|
|
"learning_rate": 9.822960236317804e-06,
|
|
"loss": 0.5324,
|
|
"mean_token_accuracy": 0.824018832296133,
|
|
"num_tokens": 86009383.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.4491729736328125,
|
|
"epoch": 0.8015873015873016,
|
|
"grad_norm": 1.136196189273348,
|
|
"learning_rate": 9.817144004871127e-06,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.8261763895861804,
|
|
"num_tokens": 86862628.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.447357177734375,
|
|
"epoch": 0.8095238095238095,
|
|
"grad_norm": 1.1653574778770674,
|
|
"learning_rate": 9.811235550308127e-06,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.8248972818255424,
|
|
"num_tokens": 87713395.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.4496612548828125,
|
|
"epoch": 0.8174603174603174,
|
|
"grad_norm": 1.0197520810298888,
|
|
"learning_rate": 9.805234985744804e-06,
|
|
"loss": 0.5374,
|
|
"mean_token_accuracy": 0.8215886438265443,
|
|
"num_tokens": 88602545.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.45355224609375,
|
|
"epoch": 0.8253968253968254,
|
|
"grad_norm": 1.0746283719190985,
|
|
"learning_rate": 9.799142426060595e-06,
|
|
"loss": 0.5211,
|
|
"mean_token_accuracy": 0.8281928705982864,
|
|
"num_tokens": 89444255.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.4513702392578125,
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 0.9405502354816935,
|
|
"learning_rate": 9.792957987896154e-06,
|
|
"loss": 0.5164,
|
|
"mean_token_accuracy": 0.8303409847430885,
|
|
"num_tokens": 90312774.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.4513397216796875,
|
|
"epoch": 0.8412698412698413,
|
|
"grad_norm": 0.9828002484074289,
|
|
"learning_rate": 9.786681789651134e-06,
|
|
"loss": 0.5167,
|
|
"mean_token_accuracy": 0.8268536222167313,
|
|
"num_tokens": 91177447.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.452178955078125,
|
|
"epoch": 0.8492063492063492,
|
|
"grad_norm": 1.0029391226175481,
|
|
"learning_rate": 9.780313951481904e-06,
|
|
"loss": 0.5159,
|
|
"mean_token_accuracy": 0.8277767463587224,
|
|
"num_tokens": 92026164.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.448455810546875,
|
|
"epoch": 0.8571428571428571,
|
|
"grad_norm": 1.0111683546148944,
|
|
"learning_rate": 9.773854595299269e-06,
|
|
"loss": 0.5158,
|
|
"mean_token_accuracy": 0.8292862558737397,
|
|
"num_tokens": 92891631.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.4544219970703125,
|
|
"epoch": 0.8650793650793651,
|
|
"grad_norm": 1.0693593999923847,
|
|
"learning_rate": 9.767303844766118e-06,
|
|
"loss": 0.5228,
|
|
"mean_token_accuracy": 0.8271551127545536,
|
|
"num_tokens": 93760535.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.4507293701171875,
|
|
"epoch": 0.873015873015873,
|
|
"grad_norm": 1.078077119226931,
|
|
"learning_rate": 9.760661825295068e-06,
|
|
"loss": 0.5188,
|
|
"mean_token_accuracy": 0.8268710542470217,
|
|
"num_tokens": 94599260.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.45849609375,
|
|
"epoch": 0.8809523809523809,
|
|
"grad_norm": 1.1500541245910931,
|
|
"learning_rate": 9.753928664046055e-06,
|
|
"loss": 0.5188,
|
|
"mean_token_accuracy": 0.8258566916920245,
|
|
"num_tokens": 95425799.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.455718994140625,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 0.9411323026364743,
|
|
"learning_rate": 9.747104489923907e-06,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.8278644122183323,
|
|
"num_tokens": 96260271.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.44830322265625,
|
|
"epoch": 0.8968253968253969,
|
|
"grad_norm": 1.0496976356343377,
|
|
"learning_rate": 9.740189433575873e-06,
|
|
"loss": 0.5209,
|
|
"mean_token_accuracy": 0.8264745082706213,
|
|
"num_tokens": 97135925.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.4494781494140625,
|
|
"epoch": 0.9047619047619048,
|
|
"grad_norm": 1.1407666858849659,
|
|
"learning_rate": 9.733183627389117e-06,
|
|
"loss": 0.5223,
|
|
"mean_token_accuracy": 0.8253828585147858,
|
|
"num_tokens": 98011108.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.4512481689453125,
|
|
"epoch": 0.9126984126984127,
|
|
"grad_norm": 1.042415494174418,
|
|
"learning_rate": 9.726087205488192e-06,
|
|
"loss": 0.5122,
|
|
"mean_token_accuracy": 0.827279772143811,
|
|
"num_tokens": 98873029.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.4514007568359375,
|
|
"epoch": 0.9206349206349206,
|
|
"grad_norm": 1.2092755396739658,
|
|
"learning_rate": 9.718900303732465e-06,
|
|
"loss": 0.5205,
|
|
"mean_token_accuracy": 0.8262831498868763,
|
|
"num_tokens": 99734864.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.4513702392578125,
|
|
"epoch": 0.9285714285714286,
|
|
"grad_norm": 1.1637681555356034,
|
|
"learning_rate": 9.711623059713522e-06,
|
|
"loss": 0.525,
|
|
"mean_token_accuracy": 0.826652648858726,
|
|
"num_tokens": 100606926.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.45599365234375,
|
|
"epoch": 0.9365079365079365,
|
|
"grad_norm": 1.0209942008459238,
|
|
"learning_rate": 9.70425561275253e-06,
|
|
"loss": 0.5082,
|
|
"mean_token_accuracy": 0.8288828083314002,
|
|
"num_tokens": 101447599.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.4554595947265625,
|
|
"epoch": 0.9444444444444444,
|
|
"grad_norm": 1.0671983733664188,
|
|
"learning_rate": 9.696798103897567e-06,
|
|
"loss": 0.5105,
|
|
"mean_token_accuracy": 0.8294458598829806,
|
|
"num_tokens": 102275358.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.459686279296875,
|
|
"epoch": 0.9523809523809523,
|
|
"grad_norm": 1.043670325725254,
|
|
"learning_rate": 9.689250675920932e-06,
|
|
"loss": 0.5091,
|
|
"mean_token_accuracy": 0.8285032915882766,
|
|
"num_tokens": 103113293.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.4480743408203125,
|
|
"epoch": 0.9603174603174603,
|
|
"grad_norm": 0.9316811204172604,
|
|
"learning_rate": 9.6816134733164e-06,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.8276259712874889,
|
|
"num_tokens": 104000550.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.448577880859375,
|
|
"epoch": 0.9682539682539683,
|
|
"grad_norm": 1.0919546140808916,
|
|
"learning_rate": 9.67388664229646e-06,
|
|
"loss": 0.5134,
|
|
"mean_token_accuracy": 0.8282599356025457,
|
|
"num_tokens": 104859286.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.448150634765625,
|
|
"epoch": 0.9761904761904762,
|
|
"grad_norm": 0.999620140999962,
|
|
"learning_rate": 9.66607033078952e-06,
|
|
"loss": 0.5131,
|
|
"mean_token_accuracy": 0.827931288164109,
|
|
"num_tokens": 105729675.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.4438629150390625,
|
|
"epoch": 0.9841269841269841,
|
|
"grad_norm": 0.9685899298092844,
|
|
"learning_rate": 9.658164688437073e-06,
|
|
"loss": 0.5098,
|
|
"mean_token_accuracy": 0.8288385523483157,
|
|
"num_tokens": 106603968.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.44073486328125,
|
|
"epoch": 0.9920634920634921,
|
|
"grad_norm": 0.9358281617577068,
|
|
"learning_rate": 9.65016986659082e-06,
|
|
"loss": 0.5053,
|
|
"mean_token_accuracy": 0.8307707700878382,
|
|
"num_tokens": 107495007.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.4453887939453125,
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.0968390964445407,
|
|
"learning_rate": 9.642086018309798e-06,
|
|
"loss": 0.5189,
|
|
"mean_token_accuracy": 0.8269711588509381,
|
|
"num_tokens": 108364335.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.4580535888671875,
|
|
"epoch": 1.007936507936508,
|
|
"grad_norm": 1.0332257753415273,
|
|
"learning_rate": 9.63391329835742e-06,
|
|
"loss": 0.4924,
|
|
"mean_token_accuracy": 0.8341724565252662,
|
|
"num_tokens": 109202665.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.443359375,
|
|
"epoch": 1.0158730158730158,
|
|
"grad_norm": 0.9791939186386783,
|
|
"learning_rate": 9.625651863198538e-06,
|
|
"loss": 0.4937,
|
|
"mean_token_accuracy": 0.8334903731010854,
|
|
"num_tokens": 110061605.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.4462127685546875,
|
|
"epoch": 1.0238095238095237,
|
|
"grad_norm": 0.9767524502632213,
|
|
"learning_rate": 9.617301870996432e-06,
|
|
"loss": 0.4907,
|
|
"mean_token_accuracy": 0.8331266730092466,
|
|
"num_tokens": 110924491.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.4455108642578125,
|
|
"epoch": 1.0317460317460316,
|
|
"grad_norm": 1.0445136711209406,
|
|
"learning_rate": 9.608863481609784e-06,
|
|
"loss": 0.4924,
|
|
"mean_token_accuracy": 0.8337863022461534,
|
|
"num_tokens": 111773832.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.4438018798828125,
|
|
"epoch": 1.0396825396825398,
|
|
"grad_norm": 0.9414725400905224,
|
|
"learning_rate": 9.600336856589622e-06,
|
|
"loss": 0.4856,
|
|
"mean_token_accuracy": 0.835617205593735,
|
|
"num_tokens": 112637470.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.44439697265625,
|
|
"epoch": 1.0476190476190477,
|
|
"grad_norm": 0.9341197433359306,
|
|
"learning_rate": 9.591722159176229e-06,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8329057167284191,
|
|
"num_tokens": 113501590.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 0.4480133056640625,
|
|
"epoch": 1.0555555555555556,
|
|
"grad_norm": 0.9687991138519816,
|
|
"learning_rate": 9.583019554296004e-06,
|
|
"loss": 0.4921,
|
|
"mean_token_accuracy": 0.8349612141028047,
|
|
"num_tokens": 114360533.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 0.44232177734375,
|
|
"epoch": 1.0634920634920635,
|
|
"grad_norm": 0.9629260310473902,
|
|
"learning_rate": 9.574229208558322e-06,
|
|
"loss": 0.4899,
|
|
"mean_token_accuracy": 0.8340120441280305,
|
|
"num_tokens": 115231953.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 0.443511962890625,
|
|
"epoch": 1.0714285714285714,
|
|
"grad_norm": 0.904231182313968,
|
|
"learning_rate": 9.565351290252339e-06,
|
|
"loss": 0.4865,
|
|
"mean_token_accuracy": 0.8350210129283369,
|
|
"num_tokens": 116092221.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 0.443817138671875,
|
|
"epoch": 1.0793650793650793,
|
|
"grad_norm": 1.0016105467556813,
|
|
"learning_rate": 9.556385969343756e-06,
|
|
"loss": 0.493,
|
|
"mean_token_accuracy": 0.8340729284100235,
|
|
"num_tokens": 116984739.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 0.441619873046875,
|
|
"epoch": 1.0873015873015872,
|
|
"grad_norm": 1.0079401042157452,
|
|
"learning_rate": 9.547333417471589e-06,
|
|
"loss": 0.4921,
|
|
"mean_token_accuracy": 0.8338921638205647,
|
|
"num_tokens": 117852991.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 0.449127197265625,
|
|
"epoch": 1.0952380952380953,
|
|
"grad_norm": 1.0727711874297776,
|
|
"learning_rate": 9.538193807944864e-06,
|
|
"loss": 0.49,
|
|
"mean_token_accuracy": 0.8347254949621856,
|
|
"num_tokens": 118696927.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 0.4441070556640625,
|
|
"epoch": 1.1031746031746033,
|
|
"grad_norm": 1.1054420578884305,
|
|
"learning_rate": 9.528967315739308e-06,
|
|
"loss": 0.4899,
|
|
"mean_token_accuracy": 0.8341114274226129,
|
|
"num_tokens": 119569613.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 0.4412689208984375,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 1.0649123433307148,
|
|
"learning_rate": 9.519654117493996e-06,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8335490431636572,
|
|
"num_tokens": 120447089.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.441131591796875,
|
|
"epoch": 1.119047619047619,
|
|
"grad_norm": 0.9658018391507793,
|
|
"learning_rate": 9.510254391507971e-06,
|
|
"loss": 0.4839,
|
|
"mean_token_accuracy": 0.8368041082285345,
|
|
"num_tokens": 121314886.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 0.44622802734375,
|
|
"epoch": 1.126984126984127,
|
|
"grad_norm": 0.9540717616355562,
|
|
"learning_rate": 9.500768317736832e-06,
|
|
"loss": 0.4797,
|
|
"mean_token_accuracy": 0.837146339006722,
|
|
"num_tokens": 122167617.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 0.44293212890625,
|
|
"epoch": 1.1349206349206349,
|
|
"grad_norm": 1.0354981305656152,
|
|
"learning_rate": 9.49119607778928e-06,
|
|
"loss": 0.4849,
|
|
"mean_token_accuracy": 0.8341447049751878,
|
|
"num_tokens": 123013840.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 0.4395904541015625,
|
|
"epoch": 1.1428571428571428,
|
|
"grad_norm": 0.9923916024668512,
|
|
"learning_rate": 9.481537854923654e-06,
|
|
"loss": 0.477,
|
|
"mean_token_accuracy": 0.8377352114766836,
|
|
"num_tokens": 123876490.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 0.4472503662109375,
|
|
"epoch": 1.1507936507936507,
|
|
"grad_norm": 1.129639029346026,
|
|
"learning_rate": 9.471793834044416e-06,
|
|
"loss": 0.4853,
|
|
"mean_token_accuracy": 0.8350173779763281,
|
|
"num_tokens": 124713314.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 0.44598388671875,
|
|
"epoch": 1.1587301587301586,
|
|
"grad_norm": 1.186594461732162,
|
|
"learning_rate": 9.461964201698604e-06,
|
|
"loss": 0.4939,
|
|
"mean_token_accuracy": 0.8313662535510957,
|
|
"num_tokens": 125564746.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 0.43914794921875,
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 0.9201175898857195,
|
|
"learning_rate": 9.452049146072278e-06,
|
|
"loss": 0.4803,
|
|
"mean_token_accuracy": 0.8369712042622268,
|
|
"num_tokens": 126415997.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 0.445343017578125,
|
|
"epoch": 1.1746031746031746,
|
|
"grad_norm": 0.9688073354920573,
|
|
"learning_rate": 9.442048856986899e-06,
|
|
"loss": 0.4914,
|
|
"mean_token_accuracy": 0.8343577086925507,
|
|
"num_tokens": 127285760.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 0.4484710693359375,
|
|
"epoch": 1.1825396825396826,
|
|
"grad_norm": 1.0757588730703025,
|
|
"learning_rate": 9.431963525895709e-06,
|
|
"loss": 0.4946,
|
|
"mean_token_accuracy": 0.8332444536499679,
|
|
"num_tokens": 128153685.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 0.45037841796875,
|
|
"epoch": 1.1904761904761905,
|
|
"grad_norm": 1.0256311878794404,
|
|
"learning_rate": 9.421793345880055e-06,
|
|
"loss": 0.4789,
|
|
"mean_token_accuracy": 0.8380363639444113,
|
|
"num_tokens": 128990738.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.44927978515625,
|
|
"epoch": 1.1984126984126984,
|
|
"grad_norm": 1.0956116916076417,
|
|
"learning_rate": 9.4115385116457e-06,
|
|
"loss": 0.4919,
|
|
"mean_token_accuracy": 0.8342617130838335,
|
|
"num_tokens": 129848900.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 0.448333740234375,
|
|
"epoch": 1.2063492063492063,
|
|
"grad_norm": 1.0166065436311602,
|
|
"learning_rate": 9.401199219519088e-06,
|
|
"loss": 0.4878,
|
|
"mean_token_accuracy": 0.8344792602583766,
|
|
"num_tokens": 130724709.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 0.44781494140625,
|
|
"epoch": 1.2142857142857142,
|
|
"grad_norm": 0.9085771039074888,
|
|
"learning_rate": 9.390775667443602e-06,
|
|
"loss": 0.4761,
|
|
"mean_token_accuracy": 0.8378241760656238,
|
|
"num_tokens": 131582811.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 0.4468536376953125,
|
|
"epoch": 1.2222222222222223,
|
|
"grad_norm": 1.0868732902444567,
|
|
"learning_rate": 9.380268054975745e-06,
|
|
"loss": 0.4835,
|
|
"mean_token_accuracy": 0.8363062706775963,
|
|
"num_tokens": 132429743.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 0.4462738037109375,
|
|
"epoch": 1.2301587301587302,
|
|
"grad_norm": 1.0628171497183283,
|
|
"learning_rate": 9.36967658328135e-06,
|
|
"loss": 0.4854,
|
|
"mean_token_accuracy": 0.8348783804103732,
|
|
"num_tokens": 133291943.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 0.44970703125,
|
|
"epoch": 1.2380952380952381,
|
|
"grad_norm": 0.9804992859024957,
|
|
"learning_rate": 9.359001455131713e-06,
|
|
"loss": 0.4815,
|
|
"mean_token_accuracy": 0.8365443642251194,
|
|
"num_tokens": 134149814.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 0.44989013671875,
|
|
"epoch": 1.246031746031746,
|
|
"grad_norm": 1.1446452708449568,
|
|
"learning_rate": 9.34824287489971e-06,
|
|
"loss": 0.4728,
|
|
"mean_token_accuracy": 0.839170094113797,
|
|
"num_tokens": 134989406.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 0.4441986083984375,
|
|
"epoch": 1.253968253968254,
|
|
"grad_norm": 1.003349637480706,
|
|
"learning_rate": 9.337401048555892e-06,
|
|
"loss": 0.4688,
|
|
"mean_token_accuracy": 0.8404755499213934,
|
|
"num_tokens": 135832642.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 0.4422149658203125,
|
|
"epoch": 1.2619047619047619,
|
|
"grad_norm": 1.0839125563857472,
|
|
"learning_rate": 9.326476183664535e-06,
|
|
"loss": 0.4797,
|
|
"mean_token_accuracy": 0.837718007620424,
|
|
"num_tokens": 136724748.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 0.449066162109375,
|
|
"epoch": 1.2698412698412698,
|
|
"grad_norm": 1.0019921797525027,
|
|
"learning_rate": 9.315468489379668e-06,
|
|
"loss": 0.4788,
|
|
"mean_token_accuracy": 0.8362822770141065,
|
|
"num_tokens": 137570382.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.4457855224609375,
|
|
"epoch": 1.2777777777777777,
|
|
"grad_norm": 0.9885822793795929,
|
|
"learning_rate": 9.304378176441076e-06,
|
|
"loss": 0.4779,
|
|
"mean_token_accuracy": 0.8382827825844288,
|
|
"num_tokens": 138431194.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 0.4445953369140625,
|
|
"epoch": 1.2857142857142856,
|
|
"grad_norm": 1.040992724515245,
|
|
"learning_rate": 9.29320545717025e-06,
|
|
"loss": 0.4673,
|
|
"mean_token_accuracy": 0.840416397433728,
|
|
"num_tokens": 139287890.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 0.44342041015625,
|
|
"epoch": 1.2936507936507937,
|
|
"grad_norm": 0.9829545106372576,
|
|
"learning_rate": 9.281950545466336e-06,
|
|
"loss": 0.4814,
|
|
"mean_token_accuracy": 0.8361613317392766,
|
|
"num_tokens": 140160179.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 0.44000244140625,
|
|
"epoch": 1.3015873015873016,
|
|
"grad_norm": 1.0385970457713267,
|
|
"learning_rate": 9.27061365680204e-06,
|
|
"loss": 0.4803,
|
|
"mean_token_accuracy": 0.8369025052525103,
|
|
"num_tokens": 141002875.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 0.4385528564453125,
|
|
"epoch": 1.3095238095238095,
|
|
"grad_norm": 0.9812301520220873,
|
|
"learning_rate": 9.25919500821949e-06,
|
|
"loss": 0.471,
|
|
"mean_token_accuracy": 0.838864213321358,
|
|
"num_tokens": 141858286.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 0.4442901611328125,
|
|
"epoch": 1.3174603174603174,
|
|
"grad_norm": 1.0236518502998646,
|
|
"learning_rate": 9.247694818326092e-06,
|
|
"loss": 0.4711,
|
|
"mean_token_accuracy": 0.8398910835385323,
|
|
"num_tokens": 142698339.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 0.4426116943359375,
|
|
"epoch": 1.3253968253968254,
|
|
"grad_norm": 1.1382281143976174,
|
|
"learning_rate": 9.236113307290345e-06,
|
|
"loss": 0.4742,
|
|
"mean_token_accuracy": 0.837109467945993,
|
|
"num_tokens": 143561112.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 0.442718505859375,
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 0.9746191934401784,
|
|
"learning_rate": 9.224450696837617e-06,
|
|
"loss": 0.4756,
|
|
"mean_token_accuracy": 0.8384752809070051,
|
|
"num_tokens": 144390223.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 0.4402008056640625,
|
|
"epoch": 1.3412698412698414,
|
|
"grad_norm": 0.9749642677850219,
|
|
"learning_rate": 9.212707210245908e-06,
|
|
"loss": 0.4881,
|
|
"mean_token_accuracy": 0.8348734346218407,
|
|
"num_tokens": 145276276.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 0.4441070556640625,
|
|
"epoch": 1.3492063492063493,
|
|
"grad_norm": 1.0438747532350088,
|
|
"learning_rate": 9.200883072341573e-06,
|
|
"loss": 0.4761,
|
|
"mean_token_accuracy": 0.8384365830570459,
|
|
"num_tokens": 146148957.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.4490203857421875,
|
|
"epoch": 1.3571428571428572,
|
|
"grad_norm": 0.9016207370694161,
|
|
"learning_rate": 9.188978509495022e-06,
|
|
"loss": 0.475,
|
|
"mean_token_accuracy": 0.8379638059996068,
|
|
"num_tokens": 146999892.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 0.4430694580078125,
|
|
"epoch": 1.3650793650793651,
|
|
"grad_norm": 0.8815407085280926,
|
|
"learning_rate": 9.176993749616374e-06,
|
|
"loss": 0.4768,
|
|
"mean_token_accuracy": 0.8367542624473572,
|
|
"num_tokens": 147888947.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 0.451995849609375,
|
|
"epoch": 1.373015873015873,
|
|
"grad_norm": 0.9246528792063293,
|
|
"learning_rate": 9.164929022151106e-06,
|
|
"loss": 0.4871,
|
|
"mean_token_accuracy": 0.8344444935210049,
|
|
"num_tokens": 148771994.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 0.444549560546875,
|
|
"epoch": 1.380952380952381,
|
|
"grad_norm": 0.9018527258286749,
|
|
"learning_rate": 9.15278455807566e-06,
|
|
"loss": 0.4715,
|
|
"mean_token_accuracy": 0.8390994230285287,
|
|
"num_tokens": 149626353.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 0.4414825439453125,
|
|
"epoch": 1.3888888888888888,
|
|
"grad_norm": 0.8544852116993605,
|
|
"learning_rate": 9.140560589893012e-06,
|
|
"loss": 0.4697,
|
|
"mean_token_accuracy": 0.8393202098086476,
|
|
"num_tokens": 150484433.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 0.4454803466796875,
|
|
"epoch": 1.3968253968253967,
|
|
"grad_norm": 1.0071897404357584,
|
|
"learning_rate": 9.128257351628224e-06,
|
|
"loss": 0.473,
|
|
"mean_token_accuracy": 0.8398340521380305,
|
|
"num_tokens": 151351171.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 0.440704345703125,
|
|
"epoch": 1.4047619047619047,
|
|
"grad_norm": 1.0906938840190845,
|
|
"learning_rate": 9.115875078823975e-06,
|
|
"loss": 0.4829,
|
|
"mean_token_accuracy": 0.8349933759309351,
|
|
"num_tokens": 152198704.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 0.4438629150390625,
|
|
"epoch": 1.4126984126984126,
|
|
"grad_norm": 0.9630152565863848,
|
|
"learning_rate": 9.103414008536029e-06,
|
|
"loss": 0.4762,
|
|
"mean_token_accuracy": 0.8377989139407873,
|
|
"num_tokens": 153027562.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 0.4464569091796875,
|
|
"epoch": 1.4206349206349207,
|
|
"grad_norm": 0.968332662888831,
|
|
"learning_rate": 9.09087437932872e-06,
|
|
"loss": 0.47,
|
|
"mean_token_accuracy": 0.8383941231295466,
|
|
"num_tokens": 153863890.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 0.4459228515625,
|
|
"epoch": 1.4285714285714286,
|
|
"grad_norm": 0.8682147767187481,
|
|
"learning_rate": 9.07825643127037e-06,
|
|
"loss": 0.477,
|
|
"mean_token_accuracy": 0.8366480157710612,
|
|
"num_tokens": 154707913.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.4376068115234375,
|
|
"epoch": 1.4365079365079365,
|
|
"grad_norm": 0.897993109890026,
|
|
"learning_rate": 9.065560405928699e-06,
|
|
"loss": 0.4756,
|
|
"mean_token_accuracy": 0.8380651730112731,
|
|
"num_tokens": 155590050.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 0.4394073486328125,
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 0.9388047872340883,
|
|
"learning_rate": 9.0527865463662e-06,
|
|
"loss": 0.4709,
|
|
"mean_token_accuracy": 0.8392384983599186,
|
|
"num_tokens": 156449879.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 0.440673828125,
|
|
"epoch": 1.4523809523809523,
|
|
"grad_norm": 0.8193746423443552,
|
|
"learning_rate": 9.039935097135479e-06,
|
|
"loss": 0.4584,
|
|
"mean_token_accuracy": 0.8437643311917782,
|
|
"num_tokens": 157304143.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 0.43817138671875,
|
|
"epoch": 1.4603174603174602,
|
|
"grad_norm": 0.9374511059068703,
|
|
"learning_rate": 9.027006304274584e-06,
|
|
"loss": 0.4748,
|
|
"mean_token_accuracy": 0.8367259805090725,
|
|
"num_tokens": 158177988.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 0.4360198974609375,
|
|
"epoch": 1.4682539682539684,
|
|
"grad_norm": 0.8212121210922411,
|
|
"learning_rate": 9.014000415302286e-06,
|
|
"loss": 0.4783,
|
|
"mean_token_accuracy": 0.8371384800411761,
|
|
"num_tokens": 159060066.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 0.441680908203125,
|
|
"epoch": 1.4761904761904763,
|
|
"grad_norm": 0.8806995545843207,
|
|
"learning_rate": 9.000917679213344e-06,
|
|
"loss": 0.474,
|
|
"mean_token_accuracy": 0.8378347246907651,
|
|
"num_tokens": 159942986.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 0.4442596435546875,
|
|
"epoch": 1.4841269841269842,
|
|
"grad_norm": 0.8463270672966281,
|
|
"learning_rate": 8.987758346473739e-06,
|
|
"loss": 0.4649,
|
|
"mean_token_accuracy": 0.8411796907894313,
|
|
"num_tokens": 160782816.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 0.4425201416015625,
|
|
"epoch": 1.492063492063492,
|
|
"grad_norm": 0.839482572154392,
|
|
"learning_rate": 8.974522669015872e-06,
|
|
"loss": 0.4672,
|
|
"mean_token_accuracy": 0.8414199482649565,
|
|
"num_tokens": 161643512.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 0.436920166015625,
|
|
"epoch": 1.5,
|
|
"grad_norm": 0.9422232012137579,
|
|
"learning_rate": 8.961210900233757e-06,
|
|
"loss": 0.4593,
|
|
"mean_token_accuracy": 0.8416055347770452,
|
|
"num_tokens": 162503001.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 0.4347076416015625,
|
|
"epoch": 1.507936507936508,
|
|
"grad_norm": 0.9312174445116989,
|
|
"learning_rate": 8.947823294978147e-06,
|
|
"loss": 0.4741,
|
|
"mean_token_accuracy": 0.8390083778649569,
|
|
"num_tokens": 163388010.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.4320220947265625,
|
|
"epoch": 1.5158730158730158,
|
|
"grad_norm": 0.8634591037958418,
|
|
"learning_rate": 8.934360109551671e-06,
|
|
"loss": 0.4694,
|
|
"mean_token_accuracy": 0.8393782819621265,
|
|
"num_tokens": 164270399.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 0.4329681396484375,
|
|
"epoch": 1.5238095238095237,
|
|
"grad_norm": 1.0268418396952028,
|
|
"learning_rate": 8.920821601703927e-06,
|
|
"loss": 0.4657,
|
|
"mean_token_accuracy": 0.8410811661742628,
|
|
"num_tokens": 165155523.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 0.4373931884765625,
|
|
"epoch": 1.5317460317460316,
|
|
"grad_norm": 0.911296138695116,
|
|
"learning_rate": 8.907208030626538e-06,
|
|
"loss": 0.4647,
|
|
"mean_token_accuracy": 0.8417128617875278,
|
|
"num_tokens": 166004219.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 0.436920166015625,
|
|
"epoch": 1.5396825396825395,
|
|
"grad_norm": 0.8615585964723216,
|
|
"learning_rate": 8.8935196569482e-06,
|
|
"loss": 0.4659,
|
|
"mean_token_accuracy": 0.841303990688175,
|
|
"num_tokens": 166887486.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 0.4330291748046875,
|
|
"epoch": 1.5476190476190477,
|
|
"grad_norm": 0.9022520563994237,
|
|
"learning_rate": 8.879756742729683e-06,
|
|
"loss": 0.4642,
|
|
"mean_token_accuracy": 0.842128555290401,
|
|
"num_tokens": 167743608.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 0.4404449462890625,
|
|
"epoch": 1.5555555555555556,
|
|
"grad_norm": 0.8427585056849268,
|
|
"learning_rate": 8.865919551458823e-06,
|
|
"loss": 0.4638,
|
|
"mean_token_accuracy": 0.8412150857038796,
|
|
"num_tokens": 168602032.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 0.43634033203125,
|
|
"epoch": 1.5634920634920635,
|
|
"grad_norm": 0.9473332832713499,
|
|
"learning_rate": 8.852008348045468e-06,
|
|
"loss": 0.4713,
|
|
"mean_token_accuracy": 0.8384702135808766,
|
|
"num_tokens": 169469975.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 0.4295654296875,
|
|
"epoch": 1.5714285714285714,
|
|
"grad_norm": 0.8265269594435529,
|
|
"learning_rate": 8.838023398816417e-06,
|
|
"loss": 0.471,
|
|
"mean_token_accuracy": 0.8378414455801249,
|
|
"num_tokens": 170343282.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 0.4375,
|
|
"epoch": 1.5793650793650795,
|
|
"grad_norm": 0.9350879056767756,
|
|
"learning_rate": 8.823964971510313e-06,
|
|
"loss": 0.4701,
|
|
"mean_token_accuracy": 0.8392301532439888,
|
|
"num_tokens": 171227432.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 0.4355621337890625,
|
|
"epoch": 1.5873015873015874,
|
|
"grad_norm": 0.8262956094897539,
|
|
"learning_rate": 8.809833335272517e-06,
|
|
"loss": 0.4531,
|
|
"mean_token_accuracy": 0.8436351302079856,
|
|
"num_tokens": 172096305.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.4359893798828125,
|
|
"epoch": 1.5952380952380953,
|
|
"grad_norm": 0.8393643465691598,
|
|
"learning_rate": 8.795628760649965e-06,
|
|
"loss": 0.4552,
|
|
"mean_token_accuracy": 0.8432473209686577,
|
|
"num_tokens": 172910673.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 0.4252777099609375,
|
|
"epoch": 1.6031746031746033,
|
|
"grad_norm": 0.930458721360079,
|
|
"learning_rate": 8.781351519585978e-06,
|
|
"loss": 0.4602,
|
|
"mean_token_accuracy": 0.8418687861412764,
|
|
"num_tokens": 173775762.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 0.4301910400390625,
|
|
"epoch": 1.6111111111111112,
|
|
"grad_norm": 0.9255152673550228,
|
|
"learning_rate": 8.767001885415055e-06,
|
|
"loss": 0.4658,
|
|
"mean_token_accuracy": 0.8412896669469774,
|
|
"num_tokens": 174651858.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 0.4319915771484375,
|
|
"epoch": 1.619047619047619,
|
|
"grad_norm": 0.8156780804217264,
|
|
"learning_rate": 8.752580132857652e-06,
|
|
"loss": 0.4576,
|
|
"mean_token_accuracy": 0.8430444840341806,
|
|
"num_tokens": 175519282.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 0.4349212646484375,
|
|
"epoch": 1.626984126984127,
|
|
"grad_norm": 0.8770655718885645,
|
|
"learning_rate": 8.73808653801491e-06,
|
|
"loss": 0.4714,
|
|
"mean_token_accuracy": 0.8400326487608254,
|
|
"num_tokens": 176387199.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 0.430511474609375,
|
|
"epoch": 1.6349206349206349,
|
|
"grad_norm": 0.8757592238026767,
|
|
"learning_rate": 8.723521378363378e-06,
|
|
"loss": 0.4681,
|
|
"mean_token_accuracy": 0.8415880398824811,
|
|
"num_tokens": 177264131.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 0.43365478515625,
|
|
"epoch": 1.6428571428571428,
|
|
"grad_norm": 0.89706430686004,
|
|
"learning_rate": 8.70888493274969e-06,
|
|
"loss": 0.4581,
|
|
"mean_token_accuracy": 0.8423688313923776,
|
|
"num_tokens": 178114003.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 0.4362945556640625,
|
|
"epoch": 1.6507936507936507,
|
|
"grad_norm": 1.245121139741542,
|
|
"learning_rate": 8.694177481385244e-06,
|
|
"loss": 0.4681,
|
|
"mean_token_accuracy": 0.8389896345324814,
|
|
"num_tokens": 178950487.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 0.4297637939453125,
|
|
"epoch": 1.6587301587301586,
|
|
"grad_norm": 0.9621053522750438,
|
|
"learning_rate": 8.679399305840815e-06,
|
|
"loss": 0.4694,
|
|
"mean_token_accuracy": 0.83825440146029,
|
|
"num_tokens": 179833212.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 0.4279327392578125,
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.8737950990378116,
|
|
"learning_rate": 8.664550689041187e-06,
|
|
"loss": 0.461,
|
|
"mean_token_accuracy": 0.8423066223040223,
|
|
"num_tokens": 180712234.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.4318084716796875,
|
|
"epoch": 1.6746031746031746,
|
|
"grad_norm": 1.000096359581219,
|
|
"learning_rate": 8.649631915259716e-06,
|
|
"loss": 0.4741,
|
|
"mean_token_accuracy": 0.8375975685194135,
|
|
"num_tokens": 181566490.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 0.4336090087890625,
|
|
"epoch": 1.6825396825396826,
|
|
"grad_norm": 0.8934119220152827,
|
|
"learning_rate": 8.634643270112903e-06,
|
|
"loss": 0.4667,
|
|
"mean_token_accuracy": 0.8412727518007159,
|
|
"num_tokens": 182413254.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 0.430084228515625,
|
|
"epoch": 1.6904761904761905,
|
|
"grad_norm": 0.8156026818943841,
|
|
"learning_rate": 8.61958504055492e-06,
|
|
"loss": 0.4599,
|
|
"mean_token_accuracy": 0.8421376254409552,
|
|
"num_tokens": 183269538.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 0.4370574951171875,
|
|
"epoch": 1.6984126984126984,
|
|
"grad_norm": 1.0433955766227752,
|
|
"learning_rate": 8.604457514872115e-06,
|
|
"loss": 0.4577,
|
|
"mean_token_accuracy": 0.8438415261916816,
|
|
"num_tokens": 184109496.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 0.4349365234375,
|
|
"epoch": 1.7063492063492065,
|
|
"grad_norm": 0.8795834565304798,
|
|
"learning_rate": 8.589260982677496e-06,
|
|
"loss": 0.4716,
|
|
"mean_token_accuracy": 0.8374428367242217,
|
|
"num_tokens": 184968366.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 0.42926025390625,
|
|
"epoch": 1.7142857142857144,
|
|
"grad_norm": 0.8234311825574274,
|
|
"learning_rate": 8.573995734905185e-06,
|
|
"loss": 0.4689,
|
|
"mean_token_accuracy": 0.8390569076873362,
|
|
"num_tokens": 185857166.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 0.436279296875,
|
|
"epoch": 1.7222222222222223,
|
|
"grad_norm": 0.9720711352685596,
|
|
"learning_rate": 8.558662063804843e-06,
|
|
"loss": 0.452,
|
|
"mean_token_accuracy": 0.8439166625030339,
|
|
"num_tokens": 186684767.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 0.4309844970703125,
|
|
"epoch": 1.7301587301587302,
|
|
"grad_norm": 0.8753161905032254,
|
|
"learning_rate": 8.543260262936087e-06,
|
|
"loss": 0.4545,
|
|
"mean_token_accuracy": 0.843706154730171,
|
|
"num_tokens": 187534641.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 0.429901123046875,
|
|
"epoch": 1.7380952380952381,
|
|
"grad_norm": 0.8611756266061616,
|
|
"learning_rate": 8.527790627162858e-06,
|
|
"loss": 0.4594,
|
|
"mean_token_accuracy": 0.8414032305590808,
|
|
"num_tokens": 188403747.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 0.4284210205078125,
|
|
"epoch": 1.746031746031746,
|
|
"grad_norm": 0.917314816658313,
|
|
"learning_rate": 8.512253452647783e-06,
|
|
"loss": 0.4636,
|
|
"mean_token_accuracy": 0.8410017411224544,
|
|
"num_tokens": 189286051.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.435394287109375,
|
|
"epoch": 1.753968253968254,
|
|
"grad_norm": 0.8160829606015351,
|
|
"learning_rate": 8.496649036846502e-06,
|
|
"loss": 0.4556,
|
|
"mean_token_accuracy": 0.8419742425903678,
|
|
"num_tokens": 190135846.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 0.4247589111328125,
|
|
"epoch": 1.7619047619047619,
|
|
"grad_norm": 1.0175216766708233,
|
|
"learning_rate": 8.480977678501974e-06,
|
|
"loss": 0.4658,
|
|
"mean_token_accuracy": 0.8410613937303424,
|
|
"num_tokens": 191023956.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 0.43572998046875,
|
|
"epoch": 1.7698412698412698,
|
|
"grad_norm": 0.9507033398860977,
|
|
"learning_rate": 8.465239677638755e-06,
|
|
"loss": 0.4554,
|
|
"mean_token_accuracy": 0.8437660122290254,
|
|
"num_tokens": 191865715.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 0.4373016357421875,
|
|
"epoch": 1.7777777777777777,
|
|
"grad_norm": 0.8180069161681153,
|
|
"learning_rate": 8.449435335557264e-06,
|
|
"loss": 0.4575,
|
|
"mean_token_accuracy": 0.8432631348259747,
|
|
"num_tokens": 192687536.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 0.4383087158203125,
|
|
"epoch": 1.7857142857142856,
|
|
"grad_norm": 0.8838273166712945,
|
|
"learning_rate": 8.433564954828e-06,
|
|
"loss": 0.4526,
|
|
"mean_token_accuracy": 0.8442786163650453,
|
|
"num_tokens": 193514317.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 0.42999267578125,
|
|
"epoch": 1.7936507936507935,
|
|
"grad_norm": 0.9410341071186311,
|
|
"learning_rate": 8.417628839285757e-06,
|
|
"loss": 0.4581,
|
|
"mean_token_accuracy": 0.8429269646294415,
|
|
"num_tokens": 194368425.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 0.4307098388671875,
|
|
"epoch": 1.8015873015873016,
|
|
"grad_norm": 0.90812536954616,
|
|
"learning_rate": 8.401627294023815e-06,
|
|
"loss": 0.4577,
|
|
"mean_token_accuracy": 0.8424977059476078,
|
|
"num_tokens": 195229420.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 0.4311065673828125,
|
|
"epoch": 1.8095238095238095,
|
|
"grad_norm": 0.8936972613642765,
|
|
"learning_rate": 8.385560625388081e-06,
|
|
"loss": 0.4613,
|
|
"mean_token_accuracy": 0.8418103088624775,
|
|
"num_tokens": 196086060.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 0.4331512451171875,
|
|
"epoch": 1.8174603174603174,
|
|
"grad_norm": 0.9023364391946196,
|
|
"learning_rate": 8.369429140971239e-06,
|
|
"loss": 0.4587,
|
|
"mean_token_accuracy": 0.840968404430896,
|
|
"num_tokens": 196949752.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 0.431732177734375,
|
|
"epoch": 1.8253968253968254,
|
|
"grad_norm": 0.9269255151577744,
|
|
"learning_rate": 8.353233149606859e-06,
|
|
"loss": 0.4564,
|
|
"mean_token_accuracy": 0.8422707901336253,
|
|
"num_tokens": 197787383.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.4332275390625,
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 0.8746862673486592,
|
|
"learning_rate": 8.336972961363472e-06,
|
|
"loss": 0.4641,
|
|
"mean_token_accuracy": 0.8415999473072588,
|
|
"num_tokens": 198640204.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 0.4254608154296875,
|
|
"epoch": 1.8412698412698414,
|
|
"grad_norm": 0.8419998918591282,
|
|
"learning_rate": 8.320648887538657e-06,
|
|
"loss": 0.4628,
|
|
"mean_token_accuracy": 0.8425387698225677,
|
|
"num_tokens": 199534945.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 0.4349517822265625,
|
|
"epoch": 1.8492063492063493,
|
|
"grad_norm": 0.945589758024129,
|
|
"learning_rate": 8.304261240653054e-06,
|
|
"loss": 0.4546,
|
|
"mean_token_accuracy": 0.8429999812506139,
|
|
"num_tokens": 200401566.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 0.4324798583984375,
|
|
"epoch": 1.8571428571428572,
|
|
"grad_norm": 0.9389620288256866,
|
|
"learning_rate": 8.287810334444406e-06,
|
|
"loss": 0.4616,
|
|
"mean_token_accuracy": 0.8408999373205006,
|
|
"num_tokens": 201286569.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 0.4327392578125,
|
|
"epoch": 1.8650793650793651,
|
|
"grad_norm": 0.9056957266265069,
|
|
"learning_rate": 8.271296483861532e-06,
|
|
"loss": 0.4555,
|
|
"mean_token_accuracy": 0.8440436110831797,
|
|
"num_tokens": 202148785.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 0.43682861328125,
|
|
"epoch": 1.873015873015873,
|
|
"grad_norm": 0.9007501274176329,
|
|
"learning_rate": 8.254720005058317e-06,
|
|
"loss": 0.4511,
|
|
"mean_token_accuracy": 0.8437027987092733,
|
|
"num_tokens": 202969412.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 0.429046630859375,
|
|
"epoch": 1.880952380952381,
|
|
"grad_norm": 0.7886955269176177,
|
|
"learning_rate": 8.238081215387639e-06,
|
|
"loss": 0.4572,
|
|
"mean_token_accuracy": 0.8425727025605738,
|
|
"num_tokens": 203845826.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 0.4304656982421875,
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.9902829953426554,
|
|
"learning_rate": 8.221380433395308e-06,
|
|
"loss": 0.4522,
|
|
"mean_token_accuracy": 0.8438800727017224,
|
|
"num_tokens": 204713067.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 0.4382476806640625,
|
|
"epoch": 1.8968253968253967,
|
|
"grad_norm": 0.8783861526345048,
|
|
"learning_rate": 8.204617978813963e-06,
|
|
"loss": 0.4544,
|
|
"mean_token_accuracy": 0.8443545303307474,
|
|
"num_tokens": 205549482.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 0.4334259033203125,
|
|
"epoch": 1.9047619047619047,
|
|
"grad_norm": 0.7800627411645534,
|
|
"learning_rate": 8.187794172556947e-06,
|
|
"loss": 0.4535,
|
|
"mean_token_accuracy": 0.8426107591949403,
|
|
"num_tokens": 206394578.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.4372100830078125,
|
|
"epoch": 1.9126984126984126,
|
|
"grad_norm": 0.8924141210495853,
|
|
"learning_rate": 8.170909336712171e-06,
|
|
"loss": 0.4593,
|
|
"mean_token_accuracy": 0.8427824974060059,
|
|
"num_tokens": 207233636.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 0.4353790283203125,
|
|
"epoch": 1.9206349206349205,
|
|
"grad_norm": 0.8893426872353928,
|
|
"learning_rate": 8.153963794535945e-06,
|
|
"loss": 0.4604,
|
|
"mean_token_accuracy": 0.841770654078573,
|
|
"num_tokens": 208074376.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.4335174560546875,
|
|
"epoch": 1.9285714285714286,
|
|
"grad_norm": 0.8143885506939128,
|
|
"learning_rate": 8.136957870446779e-06,
|
|
"loss": 0.4591,
|
|
"mean_token_accuracy": 0.8414175752550364,
|
|
"num_tokens": 208947370.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.4306640625,
|
|
"epoch": 1.9365079365079365,
|
|
"grad_norm": 0.8217558583786552,
|
|
"learning_rate": 8.119891890019187e-06,
|
|
"loss": 0.4502,
|
|
"mean_token_accuracy": 0.8447873778641224,
|
|
"num_tokens": 209798547.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.4336090087890625,
|
|
"epoch": 1.9444444444444444,
|
|
"grad_norm": 0.8345483891742207,
|
|
"learning_rate": 8.102766179977452e-06,
|
|
"loss": 0.4548,
|
|
"mean_token_accuracy": 0.843397512100637,
|
|
"num_tokens": 210661829.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.4267578125,
|
|
"epoch": 1.9523809523809523,
|
|
"grad_norm": 0.886458439838755,
|
|
"learning_rate": 8.085581068189358e-06,
|
|
"loss": 0.4546,
|
|
"mean_token_accuracy": 0.8432880756445229,
|
|
"num_tokens": 211549046.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.4273834228515625,
|
|
"epoch": 1.9603174603174605,
|
|
"grad_norm": 0.7893301359285466,
|
|
"learning_rate": 8.068336883659926e-06,
|
|
"loss": 0.4483,
|
|
"mean_token_accuracy": 0.8453035233542323,
|
|
"num_tokens": 212447521.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.4306793212890625,
|
|
"epoch": 1.9682539682539684,
|
|
"grad_norm": 0.891429474690652,
|
|
"learning_rate": 8.051033956525113e-06,
|
|
"loss": 0.4539,
|
|
"mean_token_accuracy": 0.8432926838286221,
|
|
"num_tokens": 213310334.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.429534912109375,
|
|
"epoch": 1.9761904761904763,
|
|
"grad_norm": 0.8247760750659134,
|
|
"learning_rate": 8.033672618045485e-06,
|
|
"loss": 0.4524,
|
|
"mean_token_accuracy": 0.8450775747187436,
|
|
"num_tokens": 214169043.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.4324188232421875,
|
|
"epoch": 1.9841269841269842,
|
|
"grad_norm": 0.8524339615157,
|
|
"learning_rate": 8.016253200599885e-06,
|
|
"loss": 0.4519,
|
|
"mean_token_accuracy": 0.8445535181090236,
|
|
"num_tokens": 215005057.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.4263763427734375,
|
|
"epoch": 1.992063492063492,
|
|
"grad_norm": 0.8331975898868739,
|
|
"learning_rate": 7.998776037679061e-06,
|
|
"loss": 0.4437,
|
|
"mean_token_accuracy": 0.8456794614903629,
|
|
"num_tokens": 215869766.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.4291229248046875,
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.8613130972882047,
|
|
"learning_rate": 7.981241463879284e-06,
|
|
"loss": 0.4466,
|
|
"mean_token_accuracy": 0.8456757622770965,
|
|
"num_tokens": 216731206.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.428619384765625,
|
|
"epoch": 2.007936507936508,
|
|
"grad_norm": 0.9277446577089026,
|
|
"learning_rate": 7.963649814895945e-06,
|
|
"loss": 0.4256,
|
|
"mean_token_accuracy": 0.8530098241753876,
|
|
"num_tokens": 217589905.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.4280853271484375,
|
|
"epoch": 2.015873015873016,
|
|
"grad_norm": 0.8708275069644504,
|
|
"learning_rate": 7.94600142751713e-06,
|
|
"loss": 0.432,
|
|
"mean_token_accuracy": 0.8501051301136613,
|
|
"num_tokens": 218446876.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.42718505859375,
|
|
"epoch": 2.0238095238095237,
|
|
"grad_norm": 0.8842468147508419,
|
|
"learning_rate": 7.92829663961716e-06,
|
|
"loss": 0.433,
|
|
"mean_token_accuracy": 0.850572609808296,
|
|
"num_tokens": 219322571.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.42889404296875,
|
|
"epoch": 2.0317460317460316,
|
|
"grad_norm": 0.8439073466722959,
|
|
"learning_rate": 7.910535790150135e-06,
|
|
"loss": 0.4291,
|
|
"mean_token_accuracy": 0.8493411005474627,
|
|
"num_tokens": 220180160.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.42620849609375,
|
|
"epoch": 2.0396825396825395,
|
|
"grad_norm": 0.8757701106721189,
|
|
"learning_rate": 7.892719219143446e-06,
|
|
"loss": 0.42,
|
|
"mean_token_accuracy": 0.8547583618201315,
|
|
"num_tokens": 221016578.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.42437744140625,
|
|
"epoch": 2.0476190476190474,
|
|
"grad_norm": 0.8882389150609792,
|
|
"learning_rate": 7.874847267691254e-06,
|
|
"loss": 0.4293,
|
|
"mean_token_accuracy": 0.8507325639948249,
|
|
"num_tokens": 221871968.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.4220123291015625,
|
|
"epoch": 2.0555555555555554,
|
|
"grad_norm": 0.8012689613491503,
|
|
"learning_rate": 7.856920277947969e-06,
|
|
"loss": 0.4236,
|
|
"mean_token_accuracy": 0.8522419198416173,
|
|
"num_tokens": 222738323.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.4245758056640625,
|
|
"epoch": 2.0634920634920633,
|
|
"grad_norm": 0.9448934032497721,
|
|
"learning_rate": 7.83893859312169e-06,
|
|
"loss": 0.4286,
|
|
"mean_token_accuracy": 0.8518815254792571,
|
|
"num_tokens": 223584134.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.4188232421875,
|
|
"epoch": 2.0714285714285716,
|
|
"grad_norm": 0.8498468200899314,
|
|
"learning_rate": 7.820902557467648e-06,
|
|
"loss": 0.4256,
|
|
"mean_token_accuracy": 0.8522023572586477,
|
|
"num_tokens": 224461654.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.42340087890625,
|
|
"epoch": 2.0793650793650795,
|
|
"grad_norm": 0.9494122603581977,
|
|
"learning_rate": 7.80281251628161e-06,
|
|
"loss": 0.4325,
|
|
"mean_token_accuracy": 0.8496407098136842,
|
|
"num_tokens": 225327562.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.4233551025390625,
|
|
"epoch": 2.0873015873015874,
|
|
"grad_norm": 0.773955396676882,
|
|
"learning_rate": 7.784668815893256e-06,
|
|
"loss": 0.4201,
|
|
"mean_token_accuracy": 0.853766305372119,
|
|
"num_tokens": 226189031.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.4216156005859375,
|
|
"epoch": 2.0952380952380953,
|
|
"grad_norm": 0.9147295809460214,
|
|
"learning_rate": 7.766471803659571e-06,
|
|
"loss": 0.4396,
|
|
"mean_token_accuracy": 0.8481186041608453,
|
|
"num_tokens": 227062309.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.4239349365234375,
|
|
"epoch": 2.1031746031746033,
|
|
"grad_norm": 0.9057979347358639,
|
|
"learning_rate": 7.748221827958174e-06,
|
|
"loss": 0.4297,
|
|
"mean_token_accuracy": 0.8508882015012205,
|
|
"num_tokens": 227920598.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.422332763671875,
|
|
"epoch": 2.111111111111111,
|
|
"grad_norm": 0.7698507478470203,
|
|
"learning_rate": 7.729919238180663e-06,
|
|
"loss": 0.4239,
|
|
"mean_token_accuracy": 0.8522637677378953,
|
|
"num_tokens": 228773818.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.4227447509765625,
|
|
"epoch": 2.119047619047619,
|
|
"grad_norm": 0.7899935011725722,
|
|
"learning_rate": 7.711564384725916e-06,
|
|
"loss": 0.4215,
|
|
"mean_token_accuracy": 0.8535870416089892,
|
|
"num_tokens": 229627711.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.4207000732421875,
|
|
"epoch": 2.126984126984127,
|
|
"grad_norm": 0.7934125500230887,
|
|
"learning_rate": 7.693157618993392e-06,
|
|
"loss": 0.4334,
|
|
"mean_token_accuracy": 0.8498726398684084,
|
|
"num_tokens": 230489032.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.421234130859375,
|
|
"epoch": 2.134920634920635,
|
|
"grad_norm": 0.7955872905920104,
|
|
"learning_rate": 7.674699293376397e-06,
|
|
"loss": 0.4349,
|
|
"mean_token_accuracy": 0.8490500543266535,
|
|
"num_tokens": 231339019.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.4244232177734375,
|
|
"epoch": 2.142857142857143,
|
|
"grad_norm": 0.7854549442643708,
|
|
"learning_rate": 7.656189761255333e-06,
|
|
"loss": 0.4319,
|
|
"mean_token_accuracy": 0.8492707693949342,
|
|
"num_tokens": 232199672.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.4268646240234375,
|
|
"epoch": 2.1507936507936507,
|
|
"grad_norm": 0.7878191667376515,
|
|
"learning_rate": 7.63762937699095e-06,
|
|
"loss": 0.4309,
|
|
"mean_token_accuracy": 0.8514748462475836,
|
|
"num_tokens": 233076007.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.4263763427734375,
|
|
"epoch": 2.1587301587301586,
|
|
"grad_norm": 0.7996554554456847,
|
|
"learning_rate": 7.619018495917543e-06,
|
|
"loss": 0.4302,
|
|
"mean_token_accuracy": 0.8500847779214382,
|
|
"num_tokens": 233942156.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.4241943359375,
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 0.7525601660809861,
|
|
"learning_rate": 7.600357474336157e-06,
|
|
"loss": 0.432,
|
|
"mean_token_accuracy": 0.8499450846575201,
|
|
"num_tokens": 234844668.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.4309234619140625,
|
|
"epoch": 2.1746031746031744,
|
|
"grad_norm": 0.8012481468665732,
|
|
"learning_rate": 7.581646669507768e-06,
|
|
"loss": 0.4329,
|
|
"mean_token_accuracy": 0.8488554251380265,
|
|
"num_tokens": 235697877.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.425933837890625,
|
|
"epoch": 2.1825396825396823,
|
|
"grad_norm": 0.7849735766550899,
|
|
"learning_rate": 7.56288643964644e-06,
|
|
"loss": 0.4253,
|
|
"mean_token_accuracy": 0.851461592130363,
|
|
"num_tokens": 236586699.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.4276123046875,
|
|
"epoch": 2.1904761904761907,
|
|
"grad_norm": 0.8396810265614048,
|
|
"learning_rate": 7.544077143912467e-06,
|
|
"loss": 0.425,
|
|
"mean_token_accuracy": 0.8501534420065582,
|
|
"num_tokens": 237426378.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.4206085205078125,
|
|
"epoch": 2.1984126984126986,
|
|
"grad_norm": 0.8220018076708352,
|
|
"learning_rate": 7.525219142405501e-06,
|
|
"loss": 0.4272,
|
|
"mean_token_accuracy": 0.8498959382995963,
|
|
"num_tokens": 238297987.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.4322357177734375,
|
|
"epoch": 2.2063492063492065,
|
|
"grad_norm": 0.8197639207257438,
|
|
"learning_rate": 7.506312796157649e-06,
|
|
"loss": 0.4381,
|
|
"mean_token_accuracy": 0.8488305411301553,
|
|
"num_tokens": 239171101.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.4268646240234375,
|
|
"epoch": 2.2142857142857144,
|
|
"grad_norm": 0.783646321002473,
|
|
"learning_rate": 7.487358467126573e-06,
|
|
"loss": 0.4242,
|
|
"mean_token_accuracy": 0.8518037595786154,
|
|
"num_tokens": 240034337.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.43035888671875,
|
|
"epoch": 2.2222222222222223,
|
|
"grad_norm": 0.8546249612241452,
|
|
"learning_rate": 7.468356518188551e-06,
|
|
"loss": 0.4174,
|
|
"mean_token_accuracy": 0.8534762058407068,
|
|
"num_tokens": 240860927.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.42437744140625,
|
|
"epoch": 2.2301587301587302,
|
|
"grad_norm": 0.87740351405863,
|
|
"learning_rate": 7.449307313131533e-06,
|
|
"loss": 0.4296,
|
|
"mean_token_accuracy": 0.8500415538437665,
|
|
"num_tokens": 241739076.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.4241180419921875,
|
|
"epoch": 2.238095238095238,
|
|
"grad_norm": 0.8620781094998687,
|
|
"learning_rate": 7.4302112166481814e-06,
|
|
"loss": 0.4152,
|
|
"mean_token_accuracy": 0.8549392893910408,
|
|
"num_tokens": 242574011.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.4252471923828125,
|
|
"epoch": 2.246031746031746,
|
|
"grad_norm": 0.8379775455346818,
|
|
"learning_rate": 7.411068594328876e-06,
|
|
"loss": 0.4292,
|
|
"mean_token_accuracy": 0.8494298844598234,
|
|
"num_tokens": 243458878.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.4227294921875,
|
|
"epoch": 2.253968253968254,
|
|
"grad_norm": 0.825758180012109,
|
|
"learning_rate": 7.391879812654727e-06,
|
|
"loss": 0.4257,
|
|
"mean_token_accuracy": 0.852616976480931,
|
|
"num_tokens": 244313964.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.42498779296875,
|
|
"epoch": 2.261904761904762,
|
|
"grad_norm": 0.7941347937575597,
|
|
"learning_rate": 7.37264523899056e-06,
|
|
"loss": 0.4204,
|
|
"mean_token_accuracy": 0.8534517176449299,
|
|
"num_tokens": 245200322.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.4271087646484375,
|
|
"epoch": 2.2698412698412698,
|
|
"grad_norm": 0.8928939293234606,
|
|
"learning_rate": 7.353365241577869e-06,
|
|
"loss": 0.4274,
|
|
"mean_token_accuracy": 0.8513154000975192,
|
|
"num_tokens": 246083539.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.427947998046875,
|
|
"epoch": 2.2777777777777777,
|
|
"grad_norm": 0.8392795753081728,
|
|
"learning_rate": 7.3340401895277816e-06,
|
|
"loss": 0.4276,
|
|
"mean_token_accuracy": 0.8511864547617733,
|
|
"num_tokens": 246933619.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.4322662353515625,
|
|
"epoch": 2.2857142857142856,
|
|
"grad_norm": 0.8013508005420562,
|
|
"learning_rate": 7.314670452813982e-06,
|
|
"loss": 0.4188,
|
|
"mean_token_accuracy": 0.8539650039747357,
|
|
"num_tokens": 247765672.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.4230804443359375,
|
|
"epoch": 2.2936507936507935,
|
|
"grad_norm": 0.7925852081903219,
|
|
"learning_rate": 7.295256402265636e-06,
|
|
"loss": 0.4208,
|
|
"mean_token_accuracy": 0.8516722363419831,
|
|
"num_tokens": 248628378.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.4259033203125,
|
|
"epoch": 2.3015873015873014,
|
|
"grad_norm": 0.881087099852364,
|
|
"learning_rate": 7.275798409560282e-06,
|
|
"loss": 0.4286,
|
|
"mean_token_accuracy": 0.8508175020106137,
|
|
"num_tokens": 249501143.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.42486572265625,
|
|
"epoch": 2.3095238095238093,
|
|
"grad_norm": 0.8044072073317771,
|
|
"learning_rate": 7.256296847216727e-06,
|
|
"loss": 0.4208,
|
|
"mean_token_accuracy": 0.8538436009548604,
|
|
"num_tokens": 250356099.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.42413330078125,
|
|
"epoch": 2.317460317460317,
|
|
"grad_norm": 0.8884297194353945,
|
|
"learning_rate": 7.236752088587905e-06,
|
|
"loss": 0.4278,
|
|
"mean_token_accuracy": 0.850508657284081,
|
|
"num_tokens": 251219125.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.4213409423828125,
|
|
"epoch": 2.3253968253968256,
|
|
"grad_norm": 0.8328136207372636,
|
|
"learning_rate": 7.217164507853734e-06,
|
|
"loss": 0.423,
|
|
"mean_token_accuracy": 0.8522739242762327,
|
|
"num_tokens": 252080434.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.426666259765625,
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.8427474548537396,
|
|
"learning_rate": 7.197534480013951e-06,
|
|
"loss": 0.4203,
|
|
"mean_token_accuracy": 0.85275460453704,
|
|
"num_tokens": 252923218.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.430084228515625,
|
|
"epoch": 2.3412698412698414,
|
|
"grad_norm": 0.9004833679442211,
|
|
"learning_rate": 7.177862380880935e-06,
|
|
"loss": 0.4218,
|
|
"mean_token_accuracy": 0.8528542476706207,
|
|
"num_tokens": 253761289.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.4216156005859375,
|
|
"epoch": 2.3492063492063493,
|
|
"grad_norm": 0.8913589598940626,
|
|
"learning_rate": 7.158148587072509e-06,
|
|
"loss": 0.425,
|
|
"mean_token_accuracy": 0.8505891724489629,
|
|
"num_tokens": 254643716.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.426910400390625,
|
|
"epoch": 2.357142857142857,
|
|
"grad_norm": 0.7894369274614703,
|
|
"learning_rate": 7.138393476004725e-06,
|
|
"loss": 0.425,
|
|
"mean_token_accuracy": 0.8516062931157649,
|
|
"num_tokens": 255486573.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.426300048828125,
|
|
"epoch": 2.365079365079365,
|
|
"grad_norm": 0.8212693828322741,
|
|
"learning_rate": 7.118597425884659e-06,
|
|
"loss": 0.4154,
|
|
"mean_token_accuracy": 0.8540734858252108,
|
|
"num_tokens": 256345326.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.4244842529296875,
|
|
"epoch": 2.373015873015873,
|
|
"grad_norm": 0.8408664594175462,
|
|
"learning_rate": 7.098760815703139e-06,
|
|
"loss": 0.4159,
|
|
"mean_token_accuracy": 0.8559228433296084,
|
|
"num_tokens": 257185152.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.42059326171875,
|
|
"epoch": 2.380952380952381,
|
|
"grad_norm": 0.8309509931885469,
|
|
"learning_rate": 7.078884025227519e-06,
|
|
"loss": 0.4215,
|
|
"mean_token_accuracy": 0.8527105739340186,
|
|
"num_tokens": 258067282.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.421661376953125,
|
|
"epoch": 2.388888888888889,
|
|
"grad_norm": 0.7954323327920004,
|
|
"learning_rate": 7.058967434994388e-06,
|
|
"loss": 0.4251,
|
|
"mean_token_accuracy": 0.8514108480885625,
|
|
"num_tokens": 258944016.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.425048828125,
|
|
"epoch": 2.3968253968253967,
|
|
"grad_norm": 0.7636778328503014,
|
|
"learning_rate": 7.0390114263022955e-06,
|
|
"loss": 0.4198,
|
|
"mean_token_accuracy": 0.8537601926364005,
|
|
"num_tokens": 259808268.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.41961669921875,
|
|
"epoch": 2.4047619047619047,
|
|
"grad_norm": 0.8444532257440839,
|
|
"learning_rate": 7.019016381204448e-06,
|
|
"loss": 0.4264,
|
|
"mean_token_accuracy": 0.8519964478909969,
|
|
"num_tokens": 260684292.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.4252777099609375,
|
|
"epoch": 2.4126984126984126,
|
|
"grad_norm": 0.8265870173926899,
|
|
"learning_rate": 6.998982682501394e-06,
|
|
"loss": 0.4233,
|
|
"mean_token_accuracy": 0.8529867087490857,
|
|
"num_tokens": 261555918.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.4233245849609375,
|
|
"epoch": 2.4206349206349205,
|
|
"grad_norm": 0.8766807638096971,
|
|
"learning_rate": 6.978910713733696e-06,
|
|
"loss": 0.4207,
|
|
"mean_token_accuracy": 0.8529614573344588,
|
|
"num_tokens": 262425946.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.4260406494140625,
|
|
"epoch": 2.4285714285714284,
|
|
"grad_norm": 0.8180287946820591,
|
|
"learning_rate": 6.958800859174591e-06,
|
|
"loss": 0.4155,
|
|
"mean_token_accuracy": 0.8538580327294767,
|
|
"num_tokens": 263268966.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.420166015625,
|
|
"epoch": 2.4365079365079367,
|
|
"grad_norm": 0.8038512105532972,
|
|
"learning_rate": 6.938653503822628e-06,
|
|
"loss": 0.4193,
|
|
"mean_token_accuracy": 0.8529025730676949,
|
|
"num_tokens": 264137961.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.4186859130859375,
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.8356237787218255,
|
|
"learning_rate": 6.9184690333942995e-06,
|
|
"loss": 0.4179,
|
|
"mean_token_accuracy": 0.8538753935135901,
|
|
"num_tokens": 264995910.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.4160308837890625,
|
|
"epoch": 2.4523809523809526,
|
|
"grad_norm": 0.8358036143672558,
|
|
"learning_rate": 6.898247834316662e-06,
|
|
"loss": 0.4147,
|
|
"mean_token_accuracy": 0.8543582037091255,
|
|
"num_tokens": 265867518.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.41815185546875,
|
|
"epoch": 2.4603174603174605,
|
|
"grad_norm": 0.9260389067513531,
|
|
"learning_rate": 6.877990293719928e-06,
|
|
"loss": 0.4211,
|
|
"mean_token_accuracy": 0.8540931805036962,
|
|
"num_tokens": 266730039.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.4172515869140625,
|
|
"epoch": 2.4682539682539684,
|
|
"grad_norm": 0.7930039952405856,
|
|
"learning_rate": 6.857696799430064e-06,
|
|
"loss": 0.4248,
|
|
"mean_token_accuracy": 0.8519657654687762,
|
|
"num_tokens": 267605673.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.4198455810546875,
|
|
"epoch": 2.4761904761904763,
|
|
"grad_norm": 0.8779922529454903,
|
|
"learning_rate": 6.83736773996136e-06,
|
|
"loss": 0.4276,
|
|
"mean_token_accuracy": 0.852175232488662,
|
|
"num_tokens": 268470812.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.418670654296875,
|
|
"epoch": 2.484126984126984,
|
|
"grad_norm": 0.7739740399164128,
|
|
"learning_rate": 6.817003504508993e-06,
|
|
"loss": 0.4145,
|
|
"mean_token_accuracy": 0.853930065408349,
|
|
"num_tokens": 269329768.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.4190521240234375,
|
|
"epoch": 2.492063492063492,
|
|
"grad_norm": 0.7927430903268082,
|
|
"learning_rate": 6.796604482941578e-06,
|
|
"loss": 0.4238,
|
|
"mean_token_accuracy": 0.8510767961852252,
|
|
"num_tokens": 270192672.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.4205474853515625,
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.7677286448168184,
|
|
"learning_rate": 6.7761710657936995e-06,
|
|
"loss": 0.4282,
|
|
"mean_token_accuracy": 0.8515617684461176,
|
|
"num_tokens": 271053623.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.415618896484375,
|
|
"epoch": 2.507936507936508,
|
|
"grad_norm": 0.7893175807304748,
|
|
"learning_rate": 6.75570364425844e-06,
|
|
"loss": 0.4215,
|
|
"mean_token_accuracy": 0.8526675584726036,
|
|
"num_tokens": 271921985.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.4283447265625,
|
|
"epoch": 2.515873015873016,
|
|
"grad_norm": 0.8617893689163498,
|
|
"learning_rate": 6.735202610179886e-06,
|
|
"loss": 0.4235,
|
|
"mean_token_accuracy": 0.8520378330722451,
|
|
"num_tokens": 272757706.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.42413330078125,
|
|
"epoch": 2.5238095238095237,
|
|
"grad_norm": 0.76248538584374,
|
|
"learning_rate": 6.714668356045629e-06,
|
|
"loss": 0.4155,
|
|
"mean_token_accuracy": 0.8540036669000983,
|
|
"num_tokens": 273603268.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.421356201171875,
|
|
"epoch": 2.5317460317460316,
|
|
"grad_norm": 1.1471382166034823,
|
|
"learning_rate": 6.694101274979253e-06,
|
|
"loss": 0.4182,
|
|
"mean_token_accuracy": 0.8544265124946833,
|
|
"num_tokens": 274458735.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.419586181640625,
|
|
"epoch": 2.5396825396825395,
|
|
"grad_norm": 0.8503843517257628,
|
|
"learning_rate": 6.673501760732805e-06,
|
|
"loss": 0.4188,
|
|
"mean_token_accuracy": 0.851504479534924,
|
|
"num_tokens": 275320028.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.41754150390625,
|
|
"epoch": 2.5476190476190474,
|
|
"grad_norm": 0.7742097684397823,
|
|
"learning_rate": 6.652870207679253e-06,
|
|
"loss": 0.4154,
|
|
"mean_token_accuracy": 0.8555147871375084,
|
|
"num_tokens": 276151262.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.415802001953125,
|
|
"epoch": 2.5555555555555554,
|
|
"grad_norm": 0.7996726962055972,
|
|
"learning_rate": 6.632207010804949e-06,
|
|
"loss": 0.4175,
|
|
"mean_token_accuracy": 0.8534226748161018,
|
|
"num_tokens": 276997327.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.420318603515625,
|
|
"epoch": 2.5634920634920633,
|
|
"grad_norm": 0.8023983937223226,
|
|
"learning_rate": 6.611512565702053e-06,
|
|
"loss": 0.4226,
|
|
"mean_token_accuracy": 0.8535379455424845,
|
|
"num_tokens": 277849848.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.4129180908203125,
|
|
"epoch": 2.571428571428571,
|
|
"grad_norm": 0.794860570280225,
|
|
"learning_rate": 6.590787268560967e-06,
|
|
"loss": 0.4126,
|
|
"mean_token_accuracy": 0.8558539836667478,
|
|
"num_tokens": 278726761.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.417694091796875,
|
|
"epoch": 2.5793650793650795,
|
|
"grad_norm": 0.8947468548309203,
|
|
"learning_rate": 6.570031516162746e-06,
|
|
"loss": 0.4161,
|
|
"mean_token_accuracy": 0.8547689928673208,
|
|
"num_tokens": 279572082.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.4159698486328125,
|
|
"epoch": 2.5873015873015874,
|
|
"grad_norm": 0.7955201654992391,
|
|
"learning_rate": 6.549245705871507e-06,
|
|
"loss": 0.4146,
|
|
"mean_token_accuracy": 0.854183979332447,
|
|
"num_tokens": 280414468.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.4205780029296875,
|
|
"epoch": 2.5952380952380953,
|
|
"grad_norm": 0.8138022818439977,
|
|
"learning_rate": 6.528430235626819e-06,
|
|
"loss": 0.4216,
|
|
"mean_token_accuracy": 0.8531410917639732,
|
|
"num_tokens": 281237288.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.4152374267578125,
|
|
"epoch": 2.6031746031746033,
|
|
"grad_norm": 0.8114079031107396,
|
|
"learning_rate": 6.5075855039360805e-06,
|
|
"loss": 0.4092,
|
|
"mean_token_accuracy": 0.8578996560536325,
|
|
"num_tokens": 282118057.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.409637451171875,
|
|
"epoch": 2.611111111111111,
|
|
"grad_norm": 0.8647529166774726,
|
|
"learning_rate": 6.486711909866895e-06,
|
|
"loss": 0.4248,
|
|
"mean_token_accuracy": 0.8518201056867838,
|
|
"num_tokens": 283028330.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.4239501953125,
|
|
"epoch": 2.619047619047619,
|
|
"grad_norm": 0.7331498819381451,
|
|
"learning_rate": 6.465809853039431e-06,
|
|
"loss": 0.4172,
|
|
"mean_token_accuracy": 0.8533883499912918,
|
|
"num_tokens": 283866607.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.425384521484375,
|
|
"epoch": 2.626984126984127,
|
|
"grad_norm": 0.9242263399118948,
|
|
"learning_rate": 6.444879733618766e-06,
|
|
"loss": 0.4229,
|
|
"mean_token_accuracy": 0.852979929652065,
|
|
"num_tokens": 284705319.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.419525146484375,
|
|
"epoch": 2.634920634920635,
|
|
"grad_norm": 0.8158292669223365,
|
|
"learning_rate": 6.423921952307237e-06,
|
|
"loss": 0.4338,
|
|
"mean_token_accuracy": 0.8505428163334727,
|
|
"num_tokens": 285598883.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.422210693359375,
|
|
"epoch": 2.642857142857143,
|
|
"grad_norm": 0.8529287289999934,
|
|
"learning_rate": 6.4029369103367545e-06,
|
|
"loss": 0.4199,
|
|
"mean_token_accuracy": 0.8537574140354991,
|
|
"num_tokens": 286461446.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.4251708984375,
|
|
"epoch": 2.6507936507936507,
|
|
"grad_norm": 0.8196864990296487,
|
|
"learning_rate": 6.381925009461128e-06,
|
|
"loss": 0.4171,
|
|
"mean_token_accuracy": 0.8536815252155066,
|
|
"num_tokens": 287308399.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.4163818359375,
|
|
"epoch": 2.6587301587301586,
|
|
"grad_norm": 0.7820718545979705,
|
|
"learning_rate": 6.3608866519483825e-06,
|
|
"loss": 0.4198,
|
|
"mean_token_accuracy": 0.8528619990684092,
|
|
"num_tokens": 288187832.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.4176025390625,
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.796216651976639,
|
|
"learning_rate": 6.339822240573041e-06,
|
|
"loss": 0.4169,
|
|
"mean_token_accuracy": 0.8543005757965147,
|
|
"num_tokens": 289047051.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.421844482421875,
|
|
"epoch": 2.674603174603175,
|
|
"grad_norm": 0.8463751671443359,
|
|
"learning_rate": 6.3187321786084236e-06,
|
|
"loss": 0.423,
|
|
"mean_token_accuracy": 0.852651288267225,
|
|
"num_tokens": 289920851.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.418731689453125,
|
|
"epoch": 2.682539682539683,
|
|
"grad_norm": 0.8240504405278195,
|
|
"learning_rate": 6.297616869818926e-06,
|
|
"loss": 0.4069,
|
|
"mean_token_accuracy": 0.8571417732164264,
|
|
"num_tokens": 290766931.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.427032470703125,
|
|
"epoch": 2.6904761904761907,
|
|
"grad_norm": 0.8185363544673269,
|
|
"learning_rate": 6.276476718452289e-06,
|
|
"loss": 0.4155,
|
|
"mean_token_accuracy": 0.853483980987221,
|
|
"num_tokens": 291599836.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.417877197265625,
|
|
"epoch": 2.6984126984126986,
|
|
"grad_norm": 0.837427213509895,
|
|
"learning_rate": 6.2553121292318595e-06,
|
|
"loss": 0.4211,
|
|
"mean_token_accuracy": 0.8524906514212489,
|
|
"num_tokens": 292454972.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.42510986328125,
|
|
"epoch": 2.7063492063492065,
|
|
"grad_norm": 0.8135990341026819,
|
|
"learning_rate": 6.23412350734884e-06,
|
|
"loss": 0.4166,
|
|
"mean_token_accuracy": 0.852956528775394,
|
|
"num_tokens": 293307675.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.4229583740234375,
|
|
"epoch": 2.7142857142857144,
|
|
"grad_norm": 0.7369881660528143,
|
|
"learning_rate": 6.2129112584545325e-06,
|
|
"loss": 0.4144,
|
|
"mean_token_accuracy": 0.8540790337137878,
|
|
"num_tokens": 294149752.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.4259033203125,
|
|
"epoch": 2.7222222222222223,
|
|
"grad_norm": 0.8315573451881167,
|
|
"learning_rate": 6.191675788652574e-06,
|
|
"loss": 0.4017,
|
|
"mean_token_accuracy": 0.8583689746446908,
|
|
"num_tokens": 294975614.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.416900634765625,
|
|
"epoch": 2.7301587301587302,
|
|
"grad_norm": 0.8638440384540704,
|
|
"learning_rate": 6.170417504491157e-06,
|
|
"loss": 0.4147,
|
|
"mean_token_accuracy": 0.854499620385468,
|
|
"num_tokens": 295846874.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.4163665771484375,
|
|
"epoch": 2.738095238095238,
|
|
"grad_norm": 0.8116865889754844,
|
|
"learning_rate": 6.149136812955256e-06,
|
|
"loss": 0.4166,
|
|
"mean_token_accuracy": 0.8544518309645355,
|
|
"num_tokens": 296730922.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.41357421875,
|
|
"epoch": 2.746031746031746,
|
|
"grad_norm": 0.7806791564546498,
|
|
"learning_rate": 6.1278341214588255e-06,
|
|
"loss": 0.4101,
|
|
"mean_token_accuracy": 0.8577063884586096,
|
|
"num_tokens": 297610941.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.4183197021484375,
|
|
"epoch": 2.753968253968254,
|
|
"grad_norm": 0.8686079824008746,
|
|
"learning_rate": 6.106509837837004e-06,
|
|
"loss": 0.412,
|
|
"mean_token_accuracy": 0.8529722727835178,
|
|
"num_tokens": 298464464.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.4134063720703125,
|
|
"epoch": 2.761904761904762,
|
|
"grad_norm": 0.8287811327498212,
|
|
"learning_rate": 6.0851643703383066e-06,
|
|
"loss": 0.407,
|
|
"mean_token_accuracy": 0.8568426473066211,
|
|
"num_tokens": 299315956.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.420440673828125,
|
|
"epoch": 2.7698412698412698,
|
|
"grad_norm": 0.7606321520792506,
|
|
"learning_rate": 6.063798127616811e-06,
|
|
"loss": 0.4129,
|
|
"mean_token_accuracy": 0.8552384455688298,
|
|
"num_tokens": 300162540.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.422515869140625,
|
|
"epoch": 2.7777777777777777,
|
|
"grad_norm": 0.7256068297614475,
|
|
"learning_rate": 6.042411518724327e-06,
|
|
"loss": 0.41,
|
|
"mean_token_accuracy": 0.8559038788080215,
|
|
"num_tokens": 301009855.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.42303466796875,
|
|
"epoch": 2.7857142857142856,
|
|
"grad_norm": 0.7634303802543713,
|
|
"learning_rate": 6.021004953102576e-06,
|
|
"loss": 0.4039,
|
|
"mean_token_accuracy": 0.8571093692444265,
|
|
"num_tokens": 301852351.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.4193878173828125,
|
|
"epoch": 2.7936507936507935,
|
|
"grad_norm": 0.7645192522691564,
|
|
"learning_rate": 5.999578840575342e-06,
|
|
"loss": 0.4046,
|
|
"mean_token_accuracy": 0.8566430397331715,
|
|
"num_tokens": 302702189.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.4232940673828125,
|
|
"epoch": 2.8015873015873014,
|
|
"grad_norm": 0.794739166753094,
|
|
"learning_rate": 5.978133591340633e-06,
|
|
"loss": 0.4091,
|
|
"mean_token_accuracy": 0.8565059076063335,
|
|
"num_tokens": 303546117.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.4161529541015625,
|
|
"epoch": 2.8095238095238093,
|
|
"grad_norm": 0.8805512544331933,
|
|
"learning_rate": 5.956669615962821e-06,
|
|
"loss": 0.413,
|
|
"mean_token_accuracy": 0.8556133066304028,
|
|
"num_tokens": 304424136.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.41705322265625,
|
|
"epoch": 2.817460317460317,
|
|
"grad_norm": 0.7877254936944273,
|
|
"learning_rate": 5.935187325364791e-06,
|
|
"loss": 0.42,
|
|
"mean_token_accuracy": 0.8545625568367541,
|
|
"num_tokens": 305299176.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.4157562255859375,
|
|
"epoch": 2.825396825396825,
|
|
"grad_norm": 0.8062504809460449,
|
|
"learning_rate": 5.913687130820064e-06,
|
|
"loss": 0.4104,
|
|
"mean_token_accuracy": 0.8556024674326181,
|
|
"num_tokens": 306180918.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.41650390625,
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.7092100136349762,
|
|
"learning_rate": 5.892169443944929e-06,
|
|
"loss": 0.4151,
|
|
"mean_token_accuracy": 0.8552160942927003,
|
|
"num_tokens": 307053855.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.4198150634765625,
|
|
"epoch": 2.8412698412698414,
|
|
"grad_norm": 0.8020397673815377,
|
|
"learning_rate": 5.870634676690564e-06,
|
|
"loss": 0.414,
|
|
"mean_token_accuracy": 0.8550357166677713,
|
|
"num_tokens": 307908233.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.419891357421875,
|
|
"epoch": 2.8492063492063493,
|
|
"grad_norm": 0.8184927667236647,
|
|
"learning_rate": 5.8490832413351465e-06,
|
|
"loss": 0.406,
|
|
"mean_token_accuracy": 0.8566894140094519,
|
|
"num_tokens": 308765267.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.4268646240234375,
|
|
"epoch": 2.857142857142857,
|
|
"grad_norm": 0.7696962615494287,
|
|
"learning_rate": 5.827515550475955e-06,
|
|
"loss": 0.4112,
|
|
"mean_token_accuracy": 0.8539468543604016,
|
|
"num_tokens": 309586897.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.4179840087890625,
|
|
"epoch": 2.865079365079365,
|
|
"grad_norm": 0.7851245206394726,
|
|
"learning_rate": 5.805932017021486e-06,
|
|
"loss": 0.4116,
|
|
"mean_token_accuracy": 0.8549962108954787,
|
|
"num_tokens": 310440896.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.4180450439453125,
|
|
"epoch": 2.873015873015873,
|
|
"grad_norm": 0.7806163849576252,
|
|
"learning_rate": 5.784333054183533e-06,
|
|
"loss": 0.4069,
|
|
"mean_token_accuracy": 0.8565008505247533,
|
|
"num_tokens": 311297562.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.4127197265625,
|
|
"epoch": 2.880952380952381,
|
|
"grad_norm": 0.754556378509014,
|
|
"learning_rate": 5.762719075469277e-06,
|
|
"loss": 0.4155,
|
|
"mean_token_accuracy": 0.8560635317116976,
|
|
"num_tokens": 312189513.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.41400146484375,
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.833131948010438,
|
|
"learning_rate": 5.741090494673386e-06,
|
|
"loss": 0.4098,
|
|
"mean_token_accuracy": 0.8566766679286957,
|
|
"num_tokens": 313055977.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.4152984619140625,
|
|
"epoch": 2.8968253968253967,
|
|
"grad_norm": 0.8035675568742273,
|
|
"learning_rate": 5.719447725870071e-06,
|
|
"loss": 0.417,
|
|
"mean_token_accuracy": 0.8535463376902044,
|
|
"num_tokens": 313934488.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.41644287109375,
|
|
"epoch": 2.9047619047619047,
|
|
"grad_norm": 0.809344160354769,
|
|
"learning_rate": 5.697791183405174e-06,
|
|
"loss": 0.4123,
|
|
"mean_token_accuracy": 0.8555832463316619,
|
|
"num_tokens": 314782888.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.4123077392578125,
|
|
"epoch": 2.9126984126984126,
|
|
"grad_norm": 0.7542255175949691,
|
|
"learning_rate": 5.67612128188823e-06,
|
|
"loss": 0.4042,
|
|
"mean_token_accuracy": 0.8586938725784421,
|
|
"num_tokens": 315667111.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.418426513671875,
|
|
"epoch": 2.9206349206349205,
|
|
"grad_norm": 0.7325186075881142,
|
|
"learning_rate": 5.654438436184531e-06,
|
|
"loss": 0.41,
|
|
"mean_token_accuracy": 0.8550154692493379,
|
|
"num_tokens": 316519645.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.4178619384765625,
|
|
"epoch": 2.928571428571429,
|
|
"grad_norm": 0.764847574977915,
|
|
"learning_rate": 5.6327430614071794e-06,
|
|
"loss": 0.409,
|
|
"mean_token_accuracy": 0.8574743596836925,
|
|
"num_tokens": 317376914.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.4179229736328125,
|
|
"epoch": 2.9365079365079367,
|
|
"grad_norm": 0.7979585773178869,
|
|
"learning_rate": 5.611035572909147e-06,
|
|
"loss": 0.4116,
|
|
"mean_token_accuracy": 0.8546005864627659,
|
|
"num_tokens": 318210944.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.411712646484375,
|
|
"epoch": 2.9444444444444446,
|
|
"grad_norm": 0.7465872378991787,
|
|
"learning_rate": 5.589316386275318e-06,
|
|
"loss": 0.4127,
|
|
"mean_token_accuracy": 0.8551405002363026,
|
|
"num_tokens": 319072977.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.413848876953125,
|
|
"epoch": 2.9523809523809526,
|
|
"grad_norm": 0.7670391280421824,
|
|
"learning_rate": 5.567585917314535e-06,
|
|
"loss": 0.4085,
|
|
"mean_token_accuracy": 0.8564633526839316,
|
|
"num_tokens": 319936836.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.41168212890625,
|
|
"epoch": 2.9603174603174605,
|
|
"grad_norm": 0.8099483587987164,
|
|
"learning_rate": 5.545844582051641e-06,
|
|
"loss": 0.4053,
|
|
"mean_token_accuracy": 0.8578686797991395,
|
|
"num_tokens": 320807541.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.4144439697265625,
|
|
"epoch": 2.9682539682539684,
|
|
"grad_norm": 0.8134336846221772,
|
|
"learning_rate": 5.524092796719507e-06,
|
|
"loss": 0.4096,
|
|
"mean_token_accuracy": 0.8564304136671126,
|
|
"num_tokens": 321676330.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.41241455078125,
|
|
"epoch": 2.9761904761904763,
|
|
"grad_norm": 0.7501989763747119,
|
|
"learning_rate": 5.502330977751072e-06,
|
|
"loss": 0.4012,
|
|
"mean_token_accuracy": 0.8606314528733492,
|
|
"num_tokens": 322526195.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.4184417724609375,
|
|
"epoch": 2.984126984126984,
|
|
"grad_norm": 0.8084127993444857,
|
|
"learning_rate": 5.4805595417713634e-06,
|
|
"loss": 0.4129,
|
|
"mean_token_accuracy": 0.854987567756325,
|
|
"num_tokens": 323373321.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.413482666015625,
|
|
"epoch": 2.992063492063492,
|
|
"grad_norm": 0.8921476455980862,
|
|
"learning_rate": 5.458778905589528e-06,
|
|
"loss": 0.4048,
|
|
"mean_token_accuracy": 0.8568636071868241,
|
|
"num_tokens": 324241487.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.413818359375,
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.7212429646275152,
|
|
"learning_rate": 5.436989486190846e-06,
|
|
"loss": 0.4132,
|
|
"mean_token_accuracy": 0.8552796910516918,
|
|
"num_tokens": 325114310.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.41851806640625,
|
|
"epoch": 3.007936507936508,
|
|
"grad_norm": 0.8309684413468622,
|
|
"learning_rate": 5.415191700728749e-06,
|
|
"loss": 0.3803,
|
|
"mean_token_accuracy": 0.8651906503364444,
|
|
"num_tokens": 325956929.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.415771484375,
|
|
"epoch": 3.015873015873016,
|
|
"grad_norm": 0.875627308634879,
|
|
"learning_rate": 5.393385966516838e-06,
|
|
"loss": 0.3949,
|
|
"mean_token_accuracy": 0.8609235784970224,
|
|
"num_tokens": 326825247.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.4132232666015625,
|
|
"epoch": 3.0238095238095237,
|
|
"grad_norm": 0.7660187349203336,
|
|
"learning_rate": 5.371572701020891e-06,
|
|
"loss": 0.3843,
|
|
"mean_token_accuracy": 0.865902341902256,
|
|
"num_tokens": 327664768.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.4121246337890625,
|
|
"epoch": 3.0317460317460316,
|
|
"grad_norm": 0.876168435455912,
|
|
"learning_rate": 5.349752321850866e-06,
|
|
"loss": 0.3891,
|
|
"mean_token_accuracy": 0.8622553567402065,
|
|
"num_tokens": 328521474.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.4065399169921875,
|
|
"epoch": 3.0396825396825395,
|
|
"grad_norm": 0.8349975533992948,
|
|
"learning_rate": 5.327925246752917e-06,
|
|
"loss": 0.3871,
|
|
"mean_token_accuracy": 0.8634061855264008,
|
|
"num_tokens": 329375199.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.41015625,
|
|
"epoch": 3.0476190476190474,
|
|
"grad_norm": 0.7379532101462078,
|
|
"learning_rate": 5.306091893601384e-06,
|
|
"loss": 0.3854,
|
|
"mean_token_accuracy": 0.8652450819499791,
|
|
"num_tokens": 330238541.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.4120330810546875,
|
|
"epoch": 3.0555555555555554,
|
|
"grad_norm": 0.7995763856973052,
|
|
"learning_rate": 5.284252680390803e-06,
|
|
"loss": 0.3919,
|
|
"mean_token_accuracy": 0.8609937699511647,
|
|
"num_tokens": 331111401.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.412261962890625,
|
|
"epoch": 3.0634920634920633,
|
|
"grad_norm": 0.7968948726175992,
|
|
"learning_rate": 5.2624080252279006e-06,
|
|
"loss": 0.3891,
|
|
"mean_token_accuracy": 0.8621770567260683,
|
|
"num_tokens": 331970501.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.412689208984375,
|
|
"epoch": 3.0714285714285716,
|
|
"grad_norm": 0.7567371102755298,
|
|
"learning_rate": 5.240558346323582e-06,
|
|
"loss": 0.388,
|
|
"mean_token_accuracy": 0.8623263705521822,
|
|
"num_tokens": 332839444.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.4113616943359375,
|
|
"epoch": 3.0793650793650795,
|
|
"grad_norm": 0.8253393750303774,
|
|
"learning_rate": 5.218704061984938e-06,
|
|
"loss": 0.3805,
|
|
"mean_token_accuracy": 0.8644652073271573,
|
|
"num_tokens": 333694157.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.4080963134765625,
|
|
"epoch": 3.0873015873015874,
|
|
"grad_norm": 0.8150357016122449,
|
|
"learning_rate": 5.196845590607225e-06,
|
|
"loss": 0.3778,
|
|
"mean_token_accuracy": 0.8659757277928293,
|
|
"num_tokens": 334553848.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.411529541015625,
|
|
"epoch": 3.0952380952380953,
|
|
"grad_norm": 0.810657705832448,
|
|
"learning_rate": 5.174983350665861e-06,
|
|
"loss": 0.3837,
|
|
"mean_token_accuracy": 0.862535847350955,
|
|
"num_tokens": 335414382.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.4155731201171875,
|
|
"epoch": 3.1031746031746033,
|
|
"grad_norm": 0.7578031832169086,
|
|
"learning_rate": 5.153117760708411e-06,
|
|
"loss": 0.388,
|
|
"mean_token_accuracy": 0.8647559527307749,
|
|
"num_tokens": 336270013.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.4129638671875,
|
|
"epoch": 3.111111111111111,
|
|
"grad_norm": 0.7853251945488365,
|
|
"learning_rate": 5.131249239346574e-06,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.8632673225365579,
|
|
"num_tokens": 337153945.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.41741943359375,
|
|
"epoch": 3.119047619047619,
|
|
"grad_norm": 0.8526319008895792,
|
|
"learning_rate": 5.109378205248177e-06,
|
|
"loss": 0.3813,
|
|
"mean_token_accuracy": 0.8653798257000744,
|
|
"num_tokens": 337986623.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.41943359375,
|
|
"epoch": 3.126984126984127,
|
|
"grad_norm": 0.7777939267453691,
|
|
"learning_rate": 5.087505077129144e-06,
|
|
"loss": 0.3847,
|
|
"mean_token_accuracy": 0.8638610797934234,
|
|
"num_tokens": 338820053.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.4080352783203125,
|
|
"epoch": 3.134920634920635,
|
|
"grad_norm": 0.8050487820823641,
|
|
"learning_rate": 5.065630273745495e-06,
|
|
"loss": 0.391,
|
|
"mean_token_accuracy": 0.8619571630842984,
|
|
"num_tokens": 339709184.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.4158172607421875,
|
|
"epoch": 3.142857142857143,
|
|
"grad_norm": 0.788047324475754,
|
|
"learning_rate": 5.043754213885319e-06,
|
|
"loss": 0.3806,
|
|
"mean_token_accuracy": 0.8652480882592499,
|
|
"num_tokens": 340560422.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.4126739501953125,
|
|
"epoch": 3.1507936507936507,
|
|
"grad_norm": 0.7895581256609918,
|
|
"learning_rate": 5.021877316360759e-06,
|
|
"loss": 0.3857,
|
|
"mean_token_accuracy": 0.8641035025939345,
|
|
"num_tokens": 341427547.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.4117431640625,
|
|
"epoch": 3.1587301587301586,
|
|
"grad_norm": 0.7885237864621762,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3809,
|
|
"mean_token_accuracy": 0.8656587679870427,
|
|
"num_tokens": 342297536.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.42266845703125,
|
|
"epoch": 3.1666666666666665,
|
|
"grad_norm": 0.7347090872403708,
|
|
"learning_rate": 4.978122683639241e-06,
|
|
"loss": 0.3797,
|
|
"mean_token_accuracy": 0.8654965776950121,
|
|
"num_tokens": 343108738.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.41461181640625,
|
|
"epoch": 3.1746031746031744,
|
|
"grad_norm": 0.7919607175007582,
|
|
"learning_rate": 4.956245786114683e-06,
|
|
"loss": 0.3805,
|
|
"mean_token_accuracy": 0.8652189085260034,
|
|
"num_tokens": 343963294.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.4164276123046875,
|
|
"epoch": 3.1825396825396823,
|
|
"grad_norm": 0.7982890026738051,
|
|
"learning_rate": 4.934369726254506e-06,
|
|
"loss": 0.3839,
|
|
"mean_token_accuracy": 0.8629119992256165,
|
|
"num_tokens": 344812036.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.4126739501953125,
|
|
"epoch": 3.1904761904761907,
|
|
"grad_norm": 0.7923716094274679,
|
|
"learning_rate": 4.9124949228708566e-06,
|
|
"loss": 0.385,
|
|
"mean_token_accuracy": 0.8645118903368711,
|
|
"num_tokens": 345643006.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.41131591796875,
|
|
"epoch": 3.1984126984126986,
|
|
"grad_norm": 0.8224201397236163,
|
|
"learning_rate": 4.890621794751825e-06,
|
|
"loss": 0.3781,
|
|
"mean_token_accuracy": 0.8670813706703484,
|
|
"num_tokens": 346526093.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.4088134765625,
|
|
"epoch": 3.2063492063492065,
|
|
"grad_norm": 0.748278286726127,
|
|
"learning_rate": 4.8687507606534274e-06,
|
|
"loss": 0.3869,
|
|
"mean_token_accuracy": 0.8644262808375061,
|
|
"num_tokens": 347429334.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.4080810546875,
|
|
"epoch": 3.2142857142857144,
|
|
"grad_norm": 0.7655034553814312,
|
|
"learning_rate": 4.8468822392915925e-06,
|
|
"loss": 0.3879,
|
|
"mean_token_accuracy": 0.86290636472404,
|
|
"num_tokens": 348300345.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.41357421875,
|
|
"epoch": 3.2222222222222223,
|
|
"grad_norm": 0.8371130749450066,
|
|
"learning_rate": 4.82501664933414e-06,
|
|
"loss": 0.3895,
|
|
"mean_token_accuracy": 0.8630081634037197,
|
|
"num_tokens": 349174728.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.414794921875,
|
|
"epoch": 3.2301587301587302,
|
|
"grad_norm": 0.7863015466902707,
|
|
"learning_rate": 4.803154409392776e-06,
|
|
"loss": 0.3827,
|
|
"mean_token_accuracy": 0.864469132386148,
|
|
"num_tokens": 350019662.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.411102294921875,
|
|
"epoch": 3.238095238095238,
|
|
"grad_norm": 0.7465717328173177,
|
|
"learning_rate": 4.781295938015063e-06,
|
|
"loss": 0.3831,
|
|
"mean_token_accuracy": 0.8645726442337036,
|
|
"num_tokens": 350867252.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.41094970703125,
|
|
"epoch": 3.246031746031746,
|
|
"grad_norm": 0.7655319214119843,
|
|
"learning_rate": 4.759441653676419e-06,
|
|
"loss": 0.3788,
|
|
"mean_token_accuracy": 0.8645767103880644,
|
|
"num_tokens": 351713015.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.4076080322265625,
|
|
"epoch": 3.253968253968254,
|
|
"grad_norm": 0.7922787284316188,
|
|
"learning_rate": 4.737591974772102e-06,
|
|
"loss": 0.383,
|
|
"mean_token_accuracy": 0.8641512831673026,
|
|
"num_tokens": 352612442.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.41400146484375,
|
|
"epoch": 3.261904761904762,
|
|
"grad_norm": 0.7897648138284737,
|
|
"learning_rate": 4.715747319609199e-06,
|
|
"loss": 0.3808,
|
|
"mean_token_accuracy": 0.8653241745196283,
|
|
"num_tokens": 353475199.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.4135589599609375,
|
|
"epoch": 3.2698412698412698,
|
|
"grad_norm": 0.7900462489736286,
|
|
"learning_rate": 4.693908106398617e-06,
|
|
"loss": 0.3805,
|
|
"mean_token_accuracy": 0.8655834072269499,
|
|
"num_tokens": 354336321.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.413421630859375,
|
|
"epoch": 3.2777777777777777,
|
|
"grad_norm": 0.8361561340006152,
|
|
"learning_rate": 4.6720747532470845e-06,
|
|
"loss": 0.3909,
|
|
"mean_token_accuracy": 0.8628287450410426,
|
|
"num_tokens": 355204238.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.41632080078125,
|
|
"epoch": 3.2857142857142856,
|
|
"grad_norm": 0.7192804861618904,
|
|
"learning_rate": 4.650247678149135e-06,
|
|
"loss": 0.3822,
|
|
"mean_token_accuracy": 0.8637247635051608,
|
|
"num_tokens": 356039432.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.417022705078125,
|
|
"epoch": 3.2936507936507935,
|
|
"grad_norm": 0.7884051950710803,
|
|
"learning_rate": 4.628427298979111e-06,
|
|
"loss": 0.3834,
|
|
"mean_token_accuracy": 0.8649981459602714,
|
|
"num_tokens": 356872476.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.4183197021484375,
|
|
"epoch": 3.3015873015873014,
|
|
"grad_norm": 0.7660377464634954,
|
|
"learning_rate": 4.606614033483164e-06,
|
|
"loss": 0.3733,
|
|
"mean_token_accuracy": 0.8667362979613245,
|
|
"num_tokens": 357702880.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.4095458984375,
|
|
"epoch": 3.3095238095238093,
|
|
"grad_norm": 1.502576186555255,
|
|
"learning_rate": 4.5848082992712516e-06,
|
|
"loss": 0.3851,
|
|
"mean_token_accuracy": 0.8638884532265365,
|
|
"num_tokens": 358593212.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.411895751953125,
|
|
"epoch": 3.317460317460317,
|
|
"grad_norm": 0.8516989355397148,
|
|
"learning_rate": 4.563010513809156e-06,
|
|
"loss": 0.374,
|
|
"mean_token_accuracy": 0.8689160253852606,
|
|
"num_tokens": 359427196.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.415771484375,
|
|
"epoch": 3.3253968253968256,
|
|
"grad_norm": 0.8333941577643297,
|
|
"learning_rate": 4.541221094410473e-06,
|
|
"loss": 0.3886,
|
|
"mean_token_accuracy": 0.8632900207303464,
|
|
"num_tokens": 360297417.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.4151153564453125,
|
|
"epoch": 3.3333333333333335,
|
|
"grad_norm": 0.9185443535766896,
|
|
"learning_rate": 4.519440458228638e-06,
|
|
"loss": 0.3929,
|
|
"mean_token_accuracy": 0.8609873973764479,
|
|
"num_tokens": 361158991.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.407745361328125,
|
|
"epoch": 3.3412698412698414,
|
|
"grad_norm": 0.7923581076176655,
|
|
"learning_rate": 4.497669022248931e-06,
|
|
"loss": 0.3768,
|
|
"mean_token_accuracy": 0.86583196464926,
|
|
"num_tokens": 362026434.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.4108734130859375,
|
|
"epoch": 3.3492063492063493,
|
|
"grad_norm": 0.8878219569934822,
|
|
"learning_rate": 4.475907203280494e-06,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.862205957993865,
|
|
"num_tokens": 362894932.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.4105682373046875,
|
|
"epoch": 3.357142857142857,
|
|
"grad_norm": 0.8849923219918312,
|
|
"learning_rate": 4.45415541794836e-06,
|
|
"loss": 0.3861,
|
|
"mean_token_accuracy": 0.8655079673044384,
|
|
"num_tokens": 363774304.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.415740966796875,
|
|
"epoch": 3.365079365079365,
|
|
"grad_norm": 0.7854653600781764,
|
|
"learning_rate": 4.432414082685466e-06,
|
|
"loss": 0.3759,
|
|
"mean_token_accuracy": 0.8663219287991524,
|
|
"num_tokens": 364624495.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.4079132080078125,
|
|
"epoch": 3.373015873015873,
|
|
"grad_norm": 0.7932914584936744,
|
|
"learning_rate": 4.410683613724684e-06,
|
|
"loss": 0.3827,
|
|
"mean_token_accuracy": 0.8652459299191833,
|
|
"num_tokens": 365522341.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.4139251708984375,
|
|
"epoch": 3.380952380952381,
|
|
"grad_norm": 0.8623623236366137,
|
|
"learning_rate": 4.388964427090855e-06,
|
|
"loss": 0.3818,
|
|
"mean_token_accuracy": 0.8645171159878373,
|
|
"num_tokens": 366384352.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.406494140625,
|
|
"epoch": 3.388888888888889,
|
|
"grad_norm": 0.8905833082641034,
|
|
"learning_rate": 4.367256938592822e-06,
|
|
"loss": 0.3883,
|
|
"mean_token_accuracy": 0.8634491441771388,
|
|
"num_tokens": 367231565.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.409393310546875,
|
|
"epoch": 3.3968253968253967,
|
|
"grad_norm": 0.8362209111342414,
|
|
"learning_rate": 4.345561563815471e-06,
|
|
"loss": 0.3722,
|
|
"mean_token_accuracy": 0.8681328790262341,
|
|
"num_tokens": 368105765.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.4090118408203125,
|
|
"epoch": 3.4047619047619047,
|
|
"grad_norm": 0.7875786753075623,
|
|
"learning_rate": 4.323878718111771e-06,
|
|
"loss": 0.3815,
|
|
"mean_token_accuracy": 0.8649689424782991,
|
|
"num_tokens": 368976566.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.4091339111328125,
|
|
"epoch": 3.4126984126984126,
|
|
"grad_norm": 0.8242239527343677,
|
|
"learning_rate": 4.302208816594829e-06,
|
|
"loss": 0.3775,
|
|
"mean_token_accuracy": 0.8660351554863155,
|
|
"num_tokens": 369847661.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.4122161865234375,
|
|
"epoch": 3.4206349206349205,
|
|
"grad_norm": 0.8253724820932592,
|
|
"learning_rate": 4.280552274129932e-06,
|
|
"loss": 0.3832,
|
|
"mean_token_accuracy": 0.8640619218349457,
|
|
"num_tokens": 370716519.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.4095916748046875,
|
|
"epoch": 3.4285714285714284,
|
|
"grad_norm": 0.7522197274851726,
|
|
"learning_rate": 4.258909505326617e-06,
|
|
"loss": 0.3747,
|
|
"mean_token_accuracy": 0.8674253430217505,
|
|
"num_tokens": 371548994.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.4106597900390625,
|
|
"epoch": 3.4365079365079367,
|
|
"grad_norm": 0.7580899790250368,
|
|
"learning_rate": 4.237280924530723e-06,
|
|
"loss": 0.3731,
|
|
"mean_token_accuracy": 0.8671010048128664,
|
|
"num_tokens": 372388644.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.4091644287109375,
|
|
"epoch": 3.4444444444444446,
|
|
"grad_norm": 0.7966902079320883,
|
|
"learning_rate": 4.215666945816469e-06,
|
|
"loss": 0.3824,
|
|
"mean_token_accuracy": 0.8636285294778645,
|
|
"num_tokens": 373216255.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.4067840576171875,
|
|
"epoch": 3.4523809523809526,
|
|
"grad_norm": 0.7259667199168536,
|
|
"learning_rate": 4.194067982978516e-06,
|
|
"loss": 0.3744,
|
|
"mean_token_accuracy": 0.8671418204903603,
|
|
"num_tokens": 374091119.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.4103546142578125,
|
|
"epoch": 3.4603174603174605,
|
|
"grad_norm": 0.7168906340219251,
|
|
"learning_rate": 4.172484449524047e-06,
|
|
"loss": 0.3806,
|
|
"mean_token_accuracy": 0.8649011980742216,
|
|
"num_tokens": 374964989.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.4099578857421875,
|
|
"epoch": 3.4682539682539684,
|
|
"grad_norm": 0.7226054435885343,
|
|
"learning_rate": 4.150916758664857e-06,
|
|
"loss": 0.3743,
|
|
"mean_token_accuracy": 0.86769935535267,
|
|
"num_tokens": 375849312.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.4129791259765625,
|
|
"epoch": 3.4761904761904763,
|
|
"grad_norm": 0.7254203102984561,
|
|
"learning_rate": 4.129365323309436e-06,
|
|
"loss": 0.3886,
|
|
"mean_token_accuracy": 0.8636708622798324,
|
|
"num_tokens": 376711755.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.4055328369140625,
|
|
"epoch": 3.484126984126984,
|
|
"grad_norm": 0.7258886081350053,
|
|
"learning_rate": 4.107830556055072e-06,
|
|
"loss": 0.377,
|
|
"mean_token_accuracy": 0.865284236613661,
|
|
"num_tokens": 377601829.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.4116363525390625,
|
|
"epoch": 3.492063492063492,
|
|
"grad_norm": 0.7385419803324385,
|
|
"learning_rate": 4.086312869179938e-06,
|
|
"loss": 0.3811,
|
|
"mean_token_accuracy": 0.8655071114189923,
|
|
"num_tokens": 378449007.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.409393310546875,
|
|
"epoch": 3.5,
|
|
"grad_norm": 0.7975589365886925,
|
|
"learning_rate": 4.06481267463521e-06,
|
|
"loss": 0.3746,
|
|
"mean_token_accuracy": 0.867948766797781,
|
|
"num_tokens": 379309394.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.4044036865234375,
|
|
"epoch": 3.507936507936508,
|
|
"grad_norm": 0.7484471478807218,
|
|
"learning_rate": 4.04333038403718e-06,
|
|
"loss": 0.3755,
|
|
"mean_token_accuracy": 0.8677190546877682,
|
|
"num_tokens": 380174726.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.406707763671875,
|
|
"epoch": 3.515873015873016,
|
|
"grad_norm": 0.7052307989651647,
|
|
"learning_rate": 4.021866408659368e-06,
|
|
"loss": 0.3766,
|
|
"mean_token_accuracy": 0.8668166692368686,
|
|
"num_tokens": 381047802.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.4095001220703125,
|
|
"epoch": 3.5238095238095237,
|
|
"grad_norm": 0.7205324024463486,
|
|
"learning_rate": 4.000421159424658e-06,
|
|
"loss": 0.3782,
|
|
"mean_token_accuracy": 0.8670969372615218,
|
|
"num_tokens": 381900519.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.40911865234375,
|
|
"epoch": 3.5317460317460316,
|
|
"grad_norm": 0.6751185639712526,
|
|
"learning_rate": 3.978995046897425e-06,
|
|
"loss": 0.3811,
|
|
"mean_token_accuracy": 0.8652258133515716,
|
|
"num_tokens": 382738529.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.4097747802734375,
|
|
"epoch": 3.5396825396825395,
|
|
"grad_norm": 0.8003232347426622,
|
|
"learning_rate": 3.957588481275674e-06,
|
|
"loss": 0.3813,
|
|
"mean_token_accuracy": 0.8646343694999814,
|
|
"num_tokens": 383603819.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.4104156494140625,
|
|
"epoch": 3.5476190476190474,
|
|
"grad_norm": 0.7612125218536709,
|
|
"learning_rate": 3.9362018723831915e-06,
|
|
"loss": 0.3834,
|
|
"mean_token_accuracy": 0.8642422612756491,
|
|
"num_tokens": 384466493.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.40789794921875,
|
|
"epoch": 3.5555555555555554,
|
|
"grad_norm": 0.7301586930422078,
|
|
"learning_rate": 3.914835629661695e-06,
|
|
"loss": 0.3691,
|
|
"mean_token_accuracy": 0.8685760577209294,
|
|
"num_tokens": 385303493.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.4090576171875,
|
|
"epoch": 3.5634920634920633,
|
|
"grad_norm": 0.8028620237168601,
|
|
"learning_rate": 3.893490162162997e-06,
|
|
"loss": 0.3772,
|
|
"mean_token_accuracy": 0.8661560285836458,
|
|
"num_tokens": 386139059.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.4043426513671875,
|
|
"epoch": 3.571428571428571,
|
|
"grad_norm": 0.8038126363701456,
|
|
"learning_rate": 3.872165878541175e-06,
|
|
"loss": 0.3819,
|
|
"mean_token_accuracy": 0.8657438950613141,
|
|
"num_tokens": 387035788.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.4053955078125,
|
|
"epoch": 3.5793650793650795,
|
|
"grad_norm": 0.7971696805205959,
|
|
"learning_rate": 3.850863187044745e-06,
|
|
"loss": 0.3783,
|
|
"mean_token_accuracy": 0.8658295255154371,
|
|
"num_tokens": 387893370.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.4118804931640625,
|
|
"epoch": 3.5873015873015874,
|
|
"grad_norm": 0.7382565780061542,
|
|
"learning_rate": 3.829582495508844e-06,
|
|
"loss": 0.3774,
|
|
"mean_token_accuracy": 0.8669222141616046,
|
|
"num_tokens": 388743564.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.4039764404296875,
|
|
"epoch": 3.5952380952380953,
|
|
"grad_norm": 0.762934354969153,
|
|
"learning_rate": 3.808324211347429e-06,
|
|
"loss": 0.3846,
|
|
"mean_token_accuracy": 0.8625594675540924,
|
|
"num_tokens": 389636739.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.4062957763671875,
|
|
"epoch": 3.6031746031746033,
|
|
"grad_norm": 0.7869576244237314,
|
|
"learning_rate": 3.7870887415454687e-06,
|
|
"loss": 0.3772,
|
|
"mean_token_accuracy": 0.8657813919708133,
|
|
"num_tokens": 390510641.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.4098968505859375,
|
|
"epoch": 3.611111111111111,
|
|
"grad_norm": 0.7351333487875162,
|
|
"learning_rate": 3.7658764926511613e-06,
|
|
"loss": 0.3659,
|
|
"mean_token_accuracy": 0.868850149679929,
|
|
"num_tokens": 391336036.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.40362548828125,
|
|
"epoch": 3.619047619047619,
|
|
"grad_norm": 0.734375379903563,
|
|
"learning_rate": 3.7446878707681413e-06,
|
|
"loss": 0.373,
|
|
"mean_token_accuracy": 0.8676298609934747,
|
|
"num_tokens": 392197671.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.4044189453125,
|
|
"epoch": 3.626984126984127,
|
|
"grad_norm": 0.8070921691017913,
|
|
"learning_rate": 3.7235232815477123e-06,
|
|
"loss": 0.3723,
|
|
"mean_token_accuracy": 0.8680153395980597,
|
|
"num_tokens": 393048877.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.4051971435546875,
|
|
"epoch": 3.634920634920635,
|
|
"grad_norm": 0.7382078569033306,
|
|
"learning_rate": 3.7023831301810765e-06,
|
|
"loss": 0.3807,
|
|
"mean_token_accuracy": 0.8656099583022296,
|
|
"num_tokens": 393913386.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.412567138671875,
|
|
"epoch": 3.642857142857143,
|
|
"grad_norm": 0.7500851651198606,
|
|
"learning_rate": 3.6812678213915777e-06,
|
|
"loss": 0.3753,
|
|
"mean_token_accuracy": 0.866928874514997,
|
|
"num_tokens": 394741069.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.4068450927734375,
|
|
"epoch": 3.6507936507936507,
|
|
"grad_norm": 0.7014301015459137,
|
|
"learning_rate": 3.6601777594269605e-06,
|
|
"loss": 0.3716,
|
|
"mean_token_accuracy": 0.8681608587503433,
|
|
"num_tokens": 395587154.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.4112701416015625,
|
|
"epoch": 3.6587301587301586,
|
|
"grad_norm": 0.7453844502592517,
|
|
"learning_rate": 3.6391133480516196e-06,
|
|
"loss": 0.37,
|
|
"mean_token_accuracy": 0.8680603308603168,
|
|
"num_tokens": 396429106.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.40789794921875,
|
|
"epoch": 3.6666666666666665,
|
|
"grad_norm": 0.6714935595868241,
|
|
"learning_rate": 3.618074990538873e-06,
|
|
"loss": 0.3764,
|
|
"mean_token_accuracy": 0.8662193124182522,
|
|
"num_tokens": 397306106.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.404541015625,
|
|
"epoch": 3.674603174603175,
|
|
"grad_norm": 0.7264996158823219,
|
|
"learning_rate": 3.5970630896632485e-06,
|
|
"loss": 0.3679,
|
|
"mean_token_accuracy": 0.8692012121900916,
|
|
"num_tokens": 398186044.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.4099273681640625,
|
|
"epoch": 3.682539682539683,
|
|
"grad_norm": 0.6812527161900478,
|
|
"learning_rate": 3.5760780476927637e-06,
|
|
"loss": 0.3783,
|
|
"mean_token_accuracy": 0.866292960010469,
|
|
"num_tokens": 399059367.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.4114532470703125,
|
|
"epoch": 3.6904761904761907,
|
|
"grad_norm": 0.7396202860678279,
|
|
"learning_rate": 3.5551202663812344e-06,
|
|
"loss": 0.3671,
|
|
"mean_token_accuracy": 0.8694238997995853,
|
|
"num_tokens": 399921480.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.41082763671875,
|
|
"epoch": 3.6984126984126986,
|
|
"grad_norm": 0.7505615061152298,
|
|
"learning_rate": 3.534190146960571e-06,
|
|
"loss": 0.3738,
|
|
"mean_token_accuracy": 0.8666451787576079,
|
|
"num_tokens": 400768716.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.407745361328125,
|
|
"epoch": 3.7063492063492065,
|
|
"grad_norm": 0.7716901466331328,
|
|
"learning_rate": 3.5132880901331067e-06,
|
|
"loss": 0.3836,
|
|
"mean_token_accuracy": 0.8653549118898809,
|
|
"num_tokens": 401643010.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.411224365234375,
|
|
"epoch": 3.7142857142857144,
|
|
"grad_norm": 0.7425696552972633,
|
|
"learning_rate": 3.492414496063921e-06,
|
|
"loss": 0.3699,
|
|
"mean_token_accuracy": 0.8682533628307283,
|
|
"num_tokens": 402482222.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.4105377197265625,
|
|
"epoch": 3.7222222222222223,
|
|
"grad_norm": 0.7549782563712677,
|
|
"learning_rate": 3.4715697643731828e-06,
|
|
"loss": 0.374,
|
|
"mean_token_accuracy": 0.8664119308814406,
|
|
"num_tokens": 403330184.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.4114990234375,
|
|
"epoch": 3.7301587301587302,
|
|
"grad_norm": 0.6882214771400156,
|
|
"learning_rate": 3.4507542941284933e-06,
|
|
"loss": 0.3772,
|
|
"mean_token_accuracy": 0.8662938089109957,
|
|
"num_tokens": 404170985.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.4065399169921875,
|
|
"epoch": 3.738095238095238,
|
|
"grad_norm": 0.7790709755576551,
|
|
"learning_rate": 3.4299684838372547e-06,
|
|
"loss": 0.3702,
|
|
"mean_token_accuracy": 0.8684107572771609,
|
|
"num_tokens": 405023111.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.4109344482421875,
|
|
"epoch": 3.746031746031746,
|
|
"grad_norm": 0.8999085919269414,
|
|
"learning_rate": 3.4092127314390354e-06,
|
|
"loss": 0.3733,
|
|
"mean_token_accuracy": 0.8679695804603398,
|
|
"num_tokens": 405909984.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.410552978515625,
|
|
"epoch": 3.753968253968254,
|
|
"grad_norm": 0.7852309398323011,
|
|
"learning_rate": 3.388487434297949e-06,
|
|
"loss": 0.3726,
|
|
"mean_token_accuracy": 0.868429503403604,
|
|
"num_tokens": 406762973.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.4063873291015625,
|
|
"epoch": 3.761904761904762,
|
|
"grad_norm": 0.7497329657708961,
|
|
"learning_rate": 3.3677929891950527e-06,
|
|
"loss": 0.3675,
|
|
"mean_token_accuracy": 0.8680104180239141,
|
|
"num_tokens": 407632013.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.4088287353515625,
|
|
"epoch": 3.7698412698412698,
|
|
"grad_norm": 0.9959878902569155,
|
|
"learning_rate": 3.347129792320748e-06,
|
|
"loss": 0.3803,
|
|
"mean_token_accuracy": 0.8661512886174023,
|
|
"num_tokens": 408479038.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.400726318359375,
|
|
"epoch": 3.7777777777777777,
|
|
"grad_norm": 0.7083662937552805,
|
|
"learning_rate": 3.3264982392671973e-06,
|
|
"loss": 0.3707,
|
|
"mean_token_accuracy": 0.8689758381806314,
|
|
"num_tokens": 409381423.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.4104156494140625,
|
|
"epoch": 3.7857142857142856,
|
|
"grad_norm": 0.7381897728110274,
|
|
"learning_rate": 3.3058987250207476e-06,
|
|
"loss": 0.3677,
|
|
"mean_token_accuracy": 0.869597565382719,
|
|
"num_tokens": 410237991.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.4114227294921875,
|
|
"epoch": 3.7936507936507935,
|
|
"grad_norm": 0.7658185988150478,
|
|
"learning_rate": 3.285331643954372e-06,
|
|
"loss": 0.3779,
|
|
"mean_token_accuracy": 0.8658644729293883,
|
|
"num_tokens": 411113505.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.4087371826171875,
|
|
"epoch": 3.8015873015873014,
|
|
"grad_norm": 0.6773211053527161,
|
|
"learning_rate": 3.2647973898201157e-06,
|
|
"loss": 0.3692,
|
|
"mean_token_accuracy": 0.8686946122907102,
|
|
"num_tokens": 411973035.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.41082763671875,
|
|
"epoch": 3.8095238095238093,
|
|
"grad_norm": 0.6795058958160979,
|
|
"learning_rate": 3.244296355741561e-06,
|
|
"loss": 0.3792,
|
|
"mean_token_accuracy": 0.864706945605576,
|
|
"num_tokens": 412804401.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.411712646484375,
|
|
"epoch": 3.817460317460317,
|
|
"grad_norm": 0.7172393778753827,
|
|
"learning_rate": 3.2238289342063013e-06,
|
|
"loss": 0.3741,
|
|
"mean_token_accuracy": 0.8672458734363317,
|
|
"num_tokens": 413645879.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.40325927734375,
|
|
"epoch": 3.825396825396825,
|
|
"grad_norm": 0.7137250398041949,
|
|
"learning_rate": 3.203395517058423e-06,
|
|
"loss": 0.3815,
|
|
"mean_token_accuracy": 0.8636117246933281,
|
|
"num_tokens": 414526310.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.40850830078125,
|
|
"epoch": 3.8333333333333335,
|
|
"grad_norm": 0.7060925413908444,
|
|
"learning_rate": 3.1829964954910076e-06,
|
|
"loss": 0.3744,
|
|
"mean_token_accuracy": 0.8668604497797787,
|
|
"num_tokens": 415386155.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.4058837890625,
|
|
"epoch": 3.8412698412698414,
|
|
"grad_norm": 0.7071475776511494,
|
|
"learning_rate": 3.1626322600386418e-06,
|
|
"loss": 0.369,
|
|
"mean_token_accuracy": 0.8673932519741356,
|
|
"num_tokens": 416255235.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.4070587158203125,
|
|
"epoch": 3.8492063492063493,
|
|
"grad_norm": 0.716470253050545,
|
|
"learning_rate": 3.1423032005699377e-06,
|
|
"loss": 0.3776,
|
|
"mean_token_accuracy": 0.8670607698149979,
|
|
"num_tokens": 417123335.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.4124298095703125,
|
|
"epoch": 3.857142857142857,
|
|
"grad_norm": 0.7331008247958041,
|
|
"learning_rate": 3.122009706280072e-06,
|
|
"loss": 0.3725,
|
|
"mean_token_accuracy": 0.8690087418071926,
|
|
"num_tokens": 417961712.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.407073974609375,
|
|
"epoch": 3.865079365079365,
|
|
"grad_norm": 0.7215581902390997,
|
|
"learning_rate": 3.1017521656833384e-06,
|
|
"loss": 0.3738,
|
|
"mean_token_accuracy": 0.8678515437059104,
|
|
"num_tokens": 418833226.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.4102020263671875,
|
|
"epoch": 3.873015873015873,
|
|
"grad_norm": 0.6750907563285418,
|
|
"learning_rate": 3.0815309666057013e-06,
|
|
"loss": 0.3798,
|
|
"mean_token_accuracy": 0.8680808427743614,
|
|
"num_tokens": 419693605.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.4107666015625,
|
|
"epoch": 3.880952380952381,
|
|
"grad_norm": 0.7253529027679292,
|
|
"learning_rate": 3.061346496177374e-06,
|
|
"loss": 0.3765,
|
|
"mean_token_accuracy": 0.8668225076980889,
|
|
"num_tokens": 420533126.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.403076171875,
|
|
"epoch": 3.888888888888889,
|
|
"grad_norm": 0.774847139075562,
|
|
"learning_rate": 3.0411991408254116e-06,
|
|
"loss": 0.3734,
|
|
"mean_token_accuracy": 0.8675551642663777,
|
|
"num_tokens": 421408158.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.407684326171875,
|
|
"epoch": 3.8968253968253967,
|
|
"grad_norm": 0.7430473963918918,
|
|
"learning_rate": 3.0210892862663043e-06,
|
|
"loss": 0.3669,
|
|
"mean_token_accuracy": 0.8696022001095116,
|
|
"num_tokens": 422285612.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.40545654296875,
|
|
"epoch": 3.9047619047619047,
|
|
"grad_norm": 0.7359917176077896,
|
|
"learning_rate": 3.001017317498607e-06,
|
|
"loss": 0.3654,
|
|
"mean_token_accuracy": 0.8683894919231534,
|
|
"num_tokens": 423145278.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.4095001220703125,
|
|
"epoch": 3.9126984126984126,
|
|
"grad_norm": 0.6995695516835777,
|
|
"learning_rate": 2.9809836187955532e-06,
|
|
"loss": 0.3759,
|
|
"mean_token_accuracy": 0.8677502269856632,
|
|
"num_tokens": 423982368.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.4052734375,
|
|
"epoch": 3.9206349206349205,
|
|
"grad_norm": 0.7865083776711185,
|
|
"learning_rate": 2.960988573697705e-06,
|
|
"loss": 0.3769,
|
|
"mean_token_accuracy": 0.8668651487678289,
|
|
"num_tokens": 424855995.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.4091949462890625,
|
|
"epoch": 3.928571428571429,
|
|
"grad_norm": 0.7069460296016946,
|
|
"learning_rate": 2.941032565005613e-06,
|
|
"loss": 0.3734,
|
|
"mean_token_accuracy": 0.8668679501861334,
|
|
"num_tokens": 425714116.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.4059906005859375,
|
|
"epoch": 3.9365079365079367,
|
|
"grad_norm": 0.7074879125016784,
|
|
"learning_rate": 2.9211159747724813e-06,
|
|
"loss": 0.3702,
|
|
"mean_token_accuracy": 0.86768330167979,
|
|
"num_tokens": 426587317.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.4085845947265625,
|
|
"epoch": 3.9444444444444446,
|
|
"grad_norm": 0.6887885189112549,
|
|
"learning_rate": 2.90123918429686e-06,
|
|
"loss": 0.3693,
|
|
"mean_token_accuracy": 0.8686679415404797,
|
|
"num_tokens": 427459701.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.4090728759765625,
|
|
"epoch": 3.9523809523809526,
|
|
"grad_norm": 0.7038820990298879,
|
|
"learning_rate": 2.881402574115344e-06,
|
|
"loss": 0.3693,
|
|
"mean_token_accuracy": 0.869040944147855,
|
|
"num_tokens": 428310157.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.406158447265625,
|
|
"epoch": 3.9603174603174605,
|
|
"grad_norm": 0.7223999433933582,
|
|
"learning_rate": 2.8616065239952763e-06,
|
|
"loss": 0.3706,
|
|
"mean_token_accuracy": 0.8684421242214739,
|
|
"num_tokens": 429155049.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.4044036865234375,
|
|
"epoch": 3.9682539682539684,
|
|
"grad_norm": 0.7016426279147887,
|
|
"learning_rate": 2.841851412927495e-06,
|
|
"loss": 0.3706,
|
|
"mean_token_accuracy": 0.8662050706334412,
|
|
"num_tokens": 430044250.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.405242919921875,
|
|
"epoch": 3.9761904761904763,
|
|
"grad_norm": 0.6864573418269565,
|
|
"learning_rate": 2.822137619119065e-06,
|
|
"loss": 0.365,
|
|
"mean_token_accuracy": 0.8711186717264354,
|
|
"num_tokens": 430927931.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.40313720703125,
|
|
"epoch": 3.984126984126984,
|
|
"grad_norm": 0.7183772106536512,
|
|
"learning_rate": 2.8024655199860495e-06,
|
|
"loss": 0.3682,
|
|
"mean_token_accuracy": 0.8691108208149672,
|
|
"num_tokens": 431777495.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.4088134765625,
|
|
"epoch": 3.992063492063492,
|
|
"grad_norm": 0.7147302557020478,
|
|
"learning_rate": 2.7828354921462668e-06,
|
|
"loss": 0.3622,
|
|
"mean_token_accuracy": 0.8704049359075725,
|
|
"num_tokens": 432616684.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.40557861328125,
|
|
"epoch": 4.0,
|
|
"grad_norm": 0.724778260633041,
|
|
"learning_rate": 2.7632479114120963e-06,
|
|
"loss": 0.367,
|
|
"mean_token_accuracy": 0.8679725076071918,
|
|
"num_tokens": 433464885.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.3993377685546875,
|
|
"epoch": 4.007936507936508,
|
|
"grad_norm": 0.7300051389709427,
|
|
"learning_rate": 2.7437031527832747e-06,
|
|
"loss": 0.3473,
|
|
"mean_token_accuracy": 0.8751431121490896,
|
|
"num_tokens": 434354281.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.40277099609375,
|
|
"epoch": 4.015873015873016,
|
|
"grad_norm": 0.7518820580172925,
|
|
"learning_rate": 2.72420159043972e-06,
|
|
"loss": 0.351,
|
|
"mean_token_accuracy": 0.8758351663127542,
|
|
"num_tokens": 435254037.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.40679931640625,
|
|
"epoch": 4.023809523809524,
|
|
"grad_norm": 0.75823317605801,
|
|
"learning_rate": 2.704743597734365e-06,
|
|
"loss": 0.3449,
|
|
"mean_token_accuracy": 0.8767664707265794,
|
|
"num_tokens": 436096499.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.4034576416015625,
|
|
"epoch": 4.031746031746032,
|
|
"grad_norm": 0.781839052148236,
|
|
"learning_rate": 2.685329547186018e-06,
|
|
"loss": 0.349,
|
|
"mean_token_accuracy": 0.8767024255357683,
|
|
"num_tokens": 436936261.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.3980255126953125,
|
|
"epoch": 4.0396825396825395,
|
|
"grad_norm": 0.8874815588264233,
|
|
"learning_rate": 2.665959810472219e-06,
|
|
"loss": 0.3457,
|
|
"mean_token_accuracy": 0.8768184627406299,
|
|
"num_tokens": 437789126.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.400390625,
|
|
"epoch": 4.0476190476190474,
|
|
"grad_norm": 0.8438997097584569,
|
|
"learning_rate": 2.6466347584221314e-06,
|
|
"loss": 0.3488,
|
|
"mean_token_accuracy": 0.8754238770343363,
|
|
"num_tokens": 438642725.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.399810791015625,
|
|
"epoch": 4.055555555555555,
|
|
"grad_norm": 0.7914559178878162,
|
|
"learning_rate": 2.6273547610094408e-06,
|
|
"loss": 0.3568,
|
|
"mean_token_accuracy": 0.8729163003154099,
|
|
"num_tokens": 439509332.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.4043426513671875,
|
|
"epoch": 4.063492063492063,
|
|
"grad_norm": 0.8018090810161909,
|
|
"learning_rate": 2.608120187345273e-06,
|
|
"loss": 0.3589,
|
|
"mean_token_accuracy": 0.8719246378168464,
|
|
"num_tokens": 440358824.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.40130615234375,
|
|
"epoch": 4.071428571428571,
|
|
"grad_norm": 0.717052204902591,
|
|
"learning_rate": 2.588931405671127e-06,
|
|
"loss": 0.347,
|
|
"mean_token_accuracy": 0.876106639392674,
|
|
"num_tokens": 441231571.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.40350341796875,
|
|
"epoch": 4.079365079365079,
|
|
"grad_norm": 0.7833684120919279,
|
|
"learning_rate": 2.5697887833518215e-06,
|
|
"loss": 0.3481,
|
|
"mean_token_accuracy": 0.874689971562475,
|
|
"num_tokens": 442070234.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.400054931640625,
|
|
"epoch": 4.087301587301587,
|
|
"grad_norm": 0.7624057899379104,
|
|
"learning_rate": 2.5506926868684683e-06,
|
|
"loss": 0.354,
|
|
"mean_token_accuracy": 0.8740922566503286,
|
|
"num_tokens": 442955553.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.4012908935546875,
|
|
"epoch": 4.095238095238095,
|
|
"grad_norm": 0.7656669668346728,
|
|
"learning_rate": 2.5316434818114517e-06,
|
|
"loss": 0.3412,
|
|
"mean_token_accuracy": 0.8769300729036331,
|
|
"num_tokens": 443803905.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.398406982421875,
|
|
"epoch": 4.103174603174603,
|
|
"grad_norm": 0.7384783736599843,
|
|
"learning_rate": 2.5126415328734275e-06,
|
|
"loss": 0.3549,
|
|
"mean_token_accuracy": 0.875963733997196,
|
|
"num_tokens": 444676270.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.3984222412109375,
|
|
"epoch": 4.111111111111111,
|
|
"grad_norm": 0.763214655206926,
|
|
"learning_rate": 2.4936872038423516e-06,
|
|
"loss": 0.3527,
|
|
"mean_token_accuracy": 0.8742309152148664,
|
|
"num_tokens": 445551871.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.4028167724609375,
|
|
"epoch": 4.119047619047619,
|
|
"grad_norm": 0.7566995169023817,
|
|
"learning_rate": 2.4747808575945006e-06,
|
|
"loss": 0.351,
|
|
"mean_token_accuracy": 0.8753285491839051,
|
|
"num_tokens": 446395494.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.4046173095703125,
|
|
"epoch": 4.1269841269841265,
|
|
"grad_norm": 0.7704383123576651,
|
|
"learning_rate": 2.4559228560875336e-06,
|
|
"loss": 0.3489,
|
|
"mean_token_accuracy": 0.8744379128329456,
|
|
"num_tokens": 447255878.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.4016876220703125,
|
|
"epoch": 4.134920634920635,
|
|
"grad_norm": 0.689868389625917,
|
|
"learning_rate": 2.4371135603535613e-06,
|
|
"loss": 0.3475,
|
|
"mean_token_accuracy": 0.8762855334207416,
|
|
"num_tokens": 448095199.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.4026947021484375,
|
|
"epoch": 4.142857142857143,
|
|
"grad_norm": 0.7668680601519664,
|
|
"learning_rate": 2.4183533304922336e-06,
|
|
"loss": 0.3459,
|
|
"mean_token_accuracy": 0.8746473412029445,
|
|
"num_tokens": 448954987.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.4035491943359375,
|
|
"epoch": 4.150793650793651,
|
|
"grad_norm": 0.72483989286729,
|
|
"learning_rate": 2.399642525663843e-06,
|
|
"loss": 0.3558,
|
|
"mean_token_accuracy": 0.8751249178312719,
|
|
"num_tokens": 449829424.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.3994140625,
|
|
"epoch": 4.158730158730159,
|
|
"grad_norm": 0.706119007803981,
|
|
"learning_rate": 2.380981504082459e-06,
|
|
"loss": 0.349,
|
|
"mean_token_accuracy": 0.875414258800447,
|
|
"num_tokens": 450685443.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.3993988037109375,
|
|
"epoch": 4.166666666666667,
|
|
"grad_norm": 0.7212946770565037,
|
|
"learning_rate": 2.3623706230090517e-06,
|
|
"loss": 0.3557,
|
|
"mean_token_accuracy": 0.8735111146233976,
|
|
"num_tokens": 451597512.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.4042205810546875,
|
|
"epoch": 4.174603174603175,
|
|
"grad_norm": 0.7005628568870317,
|
|
"learning_rate": 2.3438102387446686e-06,
|
|
"loss": 0.3469,
|
|
"mean_token_accuracy": 0.8763788240030408,
|
|
"num_tokens": 452424123.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.3973541259765625,
|
|
"epoch": 4.182539682539683,
|
|
"grad_norm": 0.7155495360306364,
|
|
"learning_rate": 2.325300706623607e-06,
|
|
"loss": 0.353,
|
|
"mean_token_accuracy": 0.8734072712250054,
|
|
"num_tokens": 453294509.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.4000091552734375,
|
|
"epoch": 4.190476190476191,
|
|
"grad_norm": 0.7006937562520844,
|
|
"learning_rate": 2.3068423810066085e-06,
|
|
"loss": 0.3528,
|
|
"mean_token_accuracy": 0.8746827309951186,
|
|
"num_tokens": 454176550.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"entropy": 0.404052734375,
|
|
"epoch": 4.198412698412699,
|
|
"grad_norm": 0.6995025837491133,
|
|
"learning_rate": 2.288435615274085e-06,
|
|
"loss": 0.3579,
|
|
"mean_token_accuracy": 0.8727600080892444,
|
|
"num_tokens": 455027872.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"entropy": 0.4065704345703125,
|
|
"epoch": 4.2063492063492065,
|
|
"grad_norm": 0.7479029235127206,
|
|
"learning_rate": 2.2700807618193393e-06,
|
|
"loss": 0.3416,
|
|
"mean_token_accuracy": 0.8783422014676034,
|
|
"num_tokens": 455894126.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.40460205078125,
|
|
"epoch": 4.214285714285714,
|
|
"grad_norm": 0.746817252036699,
|
|
"learning_rate": 2.251778172041828e-06,
|
|
"loss": 0.3455,
|
|
"mean_token_accuracy": 0.8771996637806296,
|
|
"num_tokens": 456741345.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"entropy": 0.402862548828125,
|
|
"epoch": 4.222222222222222,
|
|
"grad_norm": 0.6876078945117027,
|
|
"learning_rate": 2.2335281963404315e-06,
|
|
"loss": 0.3501,
|
|
"mean_token_accuracy": 0.8753550541587174,
|
|
"num_tokens": 457597774.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"entropy": 0.3993682861328125,
|
|
"epoch": 4.23015873015873,
|
|
"grad_norm": 0.7282590241527106,
|
|
"learning_rate": 2.2153311841067438e-06,
|
|
"loss": 0.3442,
|
|
"mean_token_accuracy": 0.8769136122427881,
|
|
"num_tokens": 458481487.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"entropy": 0.3993377685546875,
|
|
"epoch": 4.238095238095238,
|
|
"grad_norm": 0.7553930529448756,
|
|
"learning_rate": 2.1971874837183914e-06,
|
|
"loss": 0.3458,
|
|
"mean_token_accuracy": 0.8748537562787533,
|
|
"num_tokens": 459329943.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"entropy": 0.3927154541015625,
|
|
"epoch": 4.246031746031746,
|
|
"grad_norm": 0.7591301726961953,
|
|
"learning_rate": 2.179097442532352e-06,
|
|
"loss": 0.3394,
|
|
"mean_token_accuracy": 0.8797525470145047,
|
|
"num_tokens": 460196696.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 0.39837646484375,
|
|
"epoch": 4.253968253968254,
|
|
"grad_norm": 0.7626069369315728,
|
|
"learning_rate": 2.1610614068783112e-06,
|
|
"loss": 0.361,
|
|
"mean_token_accuracy": 0.8727622926235199,
|
|
"num_tokens": 461069051.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"entropy": 0.403106689453125,
|
|
"epoch": 4.261904761904762,
|
|
"grad_norm": 0.7708798701412393,
|
|
"learning_rate": 2.143079722052034e-06,
|
|
"loss": 0.3479,
|
|
"mean_token_accuracy": 0.8751750965602696,
|
|
"num_tokens": 461920899.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"entropy": 0.400177001953125,
|
|
"epoch": 4.26984126984127,
|
|
"grad_norm": 0.7389095619225245,
|
|
"learning_rate": 2.125152732308747e-06,
|
|
"loss": 0.3459,
|
|
"mean_token_accuracy": 0.8786491984501481,
|
|
"num_tokens": 462797643.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"entropy": 0.4058380126953125,
|
|
"epoch": 4.277777777777778,
|
|
"grad_norm": 0.665980057345897,
|
|
"learning_rate": 2.1072807808565547e-06,
|
|
"loss": 0.3501,
|
|
"mean_token_accuracy": 0.8757951087318361,
|
|
"num_tokens": 463640936.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"entropy": 0.40155029296875,
|
|
"epoch": 4.285714285714286,
|
|
"grad_norm": 0.7005429583368125,
|
|
"learning_rate": 2.0894642098498656e-06,
|
|
"loss": 0.3587,
|
|
"mean_token_accuracy": 0.8732366347685456,
|
|
"num_tokens": 464513012.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.4015350341796875,
|
|
"epoch": 4.2936507936507935,
|
|
"grad_norm": 0.6682325297585425,
|
|
"learning_rate": 2.0717033603828436e-06,
|
|
"loss": 0.3485,
|
|
"mean_token_accuracy": 0.8750377274118364,
|
|
"num_tokens": 465345613.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"entropy": 0.4040985107421875,
|
|
"epoch": 4.301587301587301,
|
|
"grad_norm": 0.7424671938333351,
|
|
"learning_rate": 2.0539985724828736e-06,
|
|
"loss": 0.3498,
|
|
"mean_token_accuracy": 0.875401156488806,
|
|
"num_tokens": 466181756.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"entropy": 0.401275634765625,
|
|
"epoch": 4.309523809523809,
|
|
"grad_norm": 0.695999191273483,
|
|
"learning_rate": 2.0363501851040573e-06,
|
|
"loss": 0.3436,
|
|
"mean_token_accuracy": 0.8764086258597672,
|
|
"num_tokens": 467035382.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"entropy": 0.40155029296875,
|
|
"epoch": 4.317460317460317,
|
|
"grad_norm": 0.6827039903530154,
|
|
"learning_rate": 2.0187585361207174e-06,
|
|
"loss": 0.3466,
|
|
"mean_token_accuracy": 0.8745089964941144,
|
|
"num_tokens": 467897340.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"entropy": 0.400177001953125,
|
|
"epoch": 4.325396825396825,
|
|
"grad_norm": 0.685211307868124,
|
|
"learning_rate": 2.001223962320941e-06,
|
|
"loss": 0.3517,
|
|
"mean_token_accuracy": 0.8753441325388849,
|
|
"num_tokens": 468764096.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 0.405242919921875,
|
|
"epoch": 4.333333333333333,
|
|
"grad_norm": 0.6682700395214807,
|
|
"learning_rate": 1.9837467994001165e-06,
|
|
"loss": 0.3457,
|
|
"mean_token_accuracy": 0.8773820898495615,
|
|
"num_tokens": 469610106.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"entropy": 0.399566650390625,
|
|
"epoch": 4.341269841269841,
|
|
"grad_norm": 0.6719545574593448,
|
|
"learning_rate": 1.9663273819545157e-06,
|
|
"loss": 0.3396,
|
|
"mean_token_accuracy": 0.8774642567150295,
|
|
"num_tokens": 470468046.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"entropy": 0.40380859375,
|
|
"epoch": 4.349206349206349,
|
|
"grad_norm": 0.7001669509560304,
|
|
"learning_rate": 1.948966043474889e-06,
|
|
"loss": 0.3458,
|
|
"mean_token_accuracy": 0.8756930027157068,
|
|
"num_tokens": 471309098.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"entropy": 0.396209716796875,
|
|
"epoch": 4.357142857142857,
|
|
"grad_norm": 0.75965811702668,
|
|
"learning_rate": 1.931663116340074e-06,
|
|
"loss": 0.3455,
|
|
"mean_token_accuracy": 0.8765083705075085,
|
|
"num_tokens": 472145738.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"entropy": 0.396392822265625,
|
|
"epoch": 4.365079365079365,
|
|
"grad_norm": 0.6756191619675378,
|
|
"learning_rate": 1.914418931810643e-06,
|
|
"loss": 0.3512,
|
|
"mean_token_accuracy": 0.8744937106966972,
|
|
"num_tokens": 473047197.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.401031494140625,
|
|
"epoch": 4.3730158730158735,
|
|
"grad_norm": 0.6965894626329614,
|
|
"learning_rate": 1.8972338200225509e-06,
|
|
"loss": 0.3421,
|
|
"mean_token_accuracy": 0.8775716116651893,
|
|
"num_tokens": 473907585.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"entropy": 0.4000091552734375,
|
|
"epoch": 4.380952380952381,
|
|
"grad_norm": 0.7076324681120165,
|
|
"learning_rate": 1.880108109980815e-06,
|
|
"loss": 0.3462,
|
|
"mean_token_accuracy": 0.8761595580726862,
|
|
"num_tokens": 474779332.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"entropy": 0.39825439453125,
|
|
"epoch": 4.388888888888889,
|
|
"grad_norm": 0.7050720543139621,
|
|
"learning_rate": 1.8630421295532252e-06,
|
|
"loss": 0.345,
|
|
"mean_token_accuracy": 0.8770742062479258,
|
|
"num_tokens": 475659187.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"entropy": 0.403778076171875,
|
|
"epoch": 4.396825396825397,
|
|
"grad_norm": 0.7340183406802493,
|
|
"learning_rate": 1.8460362054640573e-06,
|
|
"loss": 0.3478,
|
|
"mean_token_accuracy": 0.8751401146873832,
|
|
"num_tokens": 476487458.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"entropy": 0.39990234375,
|
|
"epoch": 4.404761904761905,
|
|
"grad_norm": 0.6861105491926857,
|
|
"learning_rate": 1.8290906632878297e-06,
|
|
"loss": 0.3431,
|
|
"mean_token_accuracy": 0.8780268509872258,
|
|
"num_tokens": 477345662.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 0.3991241455078125,
|
|
"epoch": 4.412698412698413,
|
|
"grad_norm": 0.7296898602599676,
|
|
"learning_rate": 1.8122058274430542e-06,
|
|
"loss": 0.3411,
|
|
"mean_token_accuracy": 0.8761810320429504,
|
|
"num_tokens": 478205977.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"entropy": 0.4037933349609375,
|
|
"epoch": 4.420634920634921,
|
|
"grad_norm": 1.0332210701383924,
|
|
"learning_rate": 1.7953820211860395e-06,
|
|
"loss": 0.356,
|
|
"mean_token_accuracy": 0.8737587067298591,
|
|
"num_tokens": 479048650.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"entropy": 0.3999481201171875,
|
|
"epoch": 4.428571428571429,
|
|
"grad_norm": 0.7091178286840939,
|
|
"learning_rate": 1.7786195666046935e-06,
|
|
"loss": 0.343,
|
|
"mean_token_accuracy": 0.8771154009737074,
|
|
"num_tokens": 479895873.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"entropy": 0.4032745361328125,
|
|
"epoch": 4.436507936507937,
|
|
"grad_norm": 0.6733078832793936,
|
|
"learning_rate": 1.7619187846123624e-06,
|
|
"loss": 0.3457,
|
|
"mean_token_accuracy": 0.8771998826414347,
|
|
"num_tokens": 480755429.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"entropy": 0.4007568359375,
|
|
"epoch": 4.444444444444445,
|
|
"grad_norm": 0.8490823775032588,
|
|
"learning_rate": 1.7452799949416833e-06,
|
|
"loss": 0.3517,
|
|
"mean_token_accuracy": 0.8754395125433803,
|
|
"num_tokens": 481608352.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 0.4008026123046875,
|
|
"epoch": 4.4523809523809526,
|
|
"grad_norm": 0.7225303298169462,
|
|
"learning_rate": 1.7287035161384673e-06,
|
|
"loss": 0.35,
|
|
"mean_token_accuracy": 0.8747482905164361,
|
|
"num_tokens": 482441149.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"entropy": 0.4021148681640625,
|
|
"epoch": 4.4603174603174605,
|
|
"grad_norm": 0.6624423396335506,
|
|
"learning_rate": 1.7121896655555958e-06,
|
|
"loss": 0.347,
|
|
"mean_token_accuracy": 0.8763077296316624,
|
|
"num_tokens": 483307531.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"entropy": 0.4007720947265625,
|
|
"epoch": 4.468253968253968,
|
|
"grad_norm": 0.6783795851745674,
|
|
"learning_rate": 1.695738759346947e-06,
|
|
"loss": 0.3516,
|
|
"mean_token_accuracy": 0.8752468260936439,
|
|
"num_tokens": 484156689.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"entropy": 0.3984375,
|
|
"epoch": 4.476190476190476,
|
|
"grad_norm": 0.7230409362049561,
|
|
"learning_rate": 1.6793511124613455e-06,
|
|
"loss": 0.3405,
|
|
"mean_token_accuracy": 0.8779969648458064,
|
|
"num_tokens": 485003773.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"entropy": 0.4019317626953125,
|
|
"epoch": 4.484126984126984,
|
|
"grad_norm": 0.6858561278935235,
|
|
"learning_rate": 1.6630270386365288e-06,
|
|
"loss": 0.3462,
|
|
"mean_token_accuracy": 0.8767383908852935,
|
|
"num_tokens": 485834271.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 0.4033966064453125,
|
|
"epoch": 4.492063492063492,
|
|
"grad_norm": 0.7715463405263099,
|
|
"learning_rate": 1.6467668503931432e-06,
|
|
"loss": 0.3406,
|
|
"mean_token_accuracy": 0.8790650884620845,
|
|
"num_tokens": 486676541.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"entropy": 0.3995513916015625,
|
|
"epoch": 4.5,
|
|
"grad_norm": 0.7299031695508553,
|
|
"learning_rate": 1.6305708590287616e-06,
|
|
"loss": 0.3413,
|
|
"mean_token_accuracy": 0.8776452434249222,
|
|
"num_tokens": 487533902.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"entropy": 0.39752197265625,
|
|
"epoch": 4.507936507936508,
|
|
"grad_norm": 0.7001696842835692,
|
|
"learning_rate": 1.6144393746119208e-06,
|
|
"loss": 0.3468,
|
|
"mean_token_accuracy": 0.8766471082344651,
|
|
"num_tokens": 488403340.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"entropy": 0.3946075439453125,
|
|
"epoch": 4.515873015873016,
|
|
"grad_norm": 0.6949363799298416,
|
|
"learning_rate": 1.5983727059761873e-06,
|
|
"loss": 0.3413,
|
|
"mean_token_accuracy": 0.8782787672244012,
|
|
"num_tokens": 489285650.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"entropy": 0.402496337890625,
|
|
"epoch": 4.523809523809524,
|
|
"grad_norm": 0.6662573552334149,
|
|
"learning_rate": 1.5823711607142428e-06,
|
|
"loss": 0.3448,
|
|
"mean_token_accuracy": 0.876647824421525,
|
|
"num_tokens": 490146251.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 0.3963165283203125,
|
|
"epoch": 4.531746031746032,
|
|
"grad_norm": 0.6722490242200185,
|
|
"learning_rate": 1.5664350451720022e-06,
|
|
"loss": 0.3343,
|
|
"mean_token_accuracy": 0.8809215794317424,
|
|
"num_tokens": 490981639.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"entropy": 0.401947021484375,
|
|
"epoch": 4.5396825396825395,
|
|
"grad_norm": 0.7667827684007154,
|
|
"learning_rate": 1.5505646644427375e-06,
|
|
"loss": 0.3443,
|
|
"mean_token_accuracy": 0.8768278043717146,
|
|
"num_tokens": 491819855.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"entropy": 0.4046630859375,
|
|
"epoch": 4.5476190476190474,
|
|
"grad_norm": 0.7217844340085546,
|
|
"learning_rate": 1.5347603223612462e-06,
|
|
"loss": 0.3453,
|
|
"mean_token_accuracy": 0.8769222623668611,
|
|
"num_tokens": 492664773.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"entropy": 0.396392822265625,
|
|
"epoch": 4.555555555555555,
|
|
"grad_norm": 0.6828293087400851,
|
|
"learning_rate": 1.5190223214980286e-06,
|
|
"loss": 0.3425,
|
|
"mean_token_accuracy": 0.876984007190913,
|
|
"num_tokens": 493538855.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"entropy": 0.3953704833984375,
|
|
"epoch": 4.563492063492063,
|
|
"grad_norm": 0.6985094822557292,
|
|
"learning_rate": 1.5033509631534986e-06,
|
|
"loss": 0.3481,
|
|
"mean_token_accuracy": 0.8754701013676822,
|
|
"num_tokens": 494419834.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 0.40057373046875,
|
|
"epoch": 4.571428571428571,
|
|
"grad_norm": 0.7055750733602428,
|
|
"learning_rate": 1.4877465473522178e-06,
|
|
"loss": 0.3449,
|
|
"mean_token_accuracy": 0.8770850743167102,
|
|
"num_tokens": 495279672.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"entropy": 0.3951416015625,
|
|
"epoch": 4.579365079365079,
|
|
"grad_norm": 0.6964133064600199,
|
|
"learning_rate": 1.4722093728371427e-06,
|
|
"loss": 0.3513,
|
|
"mean_token_accuracy": 0.874992523342371,
|
|
"num_tokens": 496156072.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"entropy": 0.40093994140625,
|
|
"epoch": 4.587301587301587,
|
|
"grad_norm": 0.6585867710192563,
|
|
"learning_rate": 1.4567397370639158e-06,
|
|
"loss": 0.3481,
|
|
"mean_token_accuracy": 0.8771389788016677,
|
|
"num_tokens": 497013976.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"entropy": 0.400543212890625,
|
|
"epoch": 4.595238095238095,
|
|
"grad_norm": 0.6695268179646108,
|
|
"learning_rate": 1.4413379361951596e-06,
|
|
"loss": 0.3424,
|
|
"mean_token_accuracy": 0.8771733501926064,
|
|
"num_tokens": 497869587.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"entropy": 0.4037933349609375,
|
|
"epoch": 4.603174603174603,
|
|
"grad_norm": 0.7406939877102566,
|
|
"learning_rate": 1.4260042650948187e-06,
|
|
"loss": 0.3427,
|
|
"mean_token_accuracy": 0.8756671342998743,
|
|
"num_tokens": 498692858.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 0.3985595703125,
|
|
"epoch": 4.611111111111111,
|
|
"grad_norm": 0.7048045979371886,
|
|
"learning_rate": 1.4107390173225045e-06,
|
|
"loss": 0.3469,
|
|
"mean_token_accuracy": 0.8772797528654337,
|
|
"num_tokens": 499558825.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"entropy": 0.4054718017578125,
|
|
"epoch": 4.619047619047619,
|
|
"grad_norm": 0.6900885423977635,
|
|
"learning_rate": 1.395542485127886e-06,
|
|
"loss": 0.3408,
|
|
"mean_token_accuracy": 0.878491104580462,
|
|
"num_tokens": 500399881.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"entropy": 0.4000701904296875,
|
|
"epoch": 4.6269841269841265,
|
|
"grad_norm": 0.6551912767795579,
|
|
"learning_rate": 1.3804149594450816e-06,
|
|
"loss": 0.3402,
|
|
"mean_token_accuracy": 0.8797827651724219,
|
|
"num_tokens": 501277242.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"entropy": 0.392608642578125,
|
|
"epoch": 4.634920634920634,
|
|
"grad_norm": 0.6872125661896025,
|
|
"learning_rate": 1.365356729887099e-06,
|
|
"loss": 0.3415,
|
|
"mean_token_accuracy": 0.8778949431143701,
|
|
"num_tokens": 502175769.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"entropy": 0.4004364013671875,
|
|
"epoch": 4.642857142857143,
|
|
"grad_norm": 0.6906654407257142,
|
|
"learning_rate": 1.3503680847402868e-06,
|
|
"loss": 0.3375,
|
|
"mean_token_accuracy": 0.879074421711266,
|
|
"num_tokens": 503037907.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 0.3996429443359375,
|
|
"epoch": 4.650793650793651,
|
|
"grad_norm": 0.7240342341344183,
|
|
"learning_rate": 1.3354493109588145e-06,
|
|
"loss": 0.343,
|
|
"mean_token_accuracy": 0.8791590658947825,
|
|
"num_tokens": 503882004.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"entropy": 0.3933563232421875,
|
|
"epoch": 4.658730158730159,
|
|
"grad_norm": 0.6759947923545749,
|
|
"learning_rate": 1.320600694159185e-06,
|
|
"loss": 0.3418,
|
|
"mean_token_accuracy": 0.8785993568599224,
|
|
"num_tokens": 504761280.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"entropy": 0.3975372314453125,
|
|
"epoch": 4.666666666666667,
|
|
"grad_norm": 0.6810332156222548,
|
|
"learning_rate": 1.3058225186147572e-06,
|
|
"loss": 0.3419,
|
|
"mean_token_accuracy": 0.8782242434099317,
|
|
"num_tokens": 505628924.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"entropy": 0.3953857421875,
|
|
"epoch": 4.674603174603175,
|
|
"grad_norm": 0.6952323670957825,
|
|
"learning_rate": 1.2911150672503098e-06,
|
|
"loss": 0.3349,
|
|
"mean_token_accuracy": 0.8792746933177114,
|
|
"num_tokens": 506483264.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"entropy": 0.400146484375,
|
|
"epoch": 4.682539682539683,
|
|
"grad_norm": 0.6615786248003038,
|
|
"learning_rate": 1.2764786216366236e-06,
|
|
"loss": 0.342,
|
|
"mean_token_accuracy": 0.8765605296939611,
|
|
"num_tokens": 507337582.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 0.3936614990234375,
|
|
"epoch": 4.690476190476191,
|
|
"grad_norm": 0.6423682264116367,
|
|
"learning_rate": 1.2619134619850908e-06,
|
|
"loss": 0.3403,
|
|
"mean_token_accuracy": 0.8784746997989714,
|
|
"num_tokens": 508222543.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"entropy": 0.40179443359375,
|
|
"epoch": 4.698412698412699,
|
|
"grad_norm": 0.7179320235597545,
|
|
"learning_rate": 1.2474198671423493e-06,
|
|
"loss": 0.3439,
|
|
"mean_token_accuracy": 0.8781091058626771,
|
|
"num_tokens": 509077470.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"entropy": 0.4020538330078125,
|
|
"epoch": 4.7063492063492065,
|
|
"grad_norm": 0.6640568389501444,
|
|
"learning_rate": 1.2329981145849468e-06,
|
|
"loss": 0.345,
|
|
"mean_token_accuracy": 0.8776707421056926,
|
|
"num_tokens": 509934412.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"entropy": 0.3987274169921875,
|
|
"epoch": 4.714285714285714,
|
|
"grad_norm": 0.6641554979878878,
|
|
"learning_rate": 1.2186484804140242e-06,
|
|
"loss": 0.333,
|
|
"mean_token_accuracy": 0.8802338382229209,
|
|
"num_tokens": 510796655.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"entropy": 0.3936614990234375,
|
|
"epoch": 4.722222222222222,
|
|
"grad_norm": 0.7311800509678725,
|
|
"learning_rate": 1.2043712393500355e-06,
|
|
"loss": 0.3465,
|
|
"mean_token_accuracy": 0.876534974668175,
|
|
"num_tokens": 511666478.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 0.3982086181640625,
|
|
"epoch": 4.73015873015873,
|
|
"grad_norm": 0.654011664415763,
|
|
"learning_rate": 1.1901666647274823e-06,
|
|
"loss": 0.336,
|
|
"mean_token_accuracy": 0.8799294792115688,
|
|
"num_tokens": 512547249.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"entropy": 0.4018096923828125,
|
|
"epoch": 4.738095238095238,
|
|
"grad_norm": 0.7906669325474568,
|
|
"learning_rate": 1.1760350284896876e-06,
|
|
"loss": 0.3423,
|
|
"mean_token_accuracy": 0.8780363285914063,
|
|
"num_tokens": 513406924.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"entropy": 0.394073486328125,
|
|
"epoch": 4.746031746031746,
|
|
"grad_norm": 0.6272944465027679,
|
|
"learning_rate": 1.1619766011835832e-06,
|
|
"loss": 0.3351,
|
|
"mean_token_accuracy": 0.8792783697135746,
|
|
"num_tokens": 514278305.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"entropy": 0.3991546630859375,
|
|
"epoch": 4.753968253968254,
|
|
"grad_norm": 0.7028608754920164,
|
|
"learning_rate": 1.1479916519545326e-06,
|
|
"loss": 0.3381,
|
|
"mean_token_accuracy": 0.8802824383601546,
|
|
"num_tokens": 515127083.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"entropy": 0.3970489501953125,
|
|
"epoch": 4.761904761904762,
|
|
"grad_norm": 0.7415718014919481,
|
|
"learning_rate": 1.1340804485411783e-06,
|
|
"loss": 0.3494,
|
|
"mean_token_accuracy": 0.8775360365398228,
|
|
"num_tokens": 515982781.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 0.400482177734375,
|
|
"epoch": 4.76984126984127,
|
|
"grad_norm": 0.6506891630519459,
|
|
"learning_rate": 1.1202432572703176e-06,
|
|
"loss": 0.3348,
|
|
"mean_token_accuracy": 0.879584884736687,
|
|
"num_tokens": 516838578.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"entropy": 0.398193359375,
|
|
"epoch": 4.777777777777778,
|
|
"grad_norm": 0.6609399081506822,
|
|
"learning_rate": 1.1064803430518002e-06,
|
|
"loss": 0.3403,
|
|
"mean_token_accuracy": 0.8773757833987474,
|
|
"num_tokens": 517695973.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"entropy": 0.3961334228515625,
|
|
"epoch": 4.785714285714286,
|
|
"grad_norm": 0.6536270048247466,
|
|
"learning_rate": 1.0927919693734618e-06,
|
|
"loss": 0.3403,
|
|
"mean_token_accuracy": 0.8781040622852743,
|
|
"num_tokens": 518570319.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"entropy": 0.398956298828125,
|
|
"epoch": 4.7936507936507935,
|
|
"grad_norm": 0.6661437239536121,
|
|
"learning_rate": 1.0791783982960736e-06,
|
|
"loss": 0.3429,
|
|
"mean_token_accuracy": 0.8768606032244861,
|
|
"num_tokens": 519417148.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"entropy": 0.399383544921875,
|
|
"epoch": 4.801587301587301,
|
|
"grad_norm": 0.6697036401243884,
|
|
"learning_rate": 1.0656398904483312e-06,
|
|
"loss": 0.3459,
|
|
"mean_token_accuracy": 0.8781998874619603,
|
|
"num_tokens": 520284498.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 0.3967742919921875,
|
|
"epoch": 4.809523809523809,
|
|
"grad_norm": 0.6448494446348442,
|
|
"learning_rate": 1.0521767050218562e-06,
|
|
"loss": 0.3453,
|
|
"mean_token_accuracy": 0.8755287849344313,
|
|
"num_tokens": 521161180.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"entropy": 0.4019927978515625,
|
|
"epoch": 4.817460317460317,
|
|
"grad_norm": 0.695391933649051,
|
|
"learning_rate": 1.0387890997662443e-06,
|
|
"loss": 0.3338,
|
|
"mean_token_accuracy": 0.8791229757480323,
|
|
"num_tokens": 522018351.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"entropy": 0.396759033203125,
|
|
"epoch": 4.825396825396825,
|
|
"grad_norm": 0.6885741043618135,
|
|
"learning_rate": 1.0254773309841277e-06,
|
|
"loss": 0.3452,
|
|
"mean_token_accuracy": 0.8766398807056248,
|
|
"num_tokens": 522908445.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"entropy": 0.4028167724609375,
|
|
"epoch": 4.833333333333333,
|
|
"grad_norm": 0.6824971060967869,
|
|
"learning_rate": 1.012241653526263e-06,
|
|
"loss": 0.3381,
|
|
"mean_token_accuracy": 0.8785937232896686,
|
|
"num_tokens": 523761885.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"entropy": 0.3957977294921875,
|
|
"epoch": 4.841269841269841,
|
|
"grad_norm": 0.6556820486631832,
|
|
"learning_rate": 9.990823207866578e-07,
|
|
"loss": 0.3431,
|
|
"mean_token_accuracy": 0.8786509921774268,
|
|
"num_tokens": 524634392.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 0.3975067138671875,
|
|
"epoch": 4.849206349206349,
|
|
"grad_norm": 0.6634339315868009,
|
|
"learning_rate": 9.85999584697716e-07,
|
|
"loss": 0.3458,
|
|
"mean_token_accuracy": 0.8759105852805078,
|
|
"num_tokens": 525481711.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"entropy": 0.3944549560546875,
|
|
"epoch": 4.857142857142857,
|
|
"grad_norm": 0.6633851252480073,
|
|
"learning_rate": 9.729936957254165e-07,
|
|
"loss": 0.3348,
|
|
"mean_token_accuracy": 0.8805793649517,
|
|
"num_tokens": 526350562.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"entropy": 0.3972015380859375,
|
|
"epoch": 4.865079365079366,
|
|
"grad_norm": 0.6809718096324041,
|
|
"learning_rate": 9.600649028645215e-07,
|
|
"loss": 0.3411,
|
|
"mean_token_accuracy": 0.877831466961652,
|
|
"num_tokens": 527208722.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"entropy": 0.3993377685546875,
|
|
"epoch": 4.8730158730158735,
|
|
"grad_norm": 0.7413910925997623,
|
|
"learning_rate": 9.472134536338007e-07,
|
|
"loss": 0.3348,
|
|
"mean_token_accuracy": 0.8798928018659353,
|
|
"num_tokens": 528070537.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"entropy": 0.3999786376953125,
|
|
"epoch": 4.880952380952381,
|
|
"grad_norm": 0.6404430937655207,
|
|
"learning_rate": 9.344395940713009e-07,
|
|
"loss": 0.3482,
|
|
"mean_token_accuracy": 0.8766381270252168,
|
|
"num_tokens": 528925481.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 0.4001312255859375,
|
|
"epoch": 4.888888888888889,
|
|
"grad_norm": 0.642942870387289,
|
|
"learning_rate": 9.217435687296305e-07,
|
|
"loss": 0.3388,
|
|
"mean_token_accuracy": 0.8791137794032693,
|
|
"num_tokens": 529762293.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"entropy": 0.3968963623046875,
|
|
"epoch": 4.896825396825397,
|
|
"grad_norm": 0.6411974157903991,
|
|
"learning_rate": 9.091256206712812e-07,
|
|
"loss": 0.3398,
|
|
"mean_token_accuracy": 0.8778149662539363,
|
|
"num_tokens": 530625692.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"entropy": 0.397918701171875,
|
|
"epoch": 4.904761904761905,
|
|
"grad_norm": 0.6795080852701798,
|
|
"learning_rate": 8.965859914639724e-07,
|
|
"loss": 0.3458,
|
|
"mean_token_accuracy": 0.8769173468463123,
|
|
"num_tokens": 531481363.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"entropy": 0.399322509765625,
|
|
"epoch": 4.912698412698413,
|
|
"grad_norm": 0.7268654576820524,
|
|
"learning_rate": 8.841249211760272e-07,
|
|
"loss": 0.3401,
|
|
"mean_token_accuracy": 0.8781247353181243,
|
|
"num_tokens": 532334443.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"entropy": 0.39544677734375,
|
|
"epoch": 4.920634920634921,
|
|
"grad_norm": 0.6991042720171687,
|
|
"learning_rate": 8.717426483717762e-07,
|
|
"loss": 0.3474,
|
|
"mean_token_accuracy": 0.8754720254801214,
|
|
"num_tokens": 533215439.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 0.39813232421875,
|
|
"epoch": 4.928571428571429,
|
|
"grad_norm": 0.6420905529792567,
|
|
"learning_rate": 8.594394101069897e-07,
|
|
"loss": 0.3449,
|
|
"mean_token_accuracy": 0.876462968531996,
|
|
"num_tokens": 534086645.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"entropy": 0.3993988037109375,
|
|
"epoch": 4.936507936507937,
|
|
"grad_norm": 0.634959288865702,
|
|
"learning_rate": 8.472154419243411e-07,
|
|
"loss": 0.3422,
|
|
"mean_token_accuracy": 0.8784619648940861,
|
|
"num_tokens": 534968024.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"entropy": 0.3939361572265625,
|
|
"epoch": 4.944444444444445,
|
|
"grad_norm": 0.6525948648538159,
|
|
"learning_rate": 8.350709778488941e-07,
|
|
"loss": 0.3433,
|
|
"mean_token_accuracy": 0.878953296225518,
|
|
"num_tokens": 535858097.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"entropy": 0.3915557861328125,
|
|
"epoch": 4.9523809523809526,
|
|
"grad_norm": 0.6338378330558326,
|
|
"learning_rate": 8.230062503836278e-07,
|
|
"loss": 0.3403,
|
|
"mean_token_accuracy": 0.8782070642337203,
|
|
"num_tokens": 536754856.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"entropy": 0.3979949951171875,
|
|
"epoch": 4.9603174603174605,
|
|
"grad_norm": 0.6515049678243575,
|
|
"learning_rate": 8.110214905049802e-07,
|
|
"loss": 0.3447,
|
|
"mean_token_accuracy": 0.8780298097990453,
|
|
"num_tokens": 537612880.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 0.39752197265625,
|
|
"epoch": 4.968253968253968,
|
|
"grad_norm": 0.6537688062696074,
|
|
"learning_rate": 7.991169276584281e-07,
|
|
"loss": 0.3383,
|
|
"mean_token_accuracy": 0.8791689327917993,
|
|
"num_tokens": 538459827.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"entropy": 0.4014129638671875,
|
|
"epoch": 4.976190476190476,
|
|
"grad_norm": 0.6761627263761556,
|
|
"learning_rate": 7.872927897540944e-07,
|
|
"loss": 0.3349,
|
|
"mean_token_accuracy": 0.8803192311897874,
|
|
"num_tokens": 539280570.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"entropy": 0.4014129638671875,
|
|
"epoch": 4.984126984126984,
|
|
"grad_norm": 0.6783420727462287,
|
|
"learning_rate": 7.75549303162384e-07,
|
|
"loss": 0.3441,
|
|
"mean_token_accuracy": 0.8776299306191504,
|
|
"num_tokens": 540103344.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"entropy": 0.3983917236328125,
|
|
"epoch": 4.992063492063492,
|
|
"grad_norm": 0.6003292581738014,
|
|
"learning_rate": 7.638866927096555e-07,
|
|
"loss": 0.3384,
|
|
"mean_token_accuracy": 0.8786575449630618,
|
|
"num_tokens": 540983421.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"entropy": 0.3915557861328125,
|
|
"epoch": 5.0,
|
|
"grad_norm": 0.6132257577458607,
|
|
"learning_rate": 7.523051816739074e-07,
|
|
"loss": 0.3407,
|
|
"mean_token_accuracy": 0.8787228460423648,
|
|
"num_tokens": 541874593.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 0.3969573974609375,
|
|
"epoch": 5.007936507936508,
|
|
"grad_norm": 0.6874008020158511,
|
|
"learning_rate": 7.408049917805104e-07,
|
|
"loss": 0.3324,
|
|
"mean_token_accuracy": 0.8809576267376542,
|
|
"num_tokens": 542746900.0,
|
|
"step": 631
|
|
},
|
|
{
|
|
"entropy": 0.3990631103515625,
|
|
"epoch": 5.015873015873016,
|
|
"grad_norm": 0.6533341249855773,
|
|
"learning_rate": 7.293863431979619e-07,
|
|
"loss": 0.3325,
|
|
"mean_token_accuracy": 0.8817571788094938,
|
|
"num_tokens": 543607351.0,
|
|
"step": 632
|
|
},
|
|
{
|
|
"entropy": 0.4006195068359375,
|
|
"epoch": 5.023809523809524,
|
|
"grad_norm": 0.6743032453873651,
|
|
"learning_rate": 7.180494545336642e-07,
|
|
"loss": 0.3252,
|
|
"mean_token_accuracy": 0.8821277469396591,
|
|
"num_tokens": 544429814.0,
|
|
"step": 633
|
|
},
|
|
{
|
|
"entropy": 0.392242431640625,
|
|
"epoch": 5.031746031746032,
|
|
"grad_norm": 0.6491407357261534,
|
|
"learning_rate": 7.067945428297524e-07,
|
|
"loss": 0.329,
|
|
"mean_token_accuracy": 0.8841568692587316,
|
|
"num_tokens": 545293765.0,
|
|
"step": 634
|
|
},
|
|
{
|
|
"entropy": 0.392059326171875,
|
|
"epoch": 5.0396825396825395,
|
|
"grad_norm": 0.6509793206709636,
|
|
"learning_rate": 6.956218235589263e-07,
|
|
"loss": 0.327,
|
|
"mean_token_accuracy": 0.8831868241541088,
|
|
"num_tokens": 546166241.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 0.394683837890625,
|
|
"epoch": 5.0476190476190474,
|
|
"grad_norm": 0.6965451021310177,
|
|
"learning_rate": 6.845315106203327e-07,
|
|
"loss": 0.3192,
|
|
"mean_token_accuracy": 0.8857454406097531,
|
|
"num_tokens": 547008292.0,
|
|
"step": 636
|
|
},
|
|
{
|
|
"entropy": 0.3927764892578125,
|
|
"epoch": 5.055555555555555,
|
|
"grad_norm": 0.7299760861326686,
|
|
"learning_rate": 6.735238163354669e-07,
|
|
"loss": 0.329,
|
|
"mean_token_accuracy": 0.8830623761750758,
|
|
"num_tokens": 547881345.0,
|
|
"step": 637
|
|
},
|
|
{
|
|
"entropy": 0.393829345703125,
|
|
"epoch": 5.063492063492063,
|
|
"grad_norm": 0.692591071370849,
|
|
"learning_rate": 6.625989514441089e-07,
|
|
"loss": 0.3263,
|
|
"mean_token_accuracy": 0.8835366195999086,
|
|
"num_tokens": 548753550.0,
|
|
"step": 638
|
|
},
|
|
{
|
|
"entropy": 0.3939666748046875,
|
|
"epoch": 5.071428571428571,
|
|
"grad_norm": 0.6832435052031313,
|
|
"learning_rate": 6.517571251002896e-07,
|
|
"loss": 0.3274,
|
|
"mean_token_accuracy": 0.8831272819079459,
|
|
"num_tokens": 549614624.0,
|
|
"step": 639
|
|
},
|
|
{
|
|
"entropy": 0.39398193359375,
|
|
"epoch": 5.079365079365079,
|
|
"grad_norm": 0.6839637384434816,
|
|
"learning_rate": 6.40998544868287e-07,
|
|
"loss": 0.3168,
|
|
"mean_token_accuracy": 0.8865264924243093,
|
|
"num_tokens": 550450538.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 0.3957672119140625,
|
|
"epoch": 5.087301587301587,
|
|
"grad_norm": 0.6628392349359786,
|
|
"learning_rate": 6.3032341671865e-07,
|
|
"loss": 0.3251,
|
|
"mean_token_accuracy": 0.8840126856230199,
|
|
"num_tokens": 551311533.0,
|
|
"step": 641
|
|
},
|
|
{
|
|
"entropy": 0.3985137939453125,
|
|
"epoch": 5.095238095238095,
|
|
"grad_norm": 0.6552020896616739,
|
|
"learning_rate": 6.197319450242562e-07,
|
|
"loss": 0.3219,
|
|
"mean_token_accuracy": 0.8848442859016359,
|
|
"num_tokens": 552168363.0,
|
|
"step": 642
|
|
},
|
|
{
|
|
"entropy": 0.3944549560546875,
|
|
"epoch": 5.103174603174603,
|
|
"grad_norm": 0.6459530624061904,
|
|
"learning_rate": 6.092243325564007e-07,
|
|
"loss": 0.3254,
|
|
"mean_token_accuracy": 0.885173340793699,
|
|
"num_tokens": 553049237.0,
|
|
"step": 643
|
|
},
|
|
{
|
|
"entropy": 0.396759033203125,
|
|
"epoch": 5.111111111111111,
|
|
"grad_norm": 0.6485251490072942,
|
|
"learning_rate": 5.98800780480912e-07,
|
|
"loss": 0.3345,
|
|
"mean_token_accuracy": 0.8795099183917046,
|
|
"num_tokens": 553918684.0,
|
|
"step": 644
|
|
},
|
|
{
|
|
"entropy": 0.3992462158203125,
|
|
"epoch": 5.119047619047619,
|
|
"grad_norm": 0.682884688994534,
|
|
"learning_rate": 5.884614883543027e-07,
|
|
"loss": 0.3294,
|
|
"mean_token_accuracy": 0.8827598677016795,
|
|
"num_tokens": 554768021.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 0.3957366943359375,
|
|
"epoch": 5.1269841269841265,
|
|
"grad_norm": 0.6622701991206025,
|
|
"learning_rate": 5.782066541199471e-07,
|
|
"loss": 0.3201,
|
|
"mean_token_accuracy": 0.8856850513257086,
|
|
"num_tokens": 555611362.0,
|
|
"step": 646
|
|
},
|
|
{
|
|
"entropy": 0.397430419921875,
|
|
"epoch": 5.134920634920635,
|
|
"grad_norm": 0.6494881440989898,
|
|
"learning_rate": 5.680364741042926e-07,
|
|
"loss": 0.3308,
|
|
"mean_token_accuracy": 0.8822413263842463,
|
|
"num_tokens": 556476117.0,
|
|
"step": 647
|
|
},
|
|
{
|
|
"entropy": 0.39849853515625,
|
|
"epoch": 5.142857142857143,
|
|
"grad_norm": 0.6399277286004064,
|
|
"learning_rate": 5.579511430131018e-07,
|
|
"loss": 0.3262,
|
|
"mean_token_accuracy": 0.8843574924394488,
|
|
"num_tokens": 557321161.0,
|
|
"step": 648
|
|
},
|
|
{
|
|
"entropy": 0.3938751220703125,
|
|
"epoch": 5.150793650793651,
|
|
"grad_norm": 0.6414200561803504,
|
|
"learning_rate": 5.479508539277229e-07,
|
|
"loss": 0.3262,
|
|
"mean_token_accuracy": 0.8831641948781908,
|
|
"num_tokens": 558195818.0,
|
|
"step": 649
|
|
},
|
|
{
|
|
"entropy": 0.3946533203125,
|
|
"epoch": 5.158730158730159,
|
|
"grad_norm": 0.6400500966934808,
|
|
"learning_rate": 5.380357983013962e-07,
|
|
"loss": 0.3247,
|
|
"mean_token_accuracy": 0.8846798562444746,
|
|
"num_tokens": 559060077.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 0.3952484130859375,
|
|
"epoch": 5.166666666666667,
|
|
"grad_norm": 0.665030112678579,
|
|
"learning_rate": 5.282061659555854e-07,
|
|
"loss": 0.3306,
|
|
"mean_token_accuracy": 0.8817098373547196,
|
|
"num_tokens": 559919625.0,
|
|
"step": 651
|
|
},
|
|
{
|
|
"entropy": 0.3953704833984375,
|
|
"epoch": 5.174603174603175,
|
|
"grad_norm": 0.6442855912711039,
|
|
"learning_rate": 5.184621450763455e-07,
|
|
"loss": 0.3286,
|
|
"mean_token_accuracy": 0.8841330683790147,
|
|
"num_tokens": 560767619.0,
|
|
"step": 652
|
|
},
|
|
{
|
|
"entropy": 0.3937225341796875,
|
|
"epoch": 5.182539682539683,
|
|
"grad_norm": 0.711237148073599,
|
|
"learning_rate": 5.088039222107205e-07,
|
|
"loss": 0.3317,
|
|
"mean_token_accuracy": 0.8824787489138544,
|
|
"num_tokens": 561614252.0,
|
|
"step": 653
|
|
},
|
|
{
|
|
"entropy": 0.3915252685546875,
|
|
"epoch": 5.190476190476191,
|
|
"grad_norm": 0.664655406106165,
|
|
"learning_rate": 4.992316822631693e-07,
|
|
"loss": 0.3247,
|
|
"mean_token_accuracy": 0.883714787196368,
|
|
"num_tokens": 562479272.0,
|
|
"step": 654
|
|
},
|
|
{
|
|
"entropy": 0.3939361572265625,
|
|
"epoch": 5.198412698412699,
|
|
"grad_norm": 0.6423893808843507,
|
|
"learning_rate": 4.897456084920282e-07,
|
|
"loss": 0.3233,
|
|
"mean_token_accuracy": 0.8836758630350232,
|
|
"num_tokens": 563325903.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 0.391815185546875,
|
|
"epoch": 5.2063492063492065,
|
|
"grad_norm": 0.609576579332021,
|
|
"learning_rate": 4.803458825060042e-07,
|
|
"loss": 0.3234,
|
|
"mean_token_accuracy": 0.8831925024278462,
|
|
"num_tokens": 564204898.0,
|
|
"step": 656
|
|
},
|
|
{
|
|
"entropy": 0.3961944580078125,
|
|
"epoch": 5.214285714285714,
|
|
"grad_norm": 0.6355056690611225,
|
|
"learning_rate": 4.710326842606927e-07,
|
|
"loss": 0.3209,
|
|
"mean_token_accuracy": 0.8843817953020334,
|
|
"num_tokens": 565054657.0,
|
|
"step": 657
|
|
},
|
|
{
|
|
"entropy": 0.3915863037109375,
|
|
"epoch": 5.222222222222222,
|
|
"grad_norm": 0.6715411599255142,
|
|
"learning_rate": 4.618061920551381e-07,
|
|
"loss": 0.3268,
|
|
"mean_token_accuracy": 0.8825240274891257,
|
|
"num_tokens": 565926348.0,
|
|
"step": 658
|
|
},
|
|
{
|
|
"entropy": 0.3952484130859375,
|
|
"epoch": 5.23015873015873,
|
|
"grad_norm": 0.6474113419989475,
|
|
"learning_rate": 4.526665825284132e-07,
|
|
"loss": 0.3344,
|
|
"mean_token_accuracy": 0.8826401890255511,
|
|
"num_tokens": 566799563.0,
|
|
"step": 659
|
|
},
|
|
{
|
|
"entropy": 0.393402099609375,
|
|
"epoch": 5.238095238095238,
|
|
"grad_norm": 0.6520320734154383,
|
|
"learning_rate": 4.4361403065624475e-07,
|
|
"loss": 0.3283,
|
|
"mean_token_accuracy": 0.881974630523473,
|
|
"num_tokens": 567686904.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 0.3928680419921875,
|
|
"epoch": 5.246031746031746,
|
|
"grad_norm": 0.6254784285922858,
|
|
"learning_rate": 4.3464870974766314e-07,
|
|
"loss": 0.3299,
|
|
"mean_token_accuracy": 0.882287971675396,
|
|
"num_tokens": 568563817.0,
|
|
"step": 661
|
|
},
|
|
{
|
|
"entropy": 0.396759033203125,
|
|
"epoch": 5.253968253968254,
|
|
"grad_norm": 0.6524894750875436,
|
|
"learning_rate": 4.257707914416781e-07,
|
|
"loss": 0.319,
|
|
"mean_token_accuracy": 0.8853690237738192,
|
|
"num_tokens": 569412950.0,
|
|
"step": 662
|
|
},
|
|
{
|
|
"entropy": 0.390960693359375,
|
|
"epoch": 5.261904761904762,
|
|
"grad_norm": 0.6550596337749973,
|
|
"learning_rate": 4.169804457039972e-07,
|
|
"loss": 0.3281,
|
|
"mean_token_accuracy": 0.8837977671064436,
|
|
"num_tokens": 570290370.0,
|
|
"step": 663
|
|
},
|
|
{
|
|
"entropy": 0.39520263671875,
|
|
"epoch": 5.26984126984127,
|
|
"grad_norm": 0.6599640261368089,
|
|
"learning_rate": 4.082778408237731e-07,
|
|
"loss": 0.3312,
|
|
"mean_token_accuracy": 0.8819447602145374,
|
|
"num_tokens": 571139089.0,
|
|
"step": 664
|
|
},
|
|
{
|
|
"entropy": 0.3977203369140625,
|
|
"epoch": 5.277777777777778,
|
|
"grad_norm": 0.6325202825006885,
|
|
"learning_rate": 3.996631434103776e-07,
|
|
"loss": 0.3216,
|
|
"mean_token_accuracy": 0.8842552327550948,
|
|
"num_tokens": 571974486.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 0.3957366943359375,
|
|
"epoch": 5.285714285714286,
|
|
"grad_norm": 0.629795019163263,
|
|
"learning_rate": 3.911365183902166e-07,
|
|
"loss": 0.3244,
|
|
"mean_token_accuracy": 0.883813981898129,
|
|
"num_tokens": 572833941.0,
|
|
"step": 666
|
|
},
|
|
{
|
|
"entropy": 0.393707275390625,
|
|
"epoch": 5.2936507936507935,
|
|
"grad_norm": 0.6615713948839467,
|
|
"learning_rate": 3.826981290035692e-07,
|
|
"loss": 0.3358,
|
|
"mean_token_accuracy": 0.8800787003710866,
|
|
"num_tokens": 573696025.0,
|
|
"step": 667
|
|
},
|
|
{
|
|
"entropy": 0.3954620361328125,
|
|
"epoch": 5.301587301587301,
|
|
"grad_norm": 0.6448584761089627,
|
|
"learning_rate": 3.7434813680146234e-07,
|
|
"loss": 0.3258,
|
|
"mean_token_accuracy": 0.8832776751369238,
|
|
"num_tokens": 574541353.0,
|
|
"step": 668
|
|
},
|
|
{
|
|
"entropy": 0.3994140625,
|
|
"epoch": 5.309523809523809,
|
|
"grad_norm": 0.6253174175232413,
|
|
"learning_rate": 3.6608670164258065e-07,
|
|
"loss": 0.328,
|
|
"mean_token_accuracy": 0.8827647585421801,
|
|
"num_tokens": 575378417.0,
|
|
"step": 669
|
|
},
|
|
{
|
|
"entropy": 0.397003173828125,
|
|
"epoch": 5.317460317460317,
|
|
"grad_norm": 0.6543163664275344,
|
|
"learning_rate": 3.5791398169020384e-07,
|
|
"loss": 0.3223,
|
|
"mean_token_accuracy": 0.8838555999100208,
|
|
"num_tokens": 576216763.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 0.3980865478515625,
|
|
"epoch": 5.325396825396825,
|
|
"grad_norm": 0.6727975870724803,
|
|
"learning_rate": 3.4983013340918024e-07,
|
|
"loss": 0.3319,
|
|
"mean_token_accuracy": 0.881680119317025,
|
|
"num_tokens": 577068285.0,
|
|
"step": 671
|
|
},
|
|
{
|
|
"entropy": 0.397216796875,
|
|
"epoch": 5.333333333333333,
|
|
"grad_norm": 0.6908497952151735,
|
|
"learning_rate": 3.4183531156292913e-07,
|
|
"loss": 0.3199,
|
|
"mean_token_accuracy": 0.8852394479326904,
|
|
"num_tokens": 577910888.0,
|
|
"step": 672
|
|
},
|
|
{
|
|
"entropy": 0.3982391357421875,
|
|
"epoch": 5.341269841269841,
|
|
"grad_norm": 0.6271777851924285,
|
|
"learning_rate": 3.3392966921047984e-07,
|
|
"loss": 0.3323,
|
|
"mean_token_accuracy": 0.8813259471207857,
|
|
"num_tokens": 578760857.0,
|
|
"step": 673
|
|
},
|
|
{
|
|
"entropy": 0.39141845703125,
|
|
"epoch": 5.349206349206349,
|
|
"grad_norm": 0.6420853255417144,
|
|
"learning_rate": 3.261133577035408e-07,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8815803048200905,
|
|
"num_tokens": 579639850.0,
|
|
"step": 674
|
|
},
|
|
{
|
|
"entropy": 0.39239501953125,
|
|
"epoch": 5.357142857142857,
|
|
"grad_norm": 0.6400029343318571,
|
|
"learning_rate": 3.1838652668360173e-07,
|
|
"loss": 0.3208,
|
|
"mean_token_accuracy": 0.8846969213336706,
|
|
"num_tokens": 580506527.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 0.4005584716796875,
|
|
"epoch": 5.365079365079365,
|
|
"grad_norm": 0.6337302117382984,
|
|
"learning_rate": 3.1074932407906823e-07,
|
|
"loss": 0.3313,
|
|
"mean_token_accuracy": 0.8819033140316606,
|
|
"num_tokens": 581347475.0,
|
|
"step": 676
|
|
},
|
|
{
|
|
"entropy": 0.39678955078125,
|
|
"epoch": 5.3730158730158735,
|
|
"grad_norm": 0.6531503190443603,
|
|
"learning_rate": 3.0320189610243303e-07,
|
|
"loss": 0.3226,
|
|
"mean_token_accuracy": 0.8830904331989586,
|
|
"num_tokens": 582201245.0,
|
|
"step": 677
|
|
},
|
|
{
|
|
"entropy": 0.39251708984375,
|
|
"epoch": 5.380952380952381,
|
|
"grad_norm": 0.6298282608886412,
|
|
"learning_rate": 2.957443872474713e-07,
|
|
"loss": 0.3249,
|
|
"mean_token_accuracy": 0.8838722719810903,
|
|
"num_tokens": 583076088.0,
|
|
"step": 678
|
|
},
|
|
{
|
|
"entropy": 0.3914947509765625,
|
|
"epoch": 5.388888888888889,
|
|
"grad_norm": 0.6217255913819979,
|
|
"learning_rate": 2.883769402864789e-07,
|
|
"loss": 0.3235,
|
|
"mean_token_accuracy": 0.8837307607755065,
|
|
"num_tokens": 583938273.0,
|
|
"step": 679
|
|
},
|
|
{
|
|
"entropy": 0.3894500732421875,
|
|
"epoch": 5.396825396825397,
|
|
"grad_norm": 0.9295484786245685,
|
|
"learning_rate": 2.810996962675361e-07,
|
|
"loss": 0.3289,
|
|
"mean_token_accuracy": 0.8828169428743422,
|
|
"num_tokens": 584828853.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 0.39373779296875,
|
|
"epoch": 5.404761904761905,
|
|
"grad_norm": 0.6460727447308977,
|
|
"learning_rate": 2.739127945118092e-07,
|
|
"loss": 0.3332,
|
|
"mean_token_accuracy": 0.8817042661830783,
|
|
"num_tokens": 585681013.0,
|
|
"step": 681
|
|
},
|
|
{
|
|
"entropy": 0.3941650390625,
|
|
"epoch": 5.412698412698413,
|
|
"grad_norm": 0.6371402076274487,
|
|
"learning_rate": 2.668163726108841e-07,
|
|
"loss": 0.3294,
|
|
"mean_token_accuracy": 0.8820854951627553,
|
|
"num_tokens": 586567674.0,
|
|
"step": 682
|
|
},
|
|
{
|
|
"entropy": 0.3912506103515625,
|
|
"epoch": 5.420634920634921,
|
|
"grad_norm": 0.6599817270342999,
|
|
"learning_rate": 2.5981056642412796e-07,
|
|
"loss": 0.3274,
|
|
"mean_token_accuracy": 0.884115984197706,
|
|
"num_tokens": 587445470.0,
|
|
"step": 683
|
|
},
|
|
{
|
|
"entropy": 0.3984527587890625,
|
|
"epoch": 5.428571428571429,
|
|
"grad_norm": 0.6190960844327497,
|
|
"learning_rate": 2.528955100760938e-07,
|
|
"loss": 0.3225,
|
|
"mean_token_accuracy": 0.8846618658863008,
|
|
"num_tokens": 588268783.0,
|
|
"step": 684
|
|
},
|
|
{
|
|
"entropy": 0.396881103515625,
|
|
"epoch": 5.436507936507937,
|
|
"grad_norm": 0.6277108055718937,
|
|
"learning_rate": 2.460713359539474e-07,
|
|
"loss": 0.3247,
|
|
"mean_token_accuracy": 0.8857448240742087,
|
|
"num_tokens": 589106246.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 0.393524169921875,
|
|
"epoch": 5.444444444444445,
|
|
"grad_norm": 0.6346008314831126,
|
|
"learning_rate": 2.3933817470493445e-07,
|
|
"loss": 0.319,
|
|
"mean_token_accuracy": 0.8859792477451265,
|
|
"num_tokens": 589927765.0,
|
|
"step": 686
|
|
},
|
|
{
|
|
"entropy": 0.396209716796875,
|
|
"epoch": 5.4523809523809526,
|
|
"grad_norm": 0.6418782134867133,
|
|
"learning_rate": 2.3269615523388355e-07,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8844058201648295,
|
|
"num_tokens": 590768483.0,
|
|
"step": 687
|
|
},
|
|
{
|
|
"entropy": 0.393310546875,
|
|
"epoch": 5.4603174603174605,
|
|
"grad_norm": 0.6145408122390106,
|
|
"learning_rate": 2.2614540470073276e-07,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8833085368387401,
|
|
"num_tokens": 591653011.0,
|
|
"step": 688
|
|
},
|
|
{
|
|
"entropy": 0.391082763671875,
|
|
"epoch": 5.468253968253968,
|
|
"grad_norm": 0.6290181808829338,
|
|
"learning_rate": 2.1968604851809738e-07,
|
|
"loss": 0.3344,
|
|
"mean_token_accuracy": 0.8824751214124262,
|
|
"num_tokens": 592555939.0,
|
|
"step": 689
|
|
},
|
|
{
|
|
"entropy": 0.393218994140625,
|
|
"epoch": 5.476190476190476,
|
|
"grad_norm": 0.6052791805512318,
|
|
"learning_rate": 2.1331821034886846e-07,
|
|
"loss": 0.3268,
|
|
"mean_token_accuracy": 0.8844651393592358,
|
|
"num_tokens": 593422295.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 0.3939666748046875,
|
|
"epoch": 5.484126984126984,
|
|
"grad_norm": 0.6765361632797722,
|
|
"learning_rate": 2.0704201210384634e-07,
|
|
"loss": 0.3294,
|
|
"mean_token_accuracy": 0.8817935772240162,
|
|
"num_tokens": 594275578.0,
|
|
"step": 691
|
|
},
|
|
{
|
|
"entropy": 0.3902130126953125,
|
|
"epoch": 5.492063492063492,
|
|
"grad_norm": 0.6558765212307175,
|
|
"learning_rate": 2.0085757393940586e-07,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8839560803025961,
|
|
"num_tokens": 595158105.0,
|
|
"step": 692
|
|
},
|
|
{
|
|
"entropy": 0.3959808349609375,
|
|
"epoch": 5.5,
|
|
"grad_norm": 0.6728354108367586,
|
|
"learning_rate": 1.9476501425519656e-07,
|
|
"loss": 0.3314,
|
|
"mean_token_accuracy": 0.8821393130347133,
|
|
"num_tokens": 596012207.0,
|
|
"step": 693
|
|
},
|
|
{
|
|
"entropy": 0.3945465087890625,
|
|
"epoch": 5.507936507936508,
|
|
"grad_norm": 0.6161700876601213,
|
|
"learning_rate": 1.8876444969187557e-07,
|
|
"loss": 0.3252,
|
|
"mean_token_accuracy": 0.884291214402765,
|
|
"num_tokens": 596867579.0,
|
|
"step": 694
|
|
},
|
|
{
|
|
"entropy": 0.3921051025390625,
|
|
"epoch": 5.515873015873016,
|
|
"grad_norm": 0.6659432185652765,
|
|
"learning_rate": 1.828559951288733e-07,
|
|
"loss": 0.3294,
|
|
"mean_token_accuracy": 0.8830416211858392,
|
|
"num_tokens": 597729003.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 0.3937225341796875,
|
|
"epoch": 5.523809523809524,
|
|
"grad_norm": 0.6337625434766239,
|
|
"learning_rate": 1.7703976368219633e-07,
|
|
"loss": 0.3387,
|
|
"mean_token_accuracy": 0.8793549695983529,
|
|
"num_tokens": 598610243.0,
|
|
"step": 696
|
|
},
|
|
{
|
|
"entropy": 0.398223876953125,
|
|
"epoch": 5.531746031746032,
|
|
"grad_norm": 0.6425672701806616,
|
|
"learning_rate": 1.713158667022613e-07,
|
|
"loss": 0.3282,
|
|
"mean_token_accuracy": 0.8831517458893359,
|
|
"num_tokens": 599468184.0,
|
|
"step": 697
|
|
},
|
|
{
|
|
"entropy": 0.390777587890625,
|
|
"epoch": 5.5396825396825395,
|
|
"grad_norm": 0.6588868479383553,
|
|
"learning_rate": 1.656844137717617e-07,
|
|
"loss": 0.3241,
|
|
"mean_token_accuracy": 0.8842832935042679,
|
|
"num_tokens": 600335530.0,
|
|
"step": 698
|
|
},
|
|
{
|
|
"entropy": 0.39007568359375,
|
|
"epoch": 5.5476190476190474,
|
|
"grad_norm": 0.6391375374634255,
|
|
"learning_rate": 1.601455127035717e-07,
|
|
"loss": 0.3303,
|
|
"mean_token_accuracy": 0.8812260185368359,
|
|
"num_tokens": 601219591.0,
|
|
"step": 699
|
|
},
|
|
{
|
|
"entropy": 0.3961639404296875,
|
|
"epoch": 5.555555555555555,
|
|
"grad_norm": 0.6312177790645791,
|
|
"learning_rate": 1.5469926953868063e-07,
|
|
"loss": 0.3277,
|
|
"mean_token_accuracy": 0.8838478000834584,
|
|
"num_tokens": 602080711.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 0.3947296142578125,
|
|
"epoch": 5.563492063492063,
|
|
"grad_norm": 0.6168029707269869,
|
|
"learning_rate": 1.4934578854416403e-07,
|
|
"loss": 0.3273,
|
|
"mean_token_accuracy": 0.8831527666188776,
|
|
"num_tokens": 602939251.0,
|
|
"step": 701
|
|
},
|
|
{
|
|
"entropy": 0.3935546875,
|
|
"epoch": 5.571428571428571,
|
|
"grad_norm": 0.6196463005466791,
|
|
"learning_rate": 1.440851722111858e-07,
|
|
"loss": 0.3214,
|
|
"mean_token_accuracy": 0.8847082569263875,
|
|
"num_tokens": 603814211.0,
|
|
"step": 702
|
|
},
|
|
{
|
|
"entropy": 0.3931884765625,
|
|
"epoch": 5.579365079365079,
|
|
"grad_norm": 0.6582430849772537,
|
|
"learning_rate": 1.389175212530397e-07,
|
|
"loss": 0.3279,
|
|
"mean_token_accuracy": 0.8828860782086849,
|
|
"num_tokens": 604668809.0,
|
|
"step": 703
|
|
},
|
|
{
|
|
"entropy": 0.395904541015625,
|
|
"epoch": 5.587301587301587,
|
|
"grad_norm": 0.6557024272945261,
|
|
"learning_rate": 1.3384293460321662e-07,
|
|
"loss": 0.3316,
|
|
"mean_token_accuracy": 0.8824727293103933,
|
|
"num_tokens": 605530653.0,
|
|
"step": 704
|
|
},
|
|
{
|
|
"entropy": 0.39093017578125,
|
|
"epoch": 5.595238095238095,
|
|
"grad_norm": 0.5952866739722456,
|
|
"learning_rate": 1.2886150941351317e-07,
|
|
"loss": 0.3282,
|
|
"mean_token_accuracy": 0.8835100992582738,
|
|
"num_tokens": 606405426.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 0.394195556640625,
|
|
"epoch": 5.603174603174603,
|
|
"grad_norm": 0.6329738035693675,
|
|
"learning_rate": 1.2397334105217097e-07,
|
|
"loss": 0.3251,
|
|
"mean_token_accuracy": 0.8846405958756804,
|
|
"num_tokens": 607272519.0,
|
|
"step": 706
|
|
},
|
|
{
|
|
"entropy": 0.3961181640625,
|
|
"epoch": 5.611111111111111,
|
|
"grad_norm": 0.6167631677238792,
|
|
"learning_rate": 1.1917852310205147e-07,
|
|
"loss": 0.3279,
|
|
"mean_token_accuracy": 0.883335932623595,
|
|
"num_tokens": 608126636.0,
|
|
"step": 707
|
|
},
|
|
{
|
|
"entropy": 0.3947296142578125,
|
|
"epoch": 5.619047619047619,
|
|
"grad_norm": 0.6123371507088005,
|
|
"learning_rate": 1.1447714735884463e-07,
|
|
"loss": 0.3197,
|
|
"mean_token_accuracy": 0.8852512533776462,
|
|
"num_tokens": 608972728.0,
|
|
"step": 708
|
|
},
|
|
{
|
|
"entropy": 0.3878631591796875,
|
|
"epoch": 5.6269841269841265,
|
|
"grad_norm": 0.6206905437393018,
|
|
"learning_rate": 1.0986930382930916e-07,
|
|
"loss": 0.3251,
|
|
"mean_token_accuracy": 0.8831897312775254,
|
|
"num_tokens": 609880767.0,
|
|
"step": 709
|
|
},
|
|
{
|
|
"entropy": 0.39312744140625,
|
|
"epoch": 5.634920634920634,
|
|
"grad_norm": 0.6362558759153903,
|
|
"learning_rate": 1.0535508072955225e-07,
|
|
"loss": 0.3284,
|
|
"mean_token_accuracy": 0.8815909679979086,
|
|
"num_tokens": 610732354.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 0.3891448974609375,
|
|
"epoch": 5.642857142857143,
|
|
"grad_norm": 0.6132840766865005,
|
|
"learning_rate": 1.0093456448333872e-07,
|
|
"loss": 0.3259,
|
|
"mean_token_accuracy": 0.8822133978828788,
|
|
"num_tokens": 611602054.0,
|
|
"step": 711
|
|
},
|
|
{
|
|
"entropy": 0.393310546875,
|
|
"epoch": 5.650793650793651,
|
|
"grad_norm": 0.6300293214423813,
|
|
"learning_rate": 9.660783972043786e-08,
|
|
"loss": 0.3285,
|
|
"mean_token_accuracy": 0.8818607972934842,
|
|
"num_tokens": 612448801.0,
|
|
"step": 712
|
|
},
|
|
{
|
|
"entropy": 0.3934173583984375,
|
|
"epoch": 5.658730158730159,
|
|
"grad_norm": 0.6904998334631229,
|
|
"learning_rate": 9.237498927500088e-08,
|
|
"loss": 0.3302,
|
|
"mean_token_accuracy": 0.881088858935982,
|
|
"num_tokens": 613302281.0,
|
|
"step": 713
|
|
},
|
|
{
|
|
"entropy": 0.3985748291015625,
|
|
"epoch": 5.666666666666667,
|
|
"grad_norm": 0.6336591385840509,
|
|
"learning_rate": 8.823609418397939e-08,
|
|
"loss": 0.324,
|
|
"mean_token_accuracy": 0.8830747129395604,
|
|
"num_tokens": 614133343.0,
|
|
"step": 714
|
|
},
|
|
{
|
|
"entropy": 0.3961181640625,
|
|
"epoch": 5.674603174603175,
|
|
"grad_norm": 0.6560814482182532,
|
|
"learning_rate": 8.419123368556991e-08,
|
|
"loss": 0.3281,
|
|
"mean_token_accuracy": 0.8820374673232436,
|
|
"num_tokens": 614971152.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 0.395751953125,
|
|
"epoch": 5.682539682539683,
|
|
"grad_norm": 0.61946329385844,
|
|
"learning_rate": 8.024048521769745e-08,
|
|
"loss": 0.3244,
|
|
"mean_token_accuracy": 0.8853642977774143,
|
|
"num_tokens": 615848347.0,
|
|
"step": 716
|
|
},
|
|
{
|
|
"entropy": 0.3966522216796875,
|
|
"epoch": 5.690476190476191,
|
|
"grad_norm": 0.6677220378111695,
|
|
"learning_rate": 7.638392441653542e-08,
|
|
"loss": 0.3315,
|
|
"mean_token_accuracy": 0.8841713918372989,
|
|
"num_tokens": 616696985.0,
|
|
"step": 717
|
|
},
|
|
{
|
|
"entropy": 0.386993408203125,
|
|
"epoch": 5.698412698412699,
|
|
"grad_norm": 0.6448605664251128,
|
|
"learning_rate": 7.262162511505466e-08,
|
|
"loss": 0.323,
|
|
"mean_token_accuracy": 0.8844177662394941,
|
|
"num_tokens": 617578479.0,
|
|
"step": 718
|
|
},
|
|
{
|
|
"entropy": 0.3964385986328125,
|
|
"epoch": 5.7063492063492065,
|
|
"grad_norm": 0.6465949886444489,
|
|
"learning_rate": 6.895365934161236e-08,
|
|
"loss": 0.3265,
|
|
"mean_token_accuracy": 0.883903234731406,
|
|
"num_tokens": 618415443.0,
|
|
"step": 719
|
|
},
|
|
{
|
|
"entropy": 0.3905792236328125,
|
|
"epoch": 5.714285714285714,
|
|
"grad_norm": 0.7626270402963101,
|
|
"learning_rate": 6.538009731857087e-08,
|
|
"loss": 0.3266,
|
|
"mean_token_accuracy": 0.8833001307211816,
|
|
"num_tokens": 619287864.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 0.3910675048828125,
|
|
"epoch": 5.722222222222222,
|
|
"grad_norm": 0.6559784419468065,
|
|
"learning_rate": 6.190100746095495e-08,
|
|
"loss": 0.3243,
|
|
"mean_token_accuracy": 0.8850177507847548,
|
|
"num_tokens": 620198481.0,
|
|
"step": 721
|
|
},
|
|
{
|
|
"entropy": 0.3970184326171875,
|
|
"epoch": 5.73015873015873,
|
|
"grad_norm": 0.6389982422035683,
|
|
"learning_rate": 5.851645637514114e-08,
|
|
"loss": 0.327,
|
|
"mean_token_accuracy": 0.8827162678353488,
|
|
"num_tokens": 621065449.0,
|
|
"step": 722
|
|
},
|
|
{
|
|
"entropy": 0.3971710205078125,
|
|
"epoch": 5.738095238095238,
|
|
"grad_norm": 0.6140180252911043,
|
|
"learning_rate": 5.522650885758374e-08,
|
|
"loss": 0.3204,
|
|
"mean_token_accuracy": 0.8850936810486019,
|
|
"num_tokens": 621906875.0,
|
|
"step": 723
|
|
},
|
|
{
|
|
"entropy": 0.3934326171875,
|
|
"epoch": 5.746031746031746,
|
|
"grad_norm": 0.6533501187294389,
|
|
"learning_rate": 5.203122789357307e-08,
|
|
"loss": 0.3342,
|
|
"mean_token_accuracy": 0.881910024676472,
|
|
"num_tokens": 622774268.0,
|
|
"step": 724
|
|
},
|
|
{
|
|
"entropy": 0.3961334228515625,
|
|
"epoch": 5.753968253968254,
|
|
"grad_norm": 0.6833130980259546,
|
|
"learning_rate": 4.893067465602863e-08,
|
|
"loss": 0.3307,
|
|
"mean_token_accuracy": 0.8820976046845317,
|
|
"num_tokens": 623625107.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 0.3968048095703125,
|
|
"epoch": 5.761904761904762,
|
|
"grad_norm": 0.763571756315237,
|
|
"learning_rate": 4.5924908504331735e-08,
|
|
"loss": 0.3303,
|
|
"mean_token_accuracy": 0.8829025984741747,
|
|
"num_tokens": 624511530.0,
|
|
"step": 726
|
|
},
|
|
{
|
|
"entropy": 0.392822265625,
|
|
"epoch": 5.76984126984127,
|
|
"grad_norm": 0.6168843969129469,
|
|
"learning_rate": 4.3013986983184705e-08,
|
|
"loss": 0.3234,
|
|
"mean_token_accuracy": 0.884893387556076,
|
|
"num_tokens": 625349169.0,
|
|
"step": 727
|
|
},
|
|
{
|
|
"entropy": 0.393035888671875,
|
|
"epoch": 5.777777777777778,
|
|
"grad_norm": 0.6294236264104904,
|
|
"learning_rate": 4.019796582151181e-08,
|
|
"loss": 0.3231,
|
|
"mean_token_accuracy": 0.883484820369631,
|
|
"num_tokens": 626205023.0,
|
|
"step": 728
|
|
},
|
|
{
|
|
"entropy": 0.392425537109375,
|
|
"epoch": 5.785714285714286,
|
|
"grad_norm": 0.7014456722284323,
|
|
"learning_rate": 3.747689893139228e-08,
|
|
"loss": 0.3253,
|
|
"mean_token_accuracy": 0.884223835542798,
|
|
"num_tokens": 627072409.0,
|
|
"step": 729
|
|
},
|
|
{
|
|
"entropy": 0.39483642578125,
|
|
"epoch": 5.7936507936507935,
|
|
"grad_norm": 0.6354033492793867,
|
|
"learning_rate": 3.4850838407027297e-08,
|
|
"loss": 0.3351,
|
|
"mean_token_accuracy": 0.8803266729228199,
|
|
"num_tokens": 627933762.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 0.392974853515625,
|
|
"epoch": 5.801587301587301,
|
|
"grad_norm": 0.6324944348230324,
|
|
"learning_rate": 3.2319834523742435e-08,
|
|
"loss": 0.3248,
|
|
"mean_token_accuracy": 0.8832277562469244,
|
|
"num_tokens": 628815182.0,
|
|
"step": 731
|
|
},
|
|
{
|
|
"entropy": 0.3950042724609375,
|
|
"epoch": 5.809523809523809,
|
|
"grad_norm": 0.629955389311813,
|
|
"learning_rate": 2.988393573702675e-08,
|
|
"loss": 0.3201,
|
|
"mean_token_accuracy": 0.884325556922704,
|
|
"num_tokens": 629666458.0,
|
|
"step": 732
|
|
},
|
|
{
|
|
"entropy": 0.3938446044921875,
|
|
"epoch": 5.817460317460317,
|
|
"grad_norm": 0.6166361263236042,
|
|
"learning_rate": 2.754318868160244e-08,
|
|
"loss": 0.3221,
|
|
"mean_token_accuracy": 0.8852822785265744,
|
|
"num_tokens": 630529787.0,
|
|
"step": 733
|
|
},
|
|
{
|
|
"entropy": 0.397705078125,
|
|
"epoch": 5.825396825396825,
|
|
"grad_norm": 0.644139271435762,
|
|
"learning_rate": 2.5297638170535542e-08,
|
|
"loss": 0.3219,
|
|
"mean_token_accuracy": 0.8845392796210945,
|
|
"num_tokens": 631374518.0,
|
|
"step": 734
|
|
},
|
|
{
|
|
"entropy": 0.3916778564453125,
|
|
"epoch": 5.833333333333333,
|
|
"grad_norm": 0.6747464037741341,
|
|
"learning_rate": 2.31473271943744e-08,
|
|
"loss": 0.3364,
|
|
"mean_token_accuracy": 0.8802501196041703,
|
|
"num_tokens": 632234249.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 0.3955078125,
|
|
"epoch": 5.841269841269841,
|
|
"grad_norm": 0.639174800920214,
|
|
"learning_rate": 2.109229692032977e-08,
|
|
"loss": 0.3255,
|
|
"mean_token_accuracy": 0.8846253966912627,
|
|
"num_tokens": 633096164.0,
|
|
"step": 736
|
|
},
|
|
{
|
|
"entropy": 0.3962554931640625,
|
|
"epoch": 5.849206349206349,
|
|
"grad_norm": 0.6365328488931461,
|
|
"learning_rate": 1.9132586691484323e-08,
|
|
"loss": 0.32,
|
|
"mean_token_accuracy": 0.8840158293023705,
|
|
"num_tokens": 633966696.0,
|
|
"step": 737
|
|
},
|
|
{
|
|
"entropy": 0.396240234375,
|
|
"epoch": 5.857142857142857,
|
|
"grad_norm": 0.6274839579353274,
|
|
"learning_rate": 1.7268234026041053e-08,
|
|
"loss": 0.3254,
|
|
"mean_token_accuracy": 0.8834780002944171,
|
|
"num_tokens": 634817676.0,
|
|
"step": 738
|
|
},
|
|
{
|
|
"entropy": 0.394012451171875,
|
|
"epoch": 5.865079365079366,
|
|
"grad_norm": 0.6482673934422294,
|
|
"learning_rate": 1.5499274616602723e-08,
|
|
"loss": 0.3246,
|
|
"mean_token_accuracy": 0.8846540525555611,
|
|
"num_tokens": 635677818.0,
|
|
"step": 739
|
|
},
|
|
{
|
|
"entropy": 0.3939056396484375,
|
|
"epoch": 5.8730158730158735,
|
|
"grad_norm": 0.6092517161901388,
|
|
"learning_rate": 1.3825742329492408e-08,
|
|
"loss": 0.3286,
|
|
"mean_token_accuracy": 0.8829479278065264,
|
|
"num_tokens": 636552146.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 0.3929901123046875,
|
|
"epoch": 5.880952380952381,
|
|
"grad_norm": 0.6568730517308575,
|
|
"learning_rate": 1.2247669204100699e-08,
|
|
"loss": 0.3324,
|
|
"mean_token_accuracy": 0.8809677893295884,
|
|
"num_tokens": 637434649.0,
|
|
"step": 741
|
|
},
|
|
{
|
|
"entropy": 0.3914947509765625,
|
|
"epoch": 5.888888888888889,
|
|
"grad_norm": 0.6200572094186191,
|
|
"learning_rate": 1.0765085452275614e-08,
|
|
"loss": 0.3292,
|
|
"mean_token_accuracy": 0.8837377014569938,
|
|
"num_tokens": 638294457.0,
|
|
"step": 742
|
|
},
|
|
{
|
|
"entropy": 0.3955535888671875,
|
|
"epoch": 5.896825396825397,
|
|
"grad_norm": 0.6375108635856576,
|
|
"learning_rate": 9.378019457743082e-09,
|
|
"loss": 0.3276,
|
|
"mean_token_accuracy": 0.8823563028126955,
|
|
"num_tokens": 639157905.0,
|
|
"step": 743
|
|
},
|
|
{
|
|
"entropy": 0.3954620361328125,
|
|
"epoch": 5.904761904761905,
|
|
"grad_norm": 0.6632626274807544,
|
|
"learning_rate": 8.086497775562918e-09,
|
|
"loss": 0.3306,
|
|
"mean_token_accuracy": 0.8825922501273453,
|
|
"num_tokens": 640011175.0,
|
|
"step": 744
|
|
},
|
|
{
|
|
"entropy": 0.3955535888671875,
|
|
"epoch": 5.912698412698413,
|
|
"grad_norm": 0.6203028600677455,
|
|
"learning_rate": 6.890545131621462e-09,
|
|
"loss": 0.3296,
|
|
"mean_token_accuracy": 0.8806997863575816,
|
|
"num_tokens": 640861100.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 0.3971710205078125,
|
|
"epoch": 5.920634920634921,
|
|
"grad_norm": 0.608418609243511,
|
|
"learning_rate": 5.790184422158063e-09,
|
|
"loss": 0.3201,
|
|
"mean_token_accuracy": 0.8849551575258374,
|
|
"num_tokens": 641710588.0,
|
|
"step": 746
|
|
},
|
|
{
|
|
"entropy": 0.398040771484375,
|
|
"epoch": 5.928571428571429,
|
|
"grad_norm": 0.6703364371814936,
|
|
"learning_rate": 4.785436713324876e-09,
|
|
"loss": 0.3223,
|
|
"mean_token_accuracy": 0.8841964863240719,
|
|
"num_tokens": 642574184.0,
|
|
"step": 747
|
|
},
|
|
{
|
|
"entropy": 0.390533447265625,
|
|
"epoch": 5.936507936507937,
|
|
"grad_norm": 0.6254194421549316,
|
|
"learning_rate": 3.876321240786629e-09,
|
|
"loss": 0.3255,
|
|
"mean_token_accuracy": 0.8837793176062405,
|
|
"num_tokens": 643427789.0,
|
|
"step": 748
|
|
},
|
|
{
|
|
"entropy": 0.391357421875,
|
|
"epoch": 5.944444444444445,
|
|
"grad_norm": 0.6047693920470687,
|
|
"learning_rate": 3.062855409350918e-09,
|
|
"loss": 0.3226,
|
|
"mean_token_accuracy": 0.8838337506167591,
|
|
"num_tokens": 644312139.0,
|
|
"step": 749
|
|
},
|
|
{
|
|
"entropy": 0.3914794921875,
|
|
"epoch": 5.9523809523809526,
|
|
"grad_norm": 0.620245834415008,
|
|
"learning_rate": 2.345054792634027e-09,
|
|
"loss": 0.3192,
|
|
"mean_token_accuracy": 0.8871927126310766,
|
|
"num_tokens": 645164595.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 0.3953399658203125,
|
|
"epoch": 5.9603174603174605,
|
|
"grad_norm": 0.597155253447394,
|
|
"learning_rate": 1.7229331327633935e-09,
|
|
"loss": 0.3258,
|
|
"mean_token_accuracy": 0.8841970260255039,
|
|
"num_tokens": 646011434.0,
|
|
"step": 751
|
|
},
|
|
{
|
|
"entropy": 0.3984527587890625,
|
|
"epoch": 5.968253968253968,
|
|
"grad_norm": 0.6471414852719458,
|
|
"learning_rate": 1.1965023401161457e-09,
|
|
"loss": 0.3224,
|
|
"mean_token_accuracy": 0.8848558394238353,
|
|
"num_tokens": 646838463.0,
|
|
"step": 752
|
|
},
|
|
{
|
|
"entropy": 0.3917388916015625,
|
|
"epoch": 5.976190476190476,
|
|
"grad_norm": 0.6216717165417032,
|
|
"learning_rate": 7.657724930887344e-10,
|
|
"loss": 0.3189,
|
|
"mean_token_accuracy": 0.8857344668358564,
|
|
"num_tokens": 647701193.0,
|
|
"step": 753
|
|
},
|
|
{
|
|
"entropy": 0.398895263671875,
|
|
"epoch": 5.984126984126984,
|
|
"grad_norm": 0.6120300659931632,
|
|
"learning_rate": 4.3075183790541875e-10,
|
|
"loss": 0.3228,
|
|
"mean_token_accuracy": 0.8856973070651293,
|
|
"num_tokens": 648543554.0,
|
|
"step": 754
|
|
},
|
|
{
|
|
"entropy": 0.39404296875,
|
|
"epoch": 5.992063492063492,
|
|
"grad_norm": 0.6483787873623111,
|
|
"learning_rate": 1.9144678845950393e-10,
|
|
"loss": 0.3344,
|
|
"mean_token_accuracy": 0.8800923773087561,
|
|
"num_tokens": 649397139.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 0.3967437744140625,
|
|
"epoch": 6.0,
|
|
"grad_norm": 0.6554944488124251,
|
|
"learning_rate": 4.786192619121721e-11,
|
|
"loss": 0.3239,
|
|
"mean_token_accuracy": 0.883914896287024,
|
|
"num_tokens": 650235668.0,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"step": 756,
|
|
"total_flos": 1202499003482112.0,
|
|
"train_loss": 0.440722111040953,
|
|
"train_runtime": 115067.3138,
|
|
"train_samples_per_second": 1.28,
|
|
"train_steps_per_second": 0.007
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 756,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 6,
|
|
"save_steps": 63,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1202499003482112.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|