Files
qwen2.5-3b-numina-sft/checkpoint-3315/trainer_state.json

3521 lines
97 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 200,
"global_step": 3315,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5955325569957495,
"epoch": 0.009051821679112922,
"grad_norm": 9.375,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.714,
"mean_token_accuracy": 0.8531711064279079,
"num_tokens": 73265.0,
"step": 10
},
{
"entropy": 0.601216921582818,
"epoch": 0.018103643358225844,
"grad_norm": 6.1875,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.6599,
"mean_token_accuracy": 0.8536688484251499,
"num_tokens": 151945.0,
"step": 20
},
{
"entropy": 0.619553691893816,
"epoch": 0.027155465037338764,
"grad_norm": 6.84375,
"learning_rate": 5.8e-06,
"loss": 0.5775,
"mean_token_accuracy": 0.863236166536808,
"num_tokens": 226319.0,
"step": 30
},
{
"entropy": 0.5566520445048809,
"epoch": 0.03620728671645169,
"grad_norm": 3.5625,
"learning_rate": 7.800000000000002e-06,
"loss": 0.4768,
"mean_token_accuracy": 0.8804129175841808,
"num_tokens": 300605.0,
"step": 40
},
{
"entropy": 0.46170808989554646,
"epoch": 0.04525910839556461,
"grad_norm": 82.5,
"learning_rate": 9.800000000000001e-06,
"loss": 0.4137,
"mean_token_accuracy": 0.8895155563950539,
"num_tokens": 372784.0,
"step": 50
},
{
"entropy": 0.4425128545612097,
"epoch": 0.05431093007467753,
"grad_norm": 2.46875,
"learning_rate": 1.18e-05,
"loss": 0.4117,
"mean_token_accuracy": 0.888019285351038,
"num_tokens": 444839.0,
"step": 60
},
{
"entropy": 0.41746695823967456,
"epoch": 0.06336275175379046,
"grad_norm": 2.21875,
"learning_rate": 1.38e-05,
"loss": 0.3682,
"mean_token_accuracy": 0.8940133817493916,
"num_tokens": 518697.0,
"step": 70
},
{
"entropy": 0.4343178853392601,
"epoch": 0.07241457343290338,
"grad_norm": 2.046875,
"learning_rate": 1.58e-05,
"loss": 0.3757,
"mean_token_accuracy": 0.8950759552419185,
"num_tokens": 591590.0,
"step": 80
},
{
"entropy": 0.40939719304442407,
"epoch": 0.0814663951120163,
"grad_norm": 1.5625,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.3417,
"mean_token_accuracy": 0.9000873222947121,
"num_tokens": 671925.0,
"step": 90
},
{
"entropy": 0.3810647170990705,
"epoch": 0.09051821679112922,
"grad_norm": 1.8984375,
"learning_rate": 1.98e-05,
"loss": 0.3389,
"mean_token_accuracy": 0.8990982733666897,
"num_tokens": 744979.0,
"step": 100
},
{
"entropy": 0.34255886916071177,
"epoch": 0.09957003847024214,
"grad_norm": 1.359375,
"learning_rate": 1.9999613285893108e-05,
"loss": 0.3095,
"mean_token_accuracy": 0.901997297257185,
"num_tokens": 824704.0,
"step": 110
},
{
"entropy": 0.3227020036429167,
"epoch": 0.10862186014935506,
"grad_norm": 1.8203125,
"learning_rate": 1.9998276534787115e-05,
"loss": 0.3199,
"mean_token_accuracy": 0.8994096107780933,
"num_tokens": 899168.0,
"step": 120
},
{
"entropy": 0.31191693879663945,
"epoch": 0.11767368182846798,
"grad_norm": 1.6796875,
"learning_rate": 1.9995985100042836e-05,
"loss": 0.3029,
"mean_token_accuracy": 0.9058460839092731,
"num_tokens": 974442.0,
"step": 130
},
{
"entropy": 0.3080141399055719,
"epoch": 0.1267255035075809,
"grad_norm": 1.9609375,
"learning_rate": 1.9992739200457505e-05,
"loss": 0.2907,
"mean_token_accuracy": 0.9111274629831314,
"num_tokens": 1054948.0,
"step": 140
},
{
"entropy": 0.311343339830637,
"epoch": 0.13577732518669383,
"grad_norm": 1.453125,
"learning_rate": 1.998853914596526e-05,
"loss": 0.2974,
"mean_token_accuracy": 0.9076914019882679,
"num_tokens": 1127040.0,
"step": 150
},
{
"entropy": 0.31157970502972604,
"epoch": 0.14482914686580675,
"grad_norm": 1.8125,
"learning_rate": 1.998338533760755e-05,
"loss": 0.3007,
"mean_token_accuracy": 0.9092378221452236,
"num_tokens": 1196429.0,
"step": 160
},
{
"entropy": 0.3010763289406896,
"epoch": 0.15388096854491967,
"grad_norm": 1.71875,
"learning_rate": 1.9977278267494844e-05,
"loss": 0.2976,
"mean_token_accuracy": 0.9085726246237755,
"num_tokens": 1275764.0,
"step": 170
},
{
"entropy": 0.31990424413234,
"epoch": 0.1629327902240326,
"grad_norm": 1.578125,
"learning_rate": 1.9970218518759626e-05,
"loss": 0.3079,
"mean_token_accuracy": 0.9044359527528286,
"num_tokens": 1353869.0,
"step": 180
},
{
"entropy": 0.29817488249391316,
"epoch": 0.1719846119031455,
"grad_norm": 1.765625,
"learning_rate": 1.9962206765500744e-05,
"loss": 0.2943,
"mean_token_accuracy": 0.9098256938159466,
"num_tokens": 1426066.0,
"step": 190
},
{
"entropy": 0.30092692840844393,
"epoch": 0.18103643358225843,
"grad_norm": 1.84375,
"learning_rate": 1.995324377271901e-05,
"loss": 0.3224,
"mean_token_accuracy": 0.9066158257424831,
"num_tokens": 1494682.0,
"step": 200
},
{
"epoch": 0.18103643358225843,
"eval_entropy": 0.31176194101572036,
"eval_loss": 0.29660511016845703,
"eval_mean_token_accuracy": 0.9063085055351258,
"eval_num_tokens": 1494682.0,
"eval_runtime": 9.3564,
"eval_samples_per_second": 53.439,
"eval_steps_per_second": 26.72,
"step": 200
},
{
"entropy": 0.30449779201298954,
"epoch": 0.19008825526137135,
"grad_norm": 1.5546875,
"learning_rate": 1.9943330396244186e-05,
"loss": 0.2936,
"mean_token_accuracy": 0.9093848384916783,
"num_tokens": 1574246.0,
"step": 210
},
{
"entropy": 0.29794183522462847,
"epoch": 0.19914007694048427,
"grad_norm": 1.484375,
"learning_rate": 1.993246758265324e-05,
"loss": 0.3014,
"mean_token_accuracy": 0.906050831079483,
"num_tokens": 1649636.0,
"step": 220
},
{
"entropy": 0.31836145240813496,
"epoch": 0.2081918986195972,
"grad_norm": 1.375,
"learning_rate": 1.992065636917998e-05,
"loss": 0.3124,
"mean_token_accuracy": 0.9049551673233509,
"num_tokens": 1723726.0,
"step": 230
},
{
"entropy": 0.3080146433785558,
"epoch": 0.2172437202987101,
"grad_norm": 1.8984375,
"learning_rate": 1.9907897883615997e-05,
"loss": 0.3253,
"mean_token_accuracy": 0.9040355876088142,
"num_tokens": 1800283.0,
"step": 240
},
{
"entropy": 0.28388842958956956,
"epoch": 0.22629554197782303,
"grad_norm": 1.6015625,
"learning_rate": 1.9894193344202993e-05,
"loss": 0.2784,
"mean_token_accuracy": 0.9126260556280613,
"num_tokens": 1870003.0,
"step": 250
},
{
"entropy": 0.28803354129195213,
"epoch": 0.23534736365693595,
"grad_norm": 1.4140625,
"learning_rate": 1.987954405951645e-05,
"loss": 0.2902,
"mean_token_accuracy": 0.9081695273518562,
"num_tokens": 1942710.0,
"step": 260
},
{
"entropy": 0.2951167915016413,
"epoch": 0.24439918533604887,
"grad_norm": 1.5625,
"learning_rate": 1.986395142834069e-05,
"loss": 0.3018,
"mean_token_accuracy": 0.9062638804316521,
"num_tokens": 2016368.0,
"step": 270
},
{
"entropy": 0.3020996805280447,
"epoch": 0.2534510070151618,
"grad_norm": 1.9765625,
"learning_rate": 1.984741693953529e-05,
"loss": 0.308,
"mean_token_accuracy": 0.9063209906220436,
"num_tokens": 2095809.0,
"step": 280
},
{
"entropy": 0.3082812769338489,
"epoch": 0.2625028286942747,
"grad_norm": 1.8046875,
"learning_rate": 1.9829942171892953e-05,
"loss": 0.3137,
"mean_token_accuracy": 0.9035673819482326,
"num_tokens": 2166206.0,
"step": 290
},
{
"entropy": 0.3025300450623035,
"epoch": 0.27155465037338766,
"grad_norm": 1.859375,
"learning_rate": 1.981152879398872e-05,
"loss": 0.2972,
"mean_token_accuracy": 0.9058074496686459,
"num_tokens": 2235018.0,
"step": 300
},
{
"entropy": 0.2980365388095379,
"epoch": 0.28060647205250056,
"grad_norm": 1.640625,
"learning_rate": 1.9792178564020676e-05,
"loss": 0.2879,
"mean_token_accuracy": 0.9077603787183761,
"num_tokens": 2304709.0,
"step": 310
},
{
"entropy": 0.2872885692864656,
"epoch": 0.2896582937316135,
"grad_norm": 1.8984375,
"learning_rate": 1.9771893329642042e-05,
"loss": 0.2889,
"mean_token_accuracy": 0.910741999745369,
"num_tokens": 2380839.0,
"step": 320
},
{
"entropy": 0.2809945376589894,
"epoch": 0.2987101154107264,
"grad_norm": 1.78125,
"learning_rate": 1.975067502778479e-05,
"loss": 0.2852,
"mean_token_accuracy": 0.9113192833960057,
"num_tokens": 2447193.0,
"step": 330
},
{
"entropy": 0.30356528908014296,
"epoch": 0.30776193708983934,
"grad_norm": 1.7890625,
"learning_rate": 1.9728525684474654e-05,
"loss": 0.3216,
"mean_token_accuracy": 0.9036834843456745,
"num_tokens": 2523000.0,
"step": 340
},
{
"entropy": 0.30291730873286726,
"epoch": 0.31681375876895224,
"grad_norm": 2.53125,
"learning_rate": 1.97054474146377e-05,
"loss": 0.306,
"mean_token_accuracy": 0.9072510033845902,
"num_tokens": 2597529.0,
"step": 350
},
{
"entropy": 0.30994318779557944,
"epoch": 0.3258655804480652,
"grad_norm": 1.5,
"learning_rate": 1.968144242189838e-05,
"loss": 0.3241,
"mean_token_accuracy": 0.9031497567892075,
"num_tokens": 2676886.0,
"step": 360
},
{
"entropy": 0.2981043761596084,
"epoch": 0.3349174021271781,
"grad_norm": 1.390625,
"learning_rate": 1.9656512998369105e-05,
"loss": 0.2952,
"mean_token_accuracy": 0.9080876216292382,
"num_tokens": 2749137.0,
"step": 370
},
{
"entropy": 0.2890668235719204,
"epoch": 0.343969223806291,
"grad_norm": 1.78125,
"learning_rate": 1.9630661524431408e-05,
"loss": 0.2883,
"mean_token_accuracy": 0.9096374675631523,
"num_tokens": 2818103.0,
"step": 380
},
{
"entropy": 0.28160234112292526,
"epoch": 0.3530210454854039,
"grad_norm": 1.96875,
"learning_rate": 1.960389046850863e-05,
"loss": 0.2732,
"mean_token_accuracy": 0.9154746599495411,
"num_tokens": 2887552.0,
"step": 390
},
{
"entropy": 0.2931002199649811,
"epoch": 0.36207286716451687,
"grad_norm": 1.84375,
"learning_rate": 1.9576202386830233e-05,
"loss": 0.3161,
"mean_token_accuracy": 0.9047551721334457,
"num_tokens": 2966949.0,
"step": 400
},
{
"epoch": 0.36207286716451687,
"eval_entropy": 0.2931007430553436,
"eval_loss": 0.29184412956237793,
"eval_mean_token_accuracy": 0.9072571034431458,
"eval_num_tokens": 2966949.0,
"eval_runtime": 9.3729,
"eval_samples_per_second": 53.345,
"eval_steps_per_second": 26.673,
"step": 400
},
{
"entropy": 0.307697238586843,
"epoch": 0.37112468884362976,
"grad_norm": 1.65625,
"learning_rate": 1.9547599923187724e-05,
"loss": 0.3194,
"mean_token_accuracy": 0.9044582702219486,
"num_tokens": 3039683.0,
"step": 410
},
{
"entropy": 0.2976506020873785,
"epoch": 0.3801765105227427,
"grad_norm": 1.984375,
"learning_rate": 1.9518085808682207e-05,
"loss": 0.3012,
"mean_token_accuracy": 0.9044804252684117,
"num_tokens": 3114377.0,
"step": 420
},
{
"entropy": 0.29503822289407255,
"epoch": 0.3892283322018556,
"grad_norm": 1.5390625,
"learning_rate": 1.9487662861463593e-05,
"loss": 0.2962,
"mean_token_accuracy": 0.9089334838092327,
"num_tokens": 3184012.0,
"step": 430
},
{
"entropy": 0.29528967328369615,
"epoch": 0.39828015388096855,
"grad_norm": 1.6328125,
"learning_rate": 1.9456333986461535e-05,
"loss": 0.2907,
"mean_token_accuracy": 0.9082771897315979,
"num_tokens": 3257132.0,
"step": 440
},
{
"entropy": 0.2871313957497478,
"epoch": 0.4073319755600815,
"grad_norm": 1.796875,
"learning_rate": 1.9424102175108034e-05,
"loss": 0.2906,
"mean_token_accuracy": 0.9087129518389702,
"num_tokens": 3333004.0,
"step": 450
},
{
"entropy": 0.2987273294478655,
"epoch": 0.4163837972391944,
"grad_norm": 1.515625,
"learning_rate": 1.9390970505051803e-05,
"loss": 0.3005,
"mean_token_accuracy": 0.9063193209469318,
"num_tokens": 3405646.0,
"step": 460
},
{
"entropy": 0.28730762992054226,
"epoch": 0.42543561891830733,
"grad_norm": 1.59375,
"learning_rate": 1.935694213986441e-05,
"loss": 0.2917,
"mean_token_accuracy": 0.9086456030607224,
"num_tokens": 3474841.0,
"step": 470
},
{
"entropy": 0.29593063089996574,
"epoch": 0.4344874405974202,
"grad_norm": 1.5625,
"learning_rate": 1.9322020328738183e-05,
"loss": 0.2909,
"mean_token_accuracy": 0.9100452527403832,
"num_tokens": 3550524.0,
"step": 480
},
{
"entropy": 0.29435052666813133,
"epoch": 0.4435392622765332,
"grad_norm": 1.609375,
"learning_rate": 1.928620840617598e-05,
"loss": 0.2988,
"mean_token_accuracy": 0.9083921477198601,
"num_tokens": 3622971.0,
"step": 490
},
{
"entropy": 0.2887372709810734,
"epoch": 0.45259108395564607,
"grad_norm": 1.9921875,
"learning_rate": 1.9249509791672802e-05,
"loss": 0.2819,
"mean_token_accuracy": 0.9114750146865844,
"num_tokens": 3692024.0,
"step": 500
},
{
"entropy": 0.2885109892114997,
"epoch": 0.461642905634759,
"grad_norm": 1.5703125,
"learning_rate": 1.921192798938925e-05,
"loss": 0.2758,
"mean_token_accuracy": 0.9127115234732628,
"num_tokens": 3762911.0,
"step": 510
},
{
"entropy": 0.29927421547472477,
"epoch": 0.4706947273138719,
"grad_norm": 1.8984375,
"learning_rate": 1.917346658781697e-05,
"loss": 0.3073,
"mean_token_accuracy": 0.9056219220161438,
"num_tokens": 3832948.0,
"step": 520
},
{
"entropy": 0.28745833188295367,
"epoch": 0.47974654899298486,
"grad_norm": 1.5078125,
"learning_rate": 1.9134129259435973e-05,
"loss": 0.3035,
"mean_token_accuracy": 0.9091703072190285,
"num_tokens": 3909890.0,
"step": 530
},
{
"entropy": 0.2832237346097827,
"epoch": 0.48879837067209775,
"grad_norm": 1.4375,
"learning_rate": 1.9093919760363996e-05,
"loss": 0.2625,
"mean_token_accuracy": 0.9148759163916111,
"num_tokens": 3984217.0,
"step": 540
},
{
"entropy": 0.29212499912828205,
"epoch": 0.4978501923512107,
"grad_norm": 1.671875,
"learning_rate": 1.905284192999783e-05,
"loss": 0.2967,
"mean_token_accuracy": 0.906308326870203,
"num_tokens": 4063305.0,
"step": 550
},
{
"entropy": 0.28526828065514565,
"epoch": 0.5069020140303236,
"grad_norm": 1.453125,
"learning_rate": 1.9010899690646723e-05,
"loss": 0.2929,
"mean_token_accuracy": 0.9091045394539833,
"num_tokens": 4135755.0,
"step": 560
},
{
"entropy": 0.29713739454746246,
"epoch": 0.5159538357094365,
"grad_norm": 1.609375,
"learning_rate": 1.896809704715787e-05,
"loss": 0.3054,
"mean_token_accuracy": 0.9077155306935311,
"num_tokens": 4215297.0,
"step": 570
},
{
"entropy": 0.29641823247075083,
"epoch": 0.5250056573885494,
"grad_norm": 1.3984375,
"learning_rate": 1.8924438086533986e-05,
"loss": 0.3045,
"mean_token_accuracy": 0.9068859592080116,
"num_tokens": 4287199.0,
"step": 580
},
{
"entropy": 0.28421772718429567,
"epoch": 0.5340574790676623,
"grad_norm": 1.8515625,
"learning_rate": 1.8879926977543086e-05,
"loss": 0.2828,
"mean_token_accuracy": 0.9111842639744282,
"num_tokens": 4358560.0,
"step": 590
},
{
"entropy": 0.2928624337539077,
"epoch": 0.5431093007467753,
"grad_norm": 1.625,
"learning_rate": 1.8834567970320413e-05,
"loss": 0.2985,
"mean_token_accuracy": 0.9082511819899082,
"num_tokens": 4431086.0,
"step": 600
},
{
"epoch": 0.5431093007467753,
"eval_entropy": 0.29717582327127456,
"eval_loss": 0.2887883186340332,
"eval_mean_token_accuracy": 0.9083560631275177,
"eval_num_tokens": 4431086.0,
"eval_runtime": 9.3798,
"eval_samples_per_second": 53.306,
"eval_steps_per_second": 26.653,
"step": 600
},
{
"entropy": 0.2891200602054596,
"epoch": 0.5521611224258882,
"grad_norm": 1.203125,
"learning_rate": 1.8788365395962613e-05,
"loss": 0.2879,
"mean_token_accuracy": 0.9087440736591816,
"num_tokens": 4506892.0,
"step": 610
},
{
"entropy": 0.3021440252661705,
"epoch": 0.5612129441050011,
"grad_norm": 1.59375,
"learning_rate": 1.8741323666114207e-05,
"loss": 0.3107,
"mean_token_accuracy": 0.9060862340033055,
"num_tokens": 4581533.0,
"step": 620
},
{
"entropy": 0.2959118351340294,
"epoch": 0.570264765784114,
"grad_norm": 2.078125,
"learning_rate": 1.8693447272546313e-05,
"loss": 0.2977,
"mean_token_accuracy": 0.9070049889385701,
"num_tokens": 4653692.0,
"step": 630
},
{
"entropy": 0.27470533456653357,
"epoch": 0.579316587463227,
"grad_norm": 1.5234375,
"learning_rate": 1.8644740786727763e-05,
"loss": 0.2725,
"mean_token_accuracy": 0.9146642610430717,
"num_tokens": 4721773.0,
"step": 640
},
{
"entropy": 0.3039343884214759,
"epoch": 0.5883684091423399,
"grad_norm": 1.8125,
"learning_rate": 1.859520885938861e-05,
"loss": 0.3184,
"mean_token_accuracy": 0.9047698460519313,
"num_tokens": 4802409.0,
"step": 650
},
{
"entropy": 0.3038432693108916,
"epoch": 0.5974202308214528,
"grad_norm": 1.609375,
"learning_rate": 1.854485622007603e-05,
"loss": 0.302,
"mean_token_accuracy": 0.9063945829868316,
"num_tokens": 4876537.0,
"step": 660
},
{
"entropy": 0.27984238266944883,
"epoch": 0.6064720525005657,
"grad_norm": 1.6953125,
"learning_rate": 1.8493687676702743e-05,
"loss": 0.2865,
"mean_token_accuracy": 0.9099802240729332,
"num_tokens": 4947515.0,
"step": 670
},
{
"entropy": 0.28516580928117036,
"epoch": 0.6155238741796787,
"grad_norm": 1.8125,
"learning_rate": 1.8441708115087917e-05,
"loss": 0.2864,
"mean_token_accuracy": 0.9089432120323181,
"num_tokens": 5025785.0,
"step": 680
},
{
"entropy": 0.2929608277976513,
"epoch": 0.6245756958587916,
"grad_norm": 1.4921875,
"learning_rate": 1.8388922498490653e-05,
"loss": 0.3068,
"mean_token_accuracy": 0.9048882365226746,
"num_tokens": 5102385.0,
"step": 690
},
{
"entropy": 0.30293994322419165,
"epoch": 0.6336275175379045,
"grad_norm": 1.421875,
"learning_rate": 1.8335335867136064e-05,
"loss": 0.3103,
"mean_token_accuracy": 0.9059915870428086,
"num_tokens": 5181243.0,
"step": 700
},
{
"entropy": 0.311605279520154,
"epoch": 0.6426793392170175,
"grad_norm": 1.5078125,
"learning_rate": 1.8280953337734016e-05,
"loss": 0.3232,
"mean_token_accuracy": 0.9042202472686768,
"num_tokens": 5256020.0,
"step": 710
},
{
"entropy": 0.28142720870673654,
"epoch": 0.6517311608961304,
"grad_norm": 2.1875,
"learning_rate": 1.8225780102990563e-05,
"loss": 0.268,
"mean_token_accuracy": 0.9141685187816619,
"num_tokens": 5329019.0,
"step": 720
},
{
"entropy": 0.29012490045279266,
"epoch": 0.6607829825752433,
"grad_norm": 1.578125,
"learning_rate": 1.8169821431112104e-05,
"loss": 0.2935,
"mean_token_accuracy": 0.9081948816776275,
"num_tokens": 5407811.0,
"step": 730
},
{
"entropy": 0.2991213478147984,
"epoch": 0.6698348042543562,
"grad_norm": 1.6875,
"learning_rate": 1.8113082665302366e-05,
"loss": 0.2983,
"mean_token_accuracy": 0.9063392855226994,
"num_tokens": 5477168.0,
"step": 740
},
{
"entropy": 0.29746117200702427,
"epoch": 0.6788866259334692,
"grad_norm": 1.7578125,
"learning_rate": 1.8055569223252215e-05,
"loss": 0.2978,
"mean_token_accuracy": 0.9071070611476898,
"num_tokens": 5555941.0,
"step": 750
},
{
"entropy": 0.26622334159910677,
"epoch": 0.687938447612582,
"grad_norm": 1.625,
"learning_rate": 1.799728659662232e-05,
"loss": 0.2602,
"mean_token_accuracy": 0.9196423992514611,
"num_tokens": 5626518.0,
"step": 760
},
{
"entropy": 0.2931471846997738,
"epoch": 0.6969902692916949,
"grad_norm": 1.609375,
"learning_rate": 1.793824035051882e-05,
"loss": 0.3079,
"mean_token_accuracy": 0.9071181505918503,
"num_tokens": 5699632.0,
"step": 770
},
{
"entropy": 0.2865557339042425,
"epoch": 0.7060420909708078,
"grad_norm": 1.5078125,
"learning_rate": 1.787843612296191e-05,
"loss": 0.2927,
"mean_token_accuracy": 0.9099504724144936,
"num_tokens": 5766441.0,
"step": 780
},
{
"entropy": 0.2953029813244939,
"epoch": 0.7150939126499208,
"grad_norm": 1.6328125,
"learning_rate": 1.781787962434751e-05,
"loss": 0.2973,
"mean_token_accuracy": 0.906347993761301,
"num_tokens": 5840121.0,
"step": 790
},
{
"entropy": 0.2896912330761552,
"epoch": 0.7241457343290337,
"grad_norm": 1.453125,
"learning_rate": 1.7756576636902013e-05,
"loss": 0.287,
"mean_token_accuracy": 0.9087827295064926,
"num_tokens": 5913798.0,
"step": 800
},
{
"epoch": 0.7241457343290337,
"eval_entropy": 0.2926266292333603,
"eval_loss": 0.28681185841560364,
"eval_mean_token_accuracy": 0.9089032611846923,
"eval_num_tokens": 5913798.0,
"eval_runtime": 9.3648,
"eval_samples_per_second": 53.391,
"eval_steps_per_second": 26.696,
"step": 800
},
{
"entropy": 0.3035820659250021,
"epoch": 0.7331975560081466,
"grad_norm": 1.28125,
"learning_rate": 1.769453301413016e-05,
"loss": 0.3031,
"mean_token_accuracy": 0.9059673763811589,
"num_tokens": 5995879.0,
"step": 810
},
{
"entropy": 0.3072130227461457,
"epoch": 0.7422493776872595,
"grad_norm": 1.7890625,
"learning_rate": 1.7631754680256118e-05,
"loss": 0.3046,
"mean_token_accuracy": 0.9036270663142204,
"num_tokens": 6068519.0,
"step": 820
},
{
"entropy": 0.2819986244663596,
"epoch": 0.7513011993663725,
"grad_norm": 1.5546875,
"learning_rate": 1.7568247629657816e-05,
"loss": 0.2865,
"mean_token_accuracy": 0.9112287394702434,
"num_tokens": 6139028.0,
"step": 830
},
{
"entropy": 0.29264403488487006,
"epoch": 0.7603530210454854,
"grad_norm": 1.828125,
"learning_rate": 1.750401792629457e-05,
"loss": 0.3128,
"mean_token_accuracy": 0.9057952709496021,
"num_tokens": 6211903.0,
"step": 840
},
{
"entropy": 0.2960445500910282,
"epoch": 0.7694048427245983,
"grad_norm": 1.34375,
"learning_rate": 1.7439071703128068e-05,
"loss": 0.3048,
"mean_token_accuracy": 0.9075722090899945,
"num_tokens": 6289443.0,
"step": 850
},
{
"entropy": 0.2825075998902321,
"epoch": 0.7784566644037112,
"grad_norm": 1.5078125,
"learning_rate": 1.7373415161536752e-05,
"loss": 0.2922,
"mean_token_accuracy": 0.907506238669157,
"num_tokens": 6370438.0,
"step": 860
},
{
"entropy": 0.3036248629912734,
"epoch": 0.7875084860828242,
"grad_norm": 2.34375,
"learning_rate": 1.73070545707237e-05,
"loss": 0.32,
"mean_token_accuracy": 0.9044312626123429,
"num_tokens": 6442859.0,
"step": 870
},
{
"entropy": 0.30435681212693455,
"epoch": 0.7965603077619371,
"grad_norm": 1.4921875,
"learning_rate": 1.7239996267118003e-05,
"loss": 0.2982,
"mean_token_accuracy": 0.9066942445933819,
"num_tokens": 6520055.0,
"step": 880
},
{
"entropy": 0.3001150920987129,
"epoch": 0.80561212944105,
"grad_norm": 1.8046875,
"learning_rate": 1.717224665376973e-05,
"loss": 0.3052,
"mean_token_accuracy": 0.904937519133091,
"num_tokens": 6596737.0,
"step": 890
},
{
"entropy": 0.303386352583766,
"epoch": 0.814663951120163,
"grad_norm": 1.703125,
"learning_rate": 1.7103812199738538e-05,
"loss": 0.3095,
"mean_token_accuracy": 0.9060303725302219,
"num_tokens": 6677638.0,
"step": 900
},
{
"entropy": 0.28049613647162913,
"epoch": 0.8237157727992759,
"grad_norm": 1.59375,
"learning_rate": 1.703469943947597e-05,
"loss": 0.2875,
"mean_token_accuracy": 0.911004551500082,
"num_tokens": 6749581.0,
"step": 910
},
{
"entropy": 0.31584621611982583,
"epoch": 0.8327675944783888,
"grad_norm": 1.6484375,
"learning_rate": 1.6964914972201522e-05,
"loss": 0.3265,
"mean_token_accuracy": 0.9020559079945087,
"num_tokens": 6826238.0,
"step": 920
},
{
"entropy": 0.27345783039927485,
"epoch": 0.8418194161575017,
"grad_norm": 1.7734375,
"learning_rate": 1.6894465461272513e-05,
"loss": 0.2763,
"mean_token_accuracy": 0.913234294205904,
"num_tokens": 6893946.0,
"step": 930
},
{
"entropy": 0.2751306457445025,
"epoch": 0.8508712378366147,
"grad_norm": 1.3203125,
"learning_rate": 1.6823357633547832e-05,
"loss": 0.2754,
"mean_token_accuracy": 0.9122063621878624,
"num_tokens": 6970697.0,
"step": 940
},
{
"entropy": 0.2728879269212484,
"epoch": 0.8599230595157276,
"grad_norm": 1.6640625,
"learning_rate": 1.6751598278745636e-05,
"loss": 0.277,
"mean_token_accuracy": 0.9147410795092583,
"num_tokens": 7037829.0,
"step": 950
},
{
"entropy": 0.2938439719378948,
"epoch": 0.8689748811948405,
"grad_norm": 1.703125,
"learning_rate": 1.6679194248795018e-05,
"loss": 0.3092,
"mean_token_accuracy": 0.9080867692828178,
"num_tokens": 7113672.0,
"step": 960
},
{
"entropy": 0.2854067673906684,
"epoch": 0.8780267028739533,
"grad_norm": 1.65625,
"learning_rate": 1.660615245718177e-05,
"loss": 0.277,
"mean_token_accuracy": 0.9124735839664936,
"num_tokens": 7183355.0,
"step": 970
},
{
"entropy": 0.2932927643880248,
"epoch": 0.8870785245530663,
"grad_norm": 1.75,
"learning_rate": 1.6532479878288237e-05,
"loss": 0.3118,
"mean_token_accuracy": 0.9067716941237449,
"num_tokens": 7258420.0,
"step": 980
},
{
"entropy": 0.27898168824613095,
"epoch": 0.8961303462321792,
"grad_norm": 1.3125,
"learning_rate": 1.645818354672738e-05,
"loss": 0.2646,
"mean_token_accuracy": 0.9143994279205799,
"num_tokens": 7327278.0,
"step": 990
},
{
"entropy": 0.2950664022937417,
"epoch": 0.9051821679112921,
"grad_norm": 1.828125,
"learning_rate": 1.6383270556671067e-05,
"loss": 0.2992,
"mean_token_accuracy": 0.9097571104764939,
"num_tokens": 7398297.0,
"step": 1000
},
{
"epoch": 0.9051821679112921,
"eval_entropy": 0.2895762438774109,
"eval_loss": 0.2849172353744507,
"eval_mean_token_accuracy": 0.9092698242664338,
"eval_num_tokens": 7398297.0,
"eval_runtime": 9.4134,
"eval_samples_per_second": 53.116,
"eval_steps_per_second": 26.558,
"step": 1000
},
{
"entropy": 0.2860976686701179,
"epoch": 0.914233989590405,
"grad_norm": 1.578125,
"learning_rate": 1.6307748061172687e-05,
"loss": 0.2902,
"mean_token_accuracy": 0.9105670429766178,
"num_tokens": 7469920.0,
"step": 1010
},
{
"entropy": 0.3092665681615472,
"epoch": 0.923285811269518,
"grad_norm": 1.4921875,
"learning_rate": 1.6231623271484158e-05,
"loss": 0.3148,
"mean_token_accuracy": 0.904122719168663,
"num_tokens": 7541832.0,
"step": 1020
},
{
"entropy": 0.3033360539004207,
"epoch": 0.9323376329486309,
"grad_norm": 1.3828125,
"learning_rate": 1.615490345636734e-05,
"loss": 0.3108,
"mean_token_accuracy": 0.904674070328474,
"num_tokens": 7618017.0,
"step": 1030
},
{
"entropy": 0.2807492554187775,
"epoch": 0.9413894546277438,
"grad_norm": 1.5234375,
"learning_rate": 1.6077595941399997e-05,
"loss": 0.2828,
"mean_token_accuracy": 0.9098519176244736,
"num_tokens": 7693714.0,
"step": 1040
},
{
"entropy": 0.2958640310913324,
"epoch": 0.9504412763068567,
"grad_norm": 4.0,
"learning_rate": 1.5999708108276297e-05,
"loss": 0.3073,
"mean_token_accuracy": 0.9051192864775658,
"num_tokens": 7770364.0,
"step": 1050
},
{
"entropy": 0.305174994841218,
"epoch": 0.9594930979859697,
"grad_norm": 1.71875,
"learning_rate": 1.5921247394102e-05,
"loss": 0.3091,
"mean_token_accuracy": 0.9064687371253968,
"num_tokens": 7847319.0,
"step": 1060
},
{
"entropy": 0.30125368386507034,
"epoch": 0.9685449196650826,
"grad_norm": 1.6171875,
"learning_rate": 1.584222129068429e-05,
"loss": 0.2968,
"mean_token_accuracy": 0.9080850504338741,
"num_tokens": 7919917.0,
"step": 1070
},
{
"entropy": 0.2805282440036535,
"epoch": 0.9775967413441955,
"grad_norm": 1.6875,
"learning_rate": 1.5762637343816455e-05,
"loss": 0.281,
"mean_token_accuracy": 0.9120527848601341,
"num_tokens": 7994892.0,
"step": 1080
},
{
"entropy": 0.26937505435198544,
"epoch": 0.9866485630233084,
"grad_norm": 1.4296875,
"learning_rate": 1.5682503152557362e-05,
"loss": 0.2869,
"mean_token_accuracy": 0.9115587025880814,
"num_tokens": 8067021.0,
"step": 1090
},
{
"entropy": 0.29133475106209517,
"epoch": 0.9957003847024214,
"grad_norm": 1.8671875,
"learning_rate": 1.5601826368505863e-05,
"loss": 0.2947,
"mean_token_accuracy": 0.9103871814906597,
"num_tokens": 8144631.0,
"step": 1100
},
{
"entropy": 0.29290189173741216,
"epoch": 1.0045259108395566,
"grad_norm": 1.484375,
"learning_rate": 1.5520614695070185e-05,
"loss": 0.29,
"mean_token_accuracy": 0.9142394998134711,
"num_tokens": 8216020.0,
"step": 1110
},
{
"entropy": 0.24365324322134257,
"epoch": 1.0135777325186694,
"grad_norm": 1.7890625,
"learning_rate": 1.5438875886732376e-05,
"loss": 0.2388,
"mean_token_accuracy": 0.9264398336410522,
"num_tokens": 8288245.0,
"step": 1120
},
{
"entropy": 0.2383655753917992,
"epoch": 1.0226295541977823,
"grad_norm": 1.90625,
"learning_rate": 1.5356617748307857e-05,
"loss": 0.2563,
"mean_token_accuracy": 0.9211653597652912,
"num_tokens": 8358954.0,
"step": 1130
},
{
"entropy": 0.24438943453133105,
"epoch": 1.0316813758768952,
"grad_norm": 1.203125,
"learning_rate": 1.52738481342002e-05,
"loss": 0.247,
"mean_token_accuracy": 0.922533193230629,
"num_tokens": 8430861.0,
"step": 1140
},
{
"entropy": 0.23646295368671416,
"epoch": 1.0407331975560081,
"grad_norm": 1.8671875,
"learning_rate": 1.519057494765113e-05,
"loss": 0.2363,
"mean_token_accuracy": 0.9258262030780315,
"num_tokens": 8502349.0,
"step": 1150
},
{
"entropy": 0.23925476390868425,
"epoch": 1.049785019235121,
"grad_norm": 1.90625,
"learning_rate": 1.5106806139985902e-05,
"loss": 0.2321,
"mean_token_accuracy": 0.9250497639179229,
"num_tokens": 8570295.0,
"step": 1160
},
{
"entropy": 0.23410421870648862,
"epoch": 1.058836840914234,
"grad_norm": 1.859375,
"learning_rate": 1.5022549709854064e-05,
"loss": 0.2314,
"mean_token_accuracy": 0.9263376846909523,
"num_tokens": 8642950.0,
"step": 1170
},
{
"entropy": 0.2557803673669696,
"epoch": 1.0678886625933468,
"grad_norm": 1.453125,
"learning_rate": 1.4937813702465706e-05,
"loss": 0.2403,
"mean_token_accuracy": 0.9217644087970257,
"num_tokens": 8717090.0,
"step": 1180
},
{
"entropy": 0.2509741667658091,
"epoch": 1.07694048427246,
"grad_norm": 1.8515625,
"learning_rate": 1.4852606208823268e-05,
"loss": 0.2604,
"mean_token_accuracy": 0.918582696467638,
"num_tokens": 8789212.0,
"step": 1190
},
{
"entropy": 0.24822968104854226,
"epoch": 1.0859923059515728,
"grad_norm": 1.6796875,
"learning_rate": 1.4766935364948968e-05,
"loss": 0.2467,
"mean_token_accuracy": 0.9217202328145504,
"num_tokens": 8864678.0,
"step": 1200
},
{
"epoch": 1.0859923059515728,
"eval_entropy": 0.25971108758449557,
"eval_loss": 0.2871861159801483,
"eval_mean_token_accuracy": 0.9092267100811005,
"eval_num_tokens": 8864678.0,
"eval_runtime": 9.3693,
"eval_samples_per_second": 53.366,
"eval_steps_per_second": 26.683,
"step": 1200
},
{
"entropy": 0.2364829015918076,
"epoch": 1.0950441276306857,
"grad_norm": 1.3515625,
"learning_rate": 1.4680809351107938e-05,
"loss": 0.2305,
"mean_token_accuracy": 0.9277058839797974,
"num_tokens": 8940159.0,
"step": 1210
},
{
"entropy": 0.2585814634338021,
"epoch": 1.1040959493097986,
"grad_norm": 1.4609375,
"learning_rate": 1.4594236391027136e-05,
"loss": 0.2741,
"mean_token_accuracy": 0.9160600006580353,
"num_tokens": 9015281.0,
"step": 1220
},
{
"entropy": 0.254960492067039,
"epoch": 1.1131477709889115,
"grad_norm": 2.03125,
"learning_rate": 1.4507224751110098e-05,
"loss": 0.2538,
"mean_token_accuracy": 0.9213059425354004,
"num_tokens": 9087327.0,
"step": 1230
},
{
"entropy": 0.24510137867182494,
"epoch": 1.1221995926680244,
"grad_norm": 2.0,
"learning_rate": 1.4419782739647622e-05,
"loss": 0.254,
"mean_token_accuracy": 0.9214532896876335,
"num_tokens": 9169314.0,
"step": 1240
},
{
"entropy": 0.24978135284036398,
"epoch": 1.1312514143471373,
"grad_norm": 1.5625,
"learning_rate": 1.4331918706024466e-05,
"loss": 0.2427,
"mean_token_accuracy": 0.9217463575303555,
"num_tokens": 9242147.0,
"step": 1250
},
{
"entropy": 0.24323475174605846,
"epoch": 1.1403032360262504,
"grad_norm": 1.46875,
"learning_rate": 1.4243641039922085e-05,
"loss": 0.2458,
"mean_token_accuracy": 0.9232472665607929,
"num_tokens": 9315385.0,
"step": 1260
},
{
"entropy": 0.2486905450001359,
"epoch": 1.1493550577053633,
"grad_norm": 1.296875,
"learning_rate": 1.4154958170517567e-05,
"loss": 0.2605,
"mean_token_accuracy": 0.9212081745266915,
"num_tokens": 9390697.0,
"step": 1270
},
{
"entropy": 0.2332840071991086,
"epoch": 1.1584068793844762,
"grad_norm": 1.609375,
"learning_rate": 1.4065878565678763e-05,
"loss": 0.2445,
"mean_token_accuracy": 0.9262549884617328,
"num_tokens": 9460002.0,
"step": 1280
},
{
"entropy": 0.23821364771574735,
"epoch": 1.167458701063589,
"grad_norm": 1.6875,
"learning_rate": 1.3976410731155731e-05,
"loss": 0.2413,
"mean_token_accuracy": 0.9262685626745224,
"num_tokens": 9527692.0,
"step": 1290
},
{
"entropy": 0.2529443813487887,
"epoch": 1.176510522742702,
"grad_norm": 1.6484375,
"learning_rate": 1.3886563209768574e-05,
"loss": 0.2557,
"mean_token_accuracy": 0.9197916373610496,
"num_tokens": 9603314.0,
"step": 1300
},
{
"entropy": 0.2601372007280588,
"epoch": 1.1855623444218149,
"grad_norm": 2.109375,
"learning_rate": 1.379634458059173e-05,
"loss": 0.2739,
"mean_token_accuracy": 0.917241058498621,
"num_tokens": 9677672.0,
"step": 1310
},
{
"entropy": 0.254310567304492,
"epoch": 1.1946141661009277,
"grad_norm": 2.25,
"learning_rate": 1.3705763458134789e-05,
"loss": 0.2536,
"mean_token_accuracy": 0.9209688879549504,
"num_tokens": 9754592.0,
"step": 1320
},
{
"entropy": 0.24703464321792126,
"epoch": 1.2036659877800409,
"grad_norm": 1.6171875,
"learning_rate": 1.3614828491519953e-05,
"loss": 0.2403,
"mean_token_accuracy": 0.9250944316387176,
"num_tokens": 9831409.0,
"step": 1330
},
{
"entropy": 0.24839757550507785,
"epoch": 1.2127178094591538,
"grad_norm": 1.3046875,
"learning_rate": 1.3523548363656174e-05,
"loss": 0.2503,
"mean_token_accuracy": 0.9210891291499138,
"num_tokens": 9907152.0,
"step": 1340
},
{
"entropy": 0.24397108815610408,
"epoch": 1.2217696311382666,
"grad_norm": 1.5390625,
"learning_rate": 1.343193179041005e-05,
"loss": 0.2461,
"mean_token_accuracy": 0.9231149226427078,
"num_tokens": 9980501.0,
"step": 1350
},
{
"entropy": 0.24823267050087452,
"epoch": 1.2308214528173795,
"grad_norm": 1.59375,
"learning_rate": 1.3339987519773623e-05,
"loss": 0.2627,
"mean_token_accuracy": 0.920201038569212,
"num_tokens": 10050393.0,
"step": 1360
},
{
"entropy": 0.2534356275573373,
"epoch": 1.2398732744964924,
"grad_norm": 1.5625,
"learning_rate": 1.3247724331029045e-05,
"loss": 0.2457,
"mean_token_accuracy": 0.9231408350169659,
"num_tokens": 10119599.0,
"step": 1370
},
{
"entropy": 0.25433759707957504,
"epoch": 1.2489250961756053,
"grad_norm": 1.546875,
"learning_rate": 1.3155151033910319e-05,
"loss": 0.2537,
"mean_token_accuracy": 0.9176562003791332,
"num_tokens": 10191402.0,
"step": 1380
},
{
"entropy": 0.2557266032323241,
"epoch": 1.2579769178547182,
"grad_norm": 1.5390625,
"learning_rate": 1.3062276467762085e-05,
"loss": 0.2606,
"mean_token_accuracy": 0.9184790156781674,
"num_tokens": 10267960.0,
"step": 1390
},
{
"entropy": 0.26457452643662693,
"epoch": 1.267028739533831,
"grad_norm": 1.453125,
"learning_rate": 1.29691095006956e-05,
"loss": 0.262,
"mean_token_accuracy": 0.9168847225606441,
"num_tokens": 10349616.0,
"step": 1400
},
{
"epoch": 1.267028739533831,
"eval_entropy": 0.2544795420765877,
"eval_loss": 0.28732097148895264,
"eval_mean_token_accuracy": 0.9092240772247314,
"eval_num_tokens": 10349616.0,
"eval_runtime": 9.3693,
"eval_samples_per_second": 53.366,
"eval_steps_per_second": 26.683,
"step": 1400
},
{
"entropy": 0.25217979392036793,
"epoch": 1.276080561212944,
"grad_norm": 1.5625,
"learning_rate": 1.2875659028741973e-05,
"loss": 0.2594,
"mean_token_accuracy": 0.9182466574013233,
"num_tokens": 10424645.0,
"step": 1410
},
{
"entropy": 0.2577937442809343,
"epoch": 1.2851323828920571,
"grad_norm": 2.046875,
"learning_rate": 1.2781933975002731e-05,
"loss": 0.2518,
"mean_token_accuracy": 0.9220298327505588,
"num_tokens": 10497343.0,
"step": 1420
},
{
"entropy": 0.26858933065086604,
"epoch": 1.29418420457117,
"grad_norm": 2.296875,
"learning_rate": 1.2687943288797784e-05,
"loss": 0.2842,
"mean_token_accuracy": 0.9142478197813034,
"num_tokens": 10566325.0,
"step": 1430
},
{
"entropy": 0.24459188301116228,
"epoch": 1.303236026250283,
"grad_norm": 1.765625,
"learning_rate": 1.2593695944810913e-05,
"loss": 0.2456,
"mean_token_accuracy": 0.9239174589514733,
"num_tokens": 10637542.0,
"step": 1440
},
{
"entropy": 0.24134990572929382,
"epoch": 1.3122878479293958,
"grad_norm": 1.5234375,
"learning_rate": 1.2499200942232827e-05,
"loss": 0.2344,
"mean_token_accuracy": 0.9229865886271,
"num_tokens": 10714625.0,
"step": 1450
},
{
"entropy": 0.2523044439032674,
"epoch": 1.3213396696085087,
"grad_norm": 1.828125,
"learning_rate": 1.2404467303901867e-05,
"loss": 0.2564,
"mean_token_accuracy": 0.9190959706902504,
"num_tokens": 10786171.0,
"step": 1460
},
{
"entropy": 0.2603725749999285,
"epoch": 1.3303914912876216,
"grad_norm": 1.625,
"learning_rate": 1.2309504075442462e-05,
"loss": 0.2692,
"mean_token_accuracy": 0.9169292628765107,
"num_tokens": 10859432.0,
"step": 1470
},
{
"entropy": 0.2403477132320404,
"epoch": 1.3394433129667345,
"grad_norm": 1.6328125,
"learning_rate": 1.2214320324401419e-05,
"loss": 0.2327,
"mean_token_accuracy": 0.9255956873297692,
"num_tokens": 10931450.0,
"step": 1480
},
{
"entropy": 0.25164176877588035,
"epoch": 1.3484951346458476,
"grad_norm": 1.421875,
"learning_rate": 1.2118925139382106e-05,
"loss": 0.2553,
"mean_token_accuracy": 0.9207783795893192,
"num_tokens": 11005579.0,
"step": 1490
},
{
"entropy": 0.2500301007181406,
"epoch": 1.3575469563249605,
"grad_norm": 1.4921875,
"learning_rate": 1.2023327629176613e-05,
"loss": 0.2567,
"mean_token_accuracy": 0.9211388893425465,
"num_tokens": 11081729.0,
"step": 1500
},
{
"entropy": 0.2503871817141771,
"epoch": 1.3665987780040734,
"grad_norm": 1.875,
"learning_rate": 1.1927536921896032e-05,
"loss": 0.2481,
"mean_token_accuracy": 0.9220583327114582,
"num_tokens": 11150400.0,
"step": 1510
},
{
"entropy": 0.24495826996862888,
"epoch": 1.3756505996831863,
"grad_norm": 2.328125,
"learning_rate": 1.1831562164098832e-05,
"loss": 0.2561,
"mean_token_accuracy": 0.9218058377504349,
"num_tokens": 11227357.0,
"step": 1520
},
{
"entropy": 0.22247098237276078,
"epoch": 1.3847024213622992,
"grad_norm": 1.65625,
"learning_rate": 1.1735412519917514e-05,
"loss": 0.2199,
"mean_token_accuracy": 0.9295127160847187,
"num_tokens": 11302017.0,
"step": 1530
},
{
"entropy": 0.25086742732673883,
"epoch": 1.393754243041412,
"grad_norm": 2.3125,
"learning_rate": 1.1639097170183578e-05,
"loss": 0.2555,
"mean_token_accuracy": 0.9186649046838283,
"num_tokens": 11375628.0,
"step": 1540
},
{
"entropy": 0.24195160605013372,
"epoch": 1.402806064720525,
"grad_norm": 2.34375,
"learning_rate": 1.1542625311550882e-05,
"loss": 0.2439,
"mean_token_accuracy": 0.9234603866934776,
"num_tokens": 11454478.0,
"step": 1550
},
{
"entropy": 0.24275779630988836,
"epoch": 1.411857886399638,
"grad_norm": 1.390625,
"learning_rate": 1.1446006155617518e-05,
"loss": 0.2388,
"mean_token_accuracy": 0.9236106254160404,
"num_tokens": 11529561.0,
"step": 1560
},
{
"entropy": 0.26370916329324245,
"epoch": 1.4209097080787507,
"grad_norm": 1.9609375,
"learning_rate": 1.1349248928046222e-05,
"loss": 0.2584,
"mean_token_accuracy": 0.9196252316236496,
"num_tokens": 11602833.0,
"step": 1570
},
{
"entropy": 0.24912301748991011,
"epoch": 1.4299615297578638,
"grad_norm": 1.4609375,
"learning_rate": 1.1252362867683482e-05,
"loss": 0.2488,
"mean_token_accuracy": 0.9228829652070999,
"num_tokens": 11684248.0,
"step": 1580
},
{
"entropy": 0.2414556547999382,
"epoch": 1.4390133514369767,
"grad_norm": 1.953125,
"learning_rate": 1.1155357225677367e-05,
"loss": 0.2412,
"mean_token_accuracy": 0.923858293145895,
"num_tokens": 11759520.0,
"step": 1590
},
{
"entropy": 0.25389058981090784,
"epoch": 1.4480651731160896,
"grad_norm": 2.546875,
"learning_rate": 1.1058241264594169e-05,
"loss": 0.2564,
"mean_token_accuracy": 0.9178573161363601,
"num_tokens": 11834379.0,
"step": 1600
},
{
"epoch": 1.4480651731160896,
"eval_entropy": 0.25336209374666213,
"eval_loss": 0.28672701120376587,
"eval_mean_token_accuracy": 0.9094241366386414,
"eval_num_tokens": 11834379.0,
"eval_runtime": 9.3897,
"eval_samples_per_second": 53.25,
"eval_steps_per_second": 26.625,
"step": 1600
},
{
"entropy": 0.24825670775026082,
"epoch": 1.4571169947952025,
"grad_norm": 1.640625,
"learning_rate": 1.0961024257533984e-05,
"loss": 0.2608,
"mean_token_accuracy": 0.9209218248724937,
"num_tokens": 11911969.0,
"step": 1610
},
{
"entropy": 0.2427467254921794,
"epoch": 1.4661688164743154,
"grad_norm": 1.96875,
"learning_rate": 1.0863715487245257e-05,
"loss": 0.2358,
"mean_token_accuracy": 0.9233093105256558,
"num_tokens": 11988285.0,
"step": 1620
},
{
"entropy": 0.24526806455105543,
"epoch": 1.4752206381534283,
"grad_norm": 1.4609375,
"learning_rate": 1.0766324245238435e-05,
"loss": 0.2462,
"mean_token_accuracy": 0.9207368507981301,
"num_tokens": 12069373.0,
"step": 1630
},
{
"entropy": 0.24581417106091977,
"epoch": 1.4842724598325412,
"grad_norm": 1.53125,
"learning_rate": 1.0668859830898764e-05,
"loss": 0.2436,
"mean_token_accuracy": 0.9223100118339062,
"num_tokens": 12142462.0,
"step": 1640
},
{
"entropy": 0.23745538275688888,
"epoch": 1.4933242815116543,
"grad_norm": 1.5,
"learning_rate": 1.0571331550598327e-05,
"loss": 0.2341,
"mean_token_accuracy": 0.9265984818339348,
"num_tokens": 12216014.0,
"step": 1650
},
{
"entropy": 0.2431069084443152,
"epoch": 1.502376103190767,
"grad_norm": 1.5703125,
"learning_rate": 1.0473748716807446e-05,
"loss": 0.2389,
"mean_token_accuracy": 0.9258911445736885,
"num_tokens": 12288450.0,
"step": 1660
},
{
"entropy": 0.23318624114617706,
"epoch": 1.51142792486988,
"grad_norm": 1.765625,
"learning_rate": 1.0376120647205475e-05,
"loss": 0.2375,
"mean_token_accuracy": 0.9254320353269577,
"num_tokens": 12357801.0,
"step": 1670
},
{
"entropy": 0.24539397489279507,
"epoch": 1.520479746548993,
"grad_norm": 1.5078125,
"learning_rate": 1.0278456663791087e-05,
"loss": 0.2476,
"mean_token_accuracy": 0.9217661775648593,
"num_tokens": 12431559.0,
"step": 1680
},
{
"entropy": 0.243877131305635,
"epoch": 1.5295315682281059,
"grad_norm": 1.375,
"learning_rate": 1.0180766091992196e-05,
"loss": 0.2419,
"mean_token_accuracy": 0.9216916620731354,
"num_tokens": 12507021.0,
"step": 1690
},
{
"entropy": 0.26184606496244667,
"epoch": 1.538583389907219,
"grad_norm": 1.5859375,
"learning_rate": 1.0083058259775496e-05,
"loss": 0.277,
"mean_token_accuracy": 0.9145717203617096,
"num_tokens": 12580160.0,
"step": 1700
},
{
"entropy": 0.2375843895599246,
"epoch": 1.5476352115863317,
"grad_norm": 1.625,
"learning_rate": 9.985342496755785e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.9251136861741542,
"num_tokens": 12656225.0,
"step": 1710
},
{
"entropy": 0.2448040470480919,
"epoch": 1.5566870332654448,
"grad_norm": 1.4453125,
"learning_rate": 9.887628133305139e-06,
"loss": 0.2513,
"mean_token_accuracy": 0.9209414727985858,
"num_tokens": 12733650.0,
"step": 1720
},
{
"entropy": 0.2508842507377267,
"epoch": 1.5657388549445574,
"grad_norm": 1.4765625,
"learning_rate": 9.78992449966199e-06,
"loss": 0.2574,
"mean_token_accuracy": 0.9201901033520699,
"num_tokens": 12807398.0,
"step": 1730
},
{
"entropy": 0.26807491648942233,
"epoch": 1.5747906766236706,
"grad_norm": 1.421875,
"learning_rate": 9.69224092504024e-06,
"loss": 0.2916,
"mean_token_accuracy": 0.9138583980500699,
"num_tokens": 12887795.0,
"step": 1740
},
{
"entropy": 0.24304722100496293,
"epoch": 1.5838424983027835,
"grad_norm": 1.515625,
"learning_rate": 9.594586736738463e-06,
"loss": 0.2368,
"mean_token_accuracy": 0.9235754661262036,
"num_tokens": 12955809.0,
"step": 1750
},
{
"entropy": 0.24434948712587357,
"epoch": 1.5928943199818963,
"grad_norm": 1.6171875,
"learning_rate": 9.496971259249275e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.9219508893787861,
"num_tokens": 13029267.0,
"step": 1760
},
{
"entropy": 0.2473485903814435,
"epoch": 1.6019461416610092,
"grad_norm": 1.7109375,
"learning_rate": 9.399403813369009e-06,
"loss": 0.2418,
"mean_token_accuracy": 0.9251487784087657,
"num_tokens": 13101356.0,
"step": 1770
},
{
"entropy": 0.23964370582252742,
"epoch": 1.6109979633401221,
"grad_norm": 1.6640625,
"learning_rate": 9.301893715307697e-06,
"loss": 0.2413,
"mean_token_accuracy": 0.9235708922147751,
"num_tokens": 13175685.0,
"step": 1780
},
{
"entropy": 0.23657145369797944,
"epoch": 1.6200497850192352,
"grad_norm": 1.734375,
"learning_rate": 9.204450275799533e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9240031912922859,
"num_tokens": 13245711.0,
"step": 1790
},
{
"entropy": 0.2463653065264225,
"epoch": 1.629101606698348,
"grad_norm": 1.8046875,
"learning_rate": 9.10708279921383e-06,
"loss": 0.2595,
"mean_token_accuracy": 0.9202878102660179,
"num_tokens": 13320025.0,
"step": 1800
},
{
"epoch": 1.629101606698348,
"eval_entropy": 0.25300506711006165,
"eval_loss": 0.2863253653049469,
"eval_mean_token_accuracy": 0.9098271555900573,
"eval_num_tokens": 13320025.0,
"eval_runtime": 9.4073,
"eval_samples_per_second": 53.15,
"eval_steps_per_second": 26.575,
"step": 1800
},
{
"entropy": 0.254715484008193,
"epoch": 1.638153428377461,
"grad_norm": 1.7109375,
"learning_rate": 9.009800582666592e-06,
"loss": 0.2788,
"mean_token_accuracy": 0.9186265878379345,
"num_tokens": 13401009.0,
"step": 1810
},
{
"entropy": 0.24843050315976142,
"epoch": 1.647205250056574,
"grad_norm": 1.7734375,
"learning_rate": 8.912612915132781e-06,
"loss": 0.2515,
"mean_token_accuracy": 0.922213239222765,
"num_tokens": 13475772.0,
"step": 1820
},
{
"entropy": 0.2353765547275543,
"epoch": 1.6562570717356868,
"grad_norm": 1.53125,
"learning_rate": 8.815529076559373e-06,
"loss": 0.2308,
"mean_token_accuracy": 0.9247016876935958,
"num_tokens": 13547929.0,
"step": 1830
},
{
"entropy": 0.2633091388270259,
"epoch": 1.6653088934147997,
"grad_norm": 1.8125,
"learning_rate": 8.718558336979247e-06,
"loss": 0.2669,
"mean_token_accuracy": 0.9184009425342083,
"num_tokens": 13622526.0,
"step": 1840
},
{
"entropy": 0.25728757921606304,
"epoch": 1.6743607150939126,
"grad_norm": 1.484375,
"learning_rate": 8.621709955626046e-06,
"loss": 0.2628,
"mean_token_accuracy": 0.9189327403903007,
"num_tokens": 13700063.0,
"step": 1850
},
{
"entropy": 0.24874310288578272,
"epoch": 1.6834125367730257,
"grad_norm": 1.46875,
"learning_rate": 8.524993180050058e-06,
"loss": 0.2633,
"mean_token_accuracy": 0.9198607362806797,
"num_tokens": 13776033.0,
"step": 1860
},
{
"entropy": 0.24581483229994774,
"epoch": 1.6924643584521384,
"grad_norm": 1.9453125,
"learning_rate": 8.428417245235224e-06,
"loss": 0.2483,
"mean_token_accuracy": 0.9217715479433537,
"num_tokens": 13842019.0,
"step": 1870
},
{
"entropy": 0.25347451251000164,
"epoch": 1.7015161801312515,
"grad_norm": 1.578125,
"learning_rate": 8.331991372717326e-06,
"loss": 0.2484,
"mean_token_accuracy": 0.9214810349047184,
"num_tokens": 13912940.0,
"step": 1880
},
{
"entropy": 0.23695338489487766,
"epoch": 1.7105680018103642,
"grad_norm": 1.4453125,
"learning_rate": 8.235724769703466e-06,
"loss": 0.2362,
"mean_token_accuracy": 0.9258873045444489,
"num_tokens": 13988716.0,
"step": 1890
},
{
"entropy": 0.2509100193157792,
"epoch": 1.7196198234894773,
"grad_norm": 1.7734375,
"learning_rate": 8.139626628192944e-06,
"loss": 0.2579,
"mean_token_accuracy": 0.9192474693059921,
"num_tokens": 14060707.0,
"step": 1900
},
{
"entropy": 0.25328262001276014,
"epoch": 1.7286716451685902,
"grad_norm": 2.234375,
"learning_rate": 8.04370612409953e-06,
"loss": 0.2518,
"mean_token_accuracy": 0.9190532967448235,
"num_tokens": 14141352.0,
"step": 1910
},
{
"entropy": 0.2612711830995977,
"epoch": 1.737723466847703,
"grad_norm": 1.828125,
"learning_rate": 7.947972416375316e-06,
"loss": 0.2639,
"mean_token_accuracy": 0.9177913695573807,
"num_tokens": 14220408.0,
"step": 1920
},
{
"entropy": 0.24010583832859994,
"epoch": 1.7467752885268162,
"grad_norm": 1.9609375,
"learning_rate": 7.852434646136191e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.9230381302535534,
"num_tokens": 14290387.0,
"step": 1930
},
{
"entropy": 0.2533594885841012,
"epoch": 1.7558271102059289,
"grad_norm": 1.703125,
"learning_rate": 7.757101935788973e-06,
"loss": 0.2524,
"mean_token_accuracy": 0.9200525276362896,
"num_tokens": 14362019.0,
"step": 1940
},
{
"entropy": 0.2543038886040449,
"epoch": 1.764878931885042,
"grad_norm": 1.7578125,
"learning_rate": 7.661983388160374e-06,
"loss": 0.2682,
"mean_token_accuracy": 0.9186428181827069,
"num_tokens": 14434869.0,
"step": 1950
},
{
"entropy": 0.25175454616546633,
"epoch": 1.7739307535641546,
"grad_norm": 1.8125,
"learning_rate": 7.567088085627834e-06,
"loss": 0.24,
"mean_token_accuracy": 0.9222409397363662,
"num_tokens": 14514468.0,
"step": 1960
},
{
"entropy": 0.24533636067062617,
"epoch": 1.7829825752432678,
"grad_norm": 1.515625,
"learning_rate": 7.4724250892522545e-06,
"loss": 0.2563,
"mean_token_accuracy": 0.9214011885225772,
"num_tokens": 14582295.0,
"step": 1970
},
{
"entropy": 0.24225753750652074,
"epoch": 1.7920343969223806,
"grad_norm": 1.390625,
"learning_rate": 7.3780034379128305e-06,
"loss": 0.2397,
"mean_token_accuracy": 0.9227394610643387,
"num_tokens": 14658863.0,
"step": 1980
},
{
"entropy": 0.26012470331043,
"epoch": 1.8010862186014935,
"grad_norm": 2.015625,
"learning_rate": 7.283832147443985e-06,
"loss": 0.2671,
"mean_token_accuracy": 0.9166965551674366,
"num_tokens": 14734663.0,
"step": 1990
},
{
"entropy": 0.2509179048240185,
"epoch": 1.8101380402806064,
"grad_norm": 1.5546875,
"learning_rate": 7.1899202097744595e-06,
"loss": 0.2546,
"mean_token_accuracy": 0.9195629328489303,
"num_tokens": 14807350.0,
"step": 2000
},
{
"epoch": 1.8101380402806064,
"eval_entropy": 0.2543384581208229,
"eval_loss": 0.2856670618057251,
"eval_mean_token_accuracy": 0.9101451654434204,
"eval_num_tokens": 14807350.0,
"eval_runtime": 9.3922,
"eval_samples_per_second": 53.236,
"eval_steps_per_second": 26.618,
"step": 2000
},
{
"entropy": 0.24845433719456195,
"epoch": 1.8191898619597193,
"grad_norm": 1.53125,
"learning_rate": 7.0962765920687434e-06,
"loss": 0.2626,
"mean_token_accuracy": 0.9198015905916691,
"num_tokens": 14881559.0,
"step": 2010
},
{
"entropy": 0.23361015133559704,
"epoch": 1.8282416836388324,
"grad_norm": 1.6015625,
"learning_rate": 7.002910235870851e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.9258812077343463,
"num_tokens": 14951806.0,
"step": 2020
},
{
"entropy": 0.2408413586206734,
"epoch": 1.837293505317945,
"grad_norm": 1.328125,
"learning_rate": 6.909830056250527e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.9247417151927948,
"num_tokens": 15022622.0,
"step": 2030
},
{
"entropy": 0.2619917577132583,
"epoch": 1.8463453269970582,
"grad_norm": 1.609375,
"learning_rate": 6.817044940951992e-06,
"loss": 0.2762,
"mean_token_accuracy": 0.9160731554031372,
"num_tokens": 15095864.0,
"step": 2040
},
{
"entropy": 0.2494908979162574,
"epoch": 1.8553971486761711,
"grad_norm": 1.515625,
"learning_rate": 6.7245637495453135e-06,
"loss": 0.2413,
"mean_token_accuracy": 0.9239408574998379,
"num_tokens": 15166717.0,
"step": 2050
},
{
"entropy": 0.2488134613260627,
"epoch": 1.864448970355284,
"grad_norm": 2.203125,
"learning_rate": 6.632395312580428e-06,
"loss": 0.2513,
"mean_token_accuracy": 0.9206522315740585,
"num_tokens": 15240239.0,
"step": 2060
},
{
"entropy": 0.255818460509181,
"epoch": 1.873500792034397,
"grad_norm": 1.484375,
"learning_rate": 6.540548430743981e-06,
"loss": 0.2857,
"mean_token_accuracy": 0.9164819419384003,
"num_tokens": 15323647.0,
"step": 2070
},
{
"entropy": 0.23680712506175042,
"epoch": 1.8825526137135098,
"grad_norm": 1.7421875,
"learning_rate": 6.449031874018978e-06,
"loss": 0.2374,
"mean_token_accuracy": 0.926248911768198,
"num_tokens": 15395170.0,
"step": 2080
},
{
"entropy": 0.2399167947471142,
"epoch": 1.891604435392623,
"grad_norm": 1.75,
"learning_rate": 6.357854380847397e-06,
"loss": 0.2409,
"mean_token_accuracy": 0.922540470957756,
"num_tokens": 15466907.0,
"step": 2090
},
{
"entropy": 0.25149166863411665,
"epoch": 1.9006562570717356,
"grad_norm": 1.953125,
"learning_rate": 6.267024657295784e-06,
"loss": 0.2517,
"mean_token_accuracy": 0.9214697033166885,
"num_tokens": 15544889.0,
"step": 2100
},
{
"entropy": 0.24887168370187282,
"epoch": 1.9097080787508487,
"grad_norm": 1.5546875,
"learning_rate": 6.176551376223972e-06,
"loss": 0.2539,
"mean_token_accuracy": 0.9207740597426891,
"num_tokens": 15619482.0,
"step": 2110
},
{
"entropy": 0.256895999237895,
"epoch": 1.9187599004299616,
"grad_norm": 1.515625,
"learning_rate": 6.086443176456951e-06,
"loss": 0.2654,
"mean_token_accuracy": 0.9176653556525707,
"num_tokens": 15693593.0,
"step": 2120
},
{
"entropy": 0.24922299664467573,
"epoch": 1.9278117221090745,
"grad_norm": 1.875,
"learning_rate": 5.996708661959979e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.9216303117573261,
"num_tokens": 15767913.0,
"step": 2130
},
{
"entropy": 0.22873957753181456,
"epoch": 1.9368635437881874,
"grad_norm": 1.5078125,
"learning_rate": 5.907356401017046e-06,
"loss": 0.2189,
"mean_token_accuracy": 0.9284776009619236,
"num_tokens": 15841332.0,
"step": 2140
},
{
"entropy": 0.2649971529841423,
"epoch": 1.9459153654673003,
"grad_norm": 1.5078125,
"learning_rate": 5.818394925412738e-06,
"loss": 0.2855,
"mean_token_accuracy": 0.9157008796930313,
"num_tokens": 15914905.0,
"step": 2150
},
{
"entropy": 0.24455339908599855,
"epoch": 1.9549671871464134,
"grad_norm": 1.6484375,
"learning_rate": 5.729832729617567e-06,
"loss": 0.2443,
"mean_token_accuracy": 0.9228208027780056,
"num_tokens": 15990500.0,
"step": 2160
},
{
"entropy": 0.24984920993447304,
"epoch": 1.964019008825526,
"grad_norm": 2.390625,
"learning_rate": 5.641678269976879e-06,
"loss": 0.2478,
"mean_token_accuracy": 0.9218893505632877,
"num_tokens": 16062684.0,
"step": 2170
},
{
"entropy": 0.24794082455337046,
"epoch": 1.9730708305046392,
"grad_norm": 1.546875,
"learning_rate": 5.5539399639034145e-06,
"loss": 0.2507,
"mean_token_accuracy": 0.9215313449501992,
"num_tokens": 16133340.0,
"step": 2180
},
{
"entropy": 0.24754474610090255,
"epoch": 1.9821226521837518,
"grad_norm": 1.890625,
"learning_rate": 5.466626189073563e-06,
"loss": 0.2466,
"mean_token_accuracy": 0.9224697306752205,
"num_tokens": 16207210.0,
"step": 2190
},
{
"entropy": 0.2443618050776422,
"epoch": 1.991174473862865,
"grad_norm": 1.5234375,
"learning_rate": 5.3797452826274245e-06,
"loss": 0.2462,
"mean_token_accuracy": 0.921653438359499,
"num_tokens": 16282613.0,
"step": 2200
},
{
"epoch": 1.991174473862865,
"eval_entropy": 0.25493055218458177,
"eval_loss": 0.28528815507888794,
"eval_mean_token_accuracy": 0.9100957527160645,
"eval_num_tokens": 16282613.0,
"eval_runtime": 9.3856,
"eval_samples_per_second": 53.273,
"eval_steps_per_second": 26.636,
"step": 2200
},
{
"entropy": 0.2419594947535258,
"epoch": 2.0,
"grad_norm": 1.640625,
"learning_rate": 5.293305540372744e-06,
"loss": 0.2497,
"mean_token_accuracy": 0.9222977512922043,
"num_tokens": 16353095.0,
"step": 2210
},
{
"entropy": 0.2307557038962841,
"epoch": 2.009051821679113,
"grad_norm": 1.4296875,
"learning_rate": 5.2073152159927674e-06,
"loss": 0.2129,
"mean_token_accuracy": 0.9317504949867725,
"num_tokens": 16430491.0,
"step": 2220
},
{
"entropy": 0.2438331985846162,
"epoch": 2.018103643358226,
"grad_norm": 1.3125,
"learning_rate": 5.121782520258171e-06,
"loss": 0.249,
"mean_token_accuracy": 0.9242358595132828,
"num_tokens": 16508058.0,
"step": 2230
},
{
"entropy": 0.21921782195568085,
"epoch": 2.027155465037339,
"grad_norm": 1.328125,
"learning_rate": 5.036715620243039e-06,
"loss": 0.2053,
"mean_token_accuracy": 0.9350384041666985,
"num_tokens": 16575440.0,
"step": 2240
},
{
"entropy": 0.23980374177917838,
"epoch": 2.0362072867164516,
"grad_norm": 1.4609375,
"learning_rate": 4.952122638545035e-06,
"loss": 0.2313,
"mean_token_accuracy": 0.927023620903492,
"num_tokens": 16652627.0,
"step": 2250
},
{
"entropy": 0.22007083408534528,
"epoch": 2.0452591083955647,
"grad_norm": 1.3671875,
"learning_rate": 4.8680116525098056e-06,
"loss": 0.2198,
"mean_token_accuracy": 0.9333253562450409,
"num_tokens": 16718816.0,
"step": 2260
},
{
"entropy": 0.24282212648540735,
"epoch": 2.0543109300746774,
"grad_norm": 1.5,
"learning_rate": 4.784390693459753e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9239346958696842,
"num_tokens": 16789759.0,
"step": 2270
},
{
"entropy": 0.23785702120512725,
"epoch": 2.0633627517537905,
"grad_norm": 1.453125,
"learning_rate": 4.701267745927113e-06,
"loss": 0.2506,
"mean_token_accuracy": 0.9241972677409649,
"num_tokens": 16867039.0,
"step": 2280
},
{
"entropy": 0.24605063777416944,
"epoch": 2.0724145734329036,
"grad_norm": 1.46875,
"learning_rate": 4.618650746891599e-06,
"loss": 0.2507,
"mean_token_accuracy": 0.9215324610471726,
"num_tokens": 16941361.0,
"step": 2290
},
{
"entropy": 0.22479961309581994,
"epoch": 2.0814663951120163,
"grad_norm": 1.359375,
"learning_rate": 4.536547585022518e-06,
"loss": 0.2206,
"mean_token_accuracy": 0.9283983618021011,
"num_tokens": 17012141.0,
"step": 2300
},
{
"entropy": 0.24391699638217687,
"epoch": 2.0905182167911294,
"grad_norm": 2.15625,
"learning_rate": 4.454966099925531e-06,
"loss": 0.25,
"mean_token_accuracy": 0.9209645003080368,
"num_tokens": 17087024.0,
"step": 2310
},
{
"entropy": 0.22605140786617994,
"epoch": 2.099570038470242,
"grad_norm": 1.6328125,
"learning_rate": 4.3739140813940765e-06,
"loss": 0.2181,
"mean_token_accuracy": 0.9297716915607452,
"num_tokens": 17158091.0,
"step": 2320
},
{
"entropy": 0.23818891448900104,
"epoch": 2.108621860149355,
"grad_norm": 1.5078125,
"learning_rate": 4.293399268665581e-06,
"loss": 0.2398,
"mean_token_accuracy": 0.9242137163877487,
"num_tokens": 17229953.0,
"step": 2330
},
{
"entropy": 0.2529011068865657,
"epoch": 2.117673681828468,
"grad_norm": 1.4140625,
"learning_rate": 4.21342934968247e-06,
"loss": 0.2577,
"mean_token_accuracy": 0.9201614983379841,
"num_tokens": 17301299.0,
"step": 2340
},
{
"entropy": 0.2255427474156022,
"epoch": 2.126725503507581,
"grad_norm": 1.7109375,
"learning_rate": 4.134011960358094e-06,
"loss": 0.2255,
"mean_token_accuracy": 0.9295220628380776,
"num_tokens": 17377435.0,
"step": 2350
},
{
"entropy": 0.22926049511879681,
"epoch": 2.1357773251866936,
"grad_norm": 2.09375,
"learning_rate": 4.055154683847588e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.9286687098443508,
"num_tokens": 17449350.0,
"step": 2360
},
{
"entropy": 0.22778470665216446,
"epoch": 2.1448291468658067,
"grad_norm": 1.4765625,
"learning_rate": 3.976865049823845e-06,
"loss": 0.2281,
"mean_token_accuracy": 0.9293324284255504,
"num_tokens": 17525880.0,
"step": 2370
},
{
"entropy": 0.23943039821460843,
"epoch": 2.15388096854492,
"grad_norm": 1.390625,
"learning_rate": 3.899150533758489e-06,
"loss": 0.2509,
"mean_token_accuracy": 0.9246469952166081,
"num_tokens": 17602733.0,
"step": 2380
},
{
"entropy": 0.23977632280439137,
"epoch": 2.1629327902240325,
"grad_norm": 1.3515625,
"learning_rate": 3.822018556208128e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9248318992555141,
"num_tokens": 17676158.0,
"step": 2390
},
{
"entropy": 0.2481432169675827,
"epoch": 2.1719846119031456,
"grad_norm": 1.7890625,
"learning_rate": 3.7454764821057754e-06,
"loss": 0.2396,
"mean_token_accuracy": 0.9235868014395237,
"num_tokens": 17750465.0,
"step": 2400
},
{
"epoch": 2.1719846119031456,
"eval_entropy": 0.24248164582252502,
"eval_loss": 0.2894599735736847,
"eval_mean_token_accuracy": 0.9097159428596496,
"eval_num_tokens": 17750465.0,
"eval_runtime": 9.4339,
"eval_samples_per_second": 53.0,
"eval_steps_per_second": 26.5,
"step": 2400
},
{
"entropy": 0.23743325080722572,
"epoch": 2.1810364335822583,
"grad_norm": 1.484375,
"learning_rate": 3.669531620057628e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.9258677743375301,
"num_tokens": 17824288.0,
"step": 2410
},
{
"entropy": 0.23670416846871375,
"epoch": 2.1900882552613714,
"grad_norm": 1.4609375,
"learning_rate": 3.5941912216451812e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.9266407683491706,
"num_tokens": 17902584.0,
"step": 2420
},
{
"entropy": 0.235261020809412,
"epoch": 2.199140076940484,
"grad_norm": 1.515625,
"learning_rate": 3.5194624807328514e-06,
"loss": 0.2381,
"mean_token_accuracy": 0.9261479564011097,
"num_tokens": 17982974.0,
"step": 2430
},
{
"entropy": 0.23788492735475303,
"epoch": 2.208191898619597,
"grad_norm": 1.4296875,
"learning_rate": 3.4453525327810277e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9252388395369053,
"num_tokens": 18052254.0,
"step": 2440
},
{
"entropy": 0.21874225055798888,
"epoch": 2.2172437202987103,
"grad_norm": 1.578125,
"learning_rate": 3.371868454164775e-06,
"loss": 0.2098,
"mean_token_accuracy": 0.9329733081161976,
"num_tokens": 18120065.0,
"step": 2450
},
{
"entropy": 0.24790667220950127,
"epoch": 2.226295541977823,
"grad_norm": 1.890625,
"learning_rate": 3.299017261498136e-06,
"loss": 0.2539,
"mean_token_accuracy": 0.9227528609335423,
"num_tokens": 18193562.0,
"step": 2460
},
{
"entropy": 0.24773352714255453,
"epoch": 2.235347363656936,
"grad_norm": 1.5625,
"learning_rate": 3.226805910964156e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.9219558417797089,
"num_tokens": 18267919.0,
"step": 2470
},
{
"entropy": 0.22798624727874994,
"epoch": 2.2443991853360488,
"grad_norm": 1.375,
"learning_rate": 3.1552412976506565e-06,
"loss": 0.2228,
"mean_token_accuracy": 0.9276571467518806,
"num_tokens": 18342460.0,
"step": 2480
},
{
"entropy": 0.21947569595649838,
"epoch": 2.253451007015162,
"grad_norm": 1.5234375,
"learning_rate": 3.084330254891883e-06,
"loss": 0.2231,
"mean_token_accuracy": 0.9298080869019032,
"num_tokens": 18412694.0,
"step": 2490
},
{
"entropy": 0.2307950984686613,
"epoch": 2.2625028286942745,
"grad_norm": 1.6796875,
"learning_rate": 3.0140795536160127e-06,
"loss": 0.227,
"mean_token_accuracy": 0.9273362122476101,
"num_tokens": 18482616.0,
"step": 2500
},
{
"entropy": 0.23491790611296892,
"epoch": 2.2715546503733877,
"grad_norm": 1.4296875,
"learning_rate": 2.944495901698631e-06,
"loss": 0.2394,
"mean_token_accuracy": 0.9233520910143852,
"num_tokens": 18554093.0,
"step": 2510
},
{
"entropy": 0.21908750645816327,
"epoch": 2.2806064720525008,
"grad_norm": 1.2109375,
"learning_rate": 2.8755859433222422e-06,
"loss": 0.2128,
"mean_token_accuracy": 0.9334690175950527,
"num_tokens": 18628041.0,
"step": 2520
},
{
"entropy": 0.24107547104358673,
"epoch": 2.2896582937316134,
"grad_norm": 1.6015625,
"learning_rate": 2.8073562583418336e-06,
"loss": 0.2431,
"mean_token_accuracy": 0.9256007336080074,
"num_tokens": 18703682.0,
"step": 2530
},
{
"entropy": 0.23574529979377984,
"epoch": 2.2987101154107266,
"grad_norm": 1.71875,
"learning_rate": 2.739813361656616e-06,
"loss": 0.2287,
"mean_token_accuracy": 0.9242596134543419,
"num_tokens": 18779322.0,
"step": 2540
},
{
"entropy": 0.2339877954684198,
"epoch": 2.3077619370898392,
"grad_norm": 1.7265625,
"learning_rate": 2.672963702587943e-06,
"loss": 0.2377,
"mean_token_accuracy": 0.9261183701455593,
"num_tokens": 18858427.0,
"step": 2550
},
{
"entropy": 0.23651442099362613,
"epoch": 2.3168137587689523,
"grad_norm": 1.9609375,
"learning_rate": 2.6068136642635024e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.9261867627501488,
"num_tokens": 18932723.0,
"step": 2560
},
{
"entropy": 0.23850015196949242,
"epoch": 2.325865580448065,
"grad_norm": 1.890625,
"learning_rate": 2.541369563007806e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.9260534539818763,
"num_tokens": 19010914.0,
"step": 2570
},
{
"entropy": 0.23194469464942813,
"epoch": 2.334917402127178,
"grad_norm": 1.671875,
"learning_rate": 2.476637647739115e-06,
"loss": 0.2381,
"mean_token_accuracy": 0.926636103540659,
"num_tokens": 19085371.0,
"step": 2580
},
{
"entropy": 0.22836068961769343,
"epoch": 2.3439692238062912,
"grad_norm": 1.7265625,
"learning_rate": 2.412624099372719e-06,
"loss": 0.2321,
"mean_token_accuracy": 0.9283419884741306,
"num_tokens": 19151951.0,
"step": 2590
},
{
"entropy": 0.2302293201908469,
"epoch": 2.353021045485404,
"grad_norm": 1.421875,
"learning_rate": 2.349335030230785e-06,
"loss": 0.2302,
"mean_token_accuracy": 0.9274107903242111,
"num_tokens": 19227125.0,
"step": 2600
},
{
"epoch": 2.353021045485404,
"eval_entropy": 0.2420771769285202,
"eval_loss": 0.2899412512779236,
"eval_mean_token_accuracy": 0.9096628496646881,
"eval_num_tokens": 19227125.0,
"eval_runtime": 9.3687,
"eval_samples_per_second": 53.369,
"eval_steps_per_second": 26.685,
"step": 2600
},
{
"entropy": 0.234050558693707,
"epoch": 2.362072867164517,
"grad_norm": 1.5703125,
"learning_rate": 2.2867764834587003e-06,
"loss": 0.2332,
"mean_token_accuracy": 0.9264360308647156,
"num_tokens": 19301670.0,
"step": 2610
},
{
"entropy": 0.23359497357159853,
"epoch": 2.3711246888436297,
"grad_norm": 1.6171875,
"learning_rate": 2.224954432448071e-06,
"loss": 0.2359,
"mean_token_accuracy": 0.9275781489908695,
"num_tokens": 19373066.0,
"step": 2620
},
{
"entropy": 0.24756205026060343,
"epoch": 2.380176510522743,
"grad_norm": 1.6015625,
"learning_rate": 2.163874780266323e-06,
"loss": 0.259,
"mean_token_accuracy": 0.9211221620440483,
"num_tokens": 19453654.0,
"step": 2630
},
{
"entropy": 0.2272123709321022,
"epoch": 2.3892283322018555,
"grad_norm": 2.0625,
"learning_rate": 2.103543359093071e-06,
"loss": 0.2223,
"mean_token_accuracy": 0.9283259101212025,
"num_tokens": 19527264.0,
"step": 2640
},
{
"entropy": 0.22003255859017373,
"epoch": 2.3982801538809686,
"grad_norm": 1.375,
"learning_rate": 2.043965929663224e-06,
"loss": 0.2235,
"mean_token_accuracy": 0.9287537440657616,
"num_tokens": 19604492.0,
"step": 2650
},
{
"entropy": 0.22481423607096077,
"epoch": 2.4073319755600817,
"grad_norm": 1.6796875,
"learning_rate": 1.985148180716928e-06,
"loss": 0.2314,
"mean_token_accuracy": 0.9280698530375957,
"num_tokens": 19679416.0,
"step": 2660
},
{
"entropy": 0.22615873701870443,
"epoch": 2.4163837972391944,
"grad_norm": 1.390625,
"learning_rate": 1.927095728456364e-06,
"loss": 0.2222,
"mean_token_accuracy": 0.9290170624852181,
"num_tokens": 19753645.0,
"step": 2670
},
{
"entropy": 0.2225392703898251,
"epoch": 2.4254356189183075,
"grad_norm": 1.75,
"learning_rate": 1.8698141160095162e-06,
"loss": 0.2241,
"mean_token_accuracy": 0.9286721229553223,
"num_tokens": 19828790.0,
"step": 2680
},
{
"entropy": 0.2204814150929451,
"epoch": 2.43448744059742,
"grad_norm": 1.3515625,
"learning_rate": 1.8133088129008459e-06,
"loss": 0.2182,
"mean_token_accuracy": 0.9296285167336464,
"num_tokens": 19896900.0,
"step": 2690
},
{
"entropy": 0.22337221689522266,
"epoch": 2.4435392622765333,
"grad_norm": 1.5703125,
"learning_rate": 1.7575852145290717e-06,
"loss": 0.2232,
"mean_token_accuracy": 0.928014337271452,
"num_tokens": 19968242.0,
"step": 2700
},
{
"entropy": 0.2179032789543271,
"epoch": 2.452591083955646,
"grad_norm": 1.5234375,
"learning_rate": 1.7026486416519682e-06,
"loss": 0.211,
"mean_token_accuracy": 0.9319535449147225,
"num_tokens": 20041305.0,
"step": 2710
},
{
"entropy": 0.2261628670617938,
"epoch": 2.461642905634759,
"grad_norm": 1.5390625,
"learning_rate": 1.6485043398783295e-06,
"loss": 0.235,
"mean_token_accuracy": 0.9279113605618476,
"num_tokens": 20116751.0,
"step": 2720
},
{
"entropy": 0.24026810871437193,
"epoch": 2.4706947273138717,
"grad_norm": 1.4921875,
"learning_rate": 1.5951574791670754e-06,
"loss": 0.2505,
"mean_token_accuracy": 0.9243397124111652,
"num_tokens": 20193209.0,
"step": 2730
},
{
"entropy": 0.22616808880120515,
"epoch": 2.479746548992985,
"grad_norm": 1.4375,
"learning_rate": 1.5426131533336164e-06,
"loss": 0.2227,
"mean_token_accuracy": 0.9292748935520649,
"num_tokens": 20267532.0,
"step": 2740
},
{
"entropy": 0.22781651541590692,
"epoch": 2.4887983706720975,
"grad_norm": 1.4140625,
"learning_rate": 1.490876379563464e-06,
"loss": 0.2277,
"mean_token_accuracy": 0.928032499551773,
"num_tokens": 20341347.0,
"step": 2750
},
{
"entropy": 0.22965652998536826,
"epoch": 2.4978501923512106,
"grad_norm": 1.3203125,
"learning_rate": 1.4399520979331639e-06,
"loss": 0.217,
"mean_token_accuracy": 0.9308909751474858,
"num_tokens": 20418634.0,
"step": 2760
},
{
"entropy": 0.23352561388164758,
"epoch": 2.5069020140303238,
"grad_norm": 1.6484375,
"learning_rate": 1.3898451709385995e-06,
"loss": 0.2371,
"mean_token_accuracy": 0.9257538385689259,
"num_tokens": 20492992.0,
"step": 2770
},
{
"entropy": 0.22655329555273057,
"epoch": 2.5159538357094364,
"grad_norm": 1.546875,
"learning_rate": 1.3405603830306868e-06,
"loss": 0.228,
"mean_token_accuracy": 0.9283955104649066,
"num_tokens": 20567899.0,
"step": 2780
},
{
"entropy": 0.23698492981493474,
"epoch": 2.5250056573885495,
"grad_norm": 1.2890625,
"learning_rate": 1.2921024401585436e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.9233708687126636,
"num_tokens": 20643853.0,
"step": 2790
},
{
"entropy": 0.23318157717585564,
"epoch": 2.534057479067662,
"grad_norm": 1.3515625,
"learning_rate": 1.2444759693201391e-06,
"loss": 0.2374,
"mean_token_accuracy": 0.9264273457229137,
"num_tokens": 20719127.0,
"step": 2800
},
{
"epoch": 2.534057479067662,
"eval_entropy": 0.2420763995051384,
"eval_loss": 0.29005327820777893,
"eval_mean_token_accuracy": 0.9095665421485901,
"eval_num_tokens": 20719127.0,
"eval_runtime": 9.3994,
"eval_samples_per_second": 53.195,
"eval_steps_per_second": 26.598,
"step": 2800
},
{
"entropy": 0.24478118922561407,
"epoch": 2.5431093007467753,
"grad_norm": 1.5703125,
"learning_rate": 1.197685518120485e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.9210445381700992,
"num_tokens": 20791473.0,
"step": 2810
},
{
"entropy": 0.23202836168929936,
"epoch": 2.552161122425888,
"grad_norm": 1.3828125,
"learning_rate": 1.1517355543373988e-06,
"loss": 0.2284,
"mean_token_accuracy": 0.9267525814473629,
"num_tokens": 20867883.0,
"step": 2820
},
{
"entropy": 0.22680169045925141,
"epoch": 2.561212944105001,
"grad_norm": 1.4375,
"learning_rate": 1.1066304654949245e-06,
"loss": 0.2154,
"mean_token_accuracy": 0.9293528974056244,
"num_tokens": 20939137.0,
"step": 2830
},
{
"entropy": 0.23053370881825686,
"epoch": 2.5702647657841142,
"grad_norm": 1.6328125,
"learning_rate": 1.062374558444358e-06,
"loss": 0.2331,
"mean_token_accuracy": 0.9270932763814926,
"num_tokens": 21019630.0,
"step": 2840
},
{
"entropy": 0.23330926056951284,
"epoch": 2.579316587463227,
"grad_norm": 1.421875,
"learning_rate": 1.0189720589530372e-06,
"loss": 0.2369,
"mean_token_accuracy": 0.9256651438772678,
"num_tokens": 21097041.0,
"step": 2850
},
{
"entropy": 0.22963873716071248,
"epoch": 2.58836840914234,
"grad_norm": 1.4296875,
"learning_rate": 9.764271113008183e-07,
"loss": 0.2222,
"mean_token_accuracy": 0.9277059838175774,
"num_tokens": 21174887.0,
"step": 2860
},
{
"entropy": 0.23152102306485176,
"epoch": 2.5974202308214527,
"grad_norm": 1.6484375,
"learning_rate": 9.347437778843938e-07,
"loss": 0.2234,
"mean_token_accuracy": 0.9289267487823963,
"num_tokens": 21251344.0,
"step": 2870
},
{
"entropy": 0.24004797209054232,
"epoch": 2.606472052500566,
"grad_norm": 1.578125,
"learning_rate": 8.939260388293569e-07,
"loss": 0.2478,
"mean_token_accuracy": 0.9238497324287891,
"num_tokens": 21324495.0,
"step": 2880
},
{
"entropy": 0.22599442386999727,
"epoch": 2.6155238741796785,
"grad_norm": 1.515625,
"learning_rate": 8.539777916101888e-07,
"loss": 0.2228,
"mean_token_accuracy": 0.9293952472507954,
"num_tokens": 21398424.0,
"step": 2890
},
{
"entropy": 0.23568667601794005,
"epoch": 2.6245756958587916,
"grad_norm": 1.34375,
"learning_rate": 8.149028506780964e-07,
"loss": 0.2372,
"mean_token_accuracy": 0.9261699497699738,
"num_tokens": 21471703.0,
"step": 2900
},
{
"entropy": 0.23762993402779103,
"epoch": 2.6336275175379047,
"grad_norm": 1.5546875,
"learning_rate": 7.767049470967946e-07,
"loss": 0.2344,
"mean_token_accuracy": 0.9264717750251293,
"num_tokens": 21547014.0,
"step": 2910
},
{
"entropy": 0.2214735448360443,
"epoch": 2.6426793392170174,
"grad_norm": 1.53125,
"learning_rate": 7.393877281862394e-07,
"loss": 0.2169,
"mean_token_accuracy": 0.9300684794783592,
"num_tokens": 21620989.0,
"step": 2920
},
{
"entropy": 0.22688788436353208,
"epoch": 2.6517311608961305,
"grad_norm": 1.5625,
"learning_rate": 7.029547571743778e-07,
"loss": 0.2239,
"mean_token_accuracy": 0.929075525701046,
"num_tokens": 21696245.0,
"step": 2930
},
{
"entropy": 0.23520100452005863,
"epoch": 2.660782982575243,
"grad_norm": 1.4921875,
"learning_rate": 6.674095128568958e-07,
"loss": 0.2391,
"mean_token_accuracy": 0.9241717301309109,
"num_tokens": 21780059.0,
"step": 2940
},
{
"entropy": 0.2333427995443344,
"epoch": 2.6698348042543563,
"grad_norm": 1.625,
"learning_rate": 6.327553892650606e-07,
"loss": 0.2408,
"mean_token_accuracy": 0.9252305686473846,
"num_tokens": 21851491.0,
"step": 2950
},
{
"entropy": 0.24906855598092079,
"epoch": 2.678886625933469,
"grad_norm": 1.828125,
"learning_rate": 5.989956953416376e-07,
"loss": 0.2423,
"mean_token_accuracy": 0.9250664070248604,
"num_tokens": 21927668.0,
"step": 2960
},
{
"entropy": 0.22028352571651338,
"epoch": 2.687938447612582,
"grad_norm": 1.8515625,
"learning_rate": 5.661336546249352e-07,
"loss": 0.2192,
"mean_token_accuracy": 0.929874736070633,
"num_tokens": 21996776.0,
"step": 2970
},
{
"entropy": 0.2297042902559042,
"epoch": 2.696990269291695,
"grad_norm": 1.5703125,
"learning_rate": 5.341724049410024e-07,
"loss": 0.2327,
"mean_token_accuracy": 0.9275284387171269,
"num_tokens": 22071876.0,
"step": 2980
},
{
"entropy": 0.2260682400316,
"epoch": 2.706042090970808,
"grad_norm": 1.453125,
"learning_rate": 5.031149981040262e-07,
"loss": 0.2328,
"mean_token_accuracy": 0.9273493871092796,
"num_tokens": 22143958.0,
"step": 2990
},
{
"entropy": 0.2300025401636958,
"epoch": 2.715093912649921,
"grad_norm": 1.859375,
"learning_rate": 4.729643996249156e-07,
"loss": 0.2222,
"mean_token_accuracy": 0.9290709294378757,
"num_tokens": 22219617.0,
"step": 3000
},
{
"epoch": 2.715093912649921,
"eval_entropy": 0.24206615540385246,
"eval_loss": 0.2901514768600464,
"eval_mean_token_accuracy": 0.9095830063819885,
"eval_num_tokens": 22219617.0,
"eval_runtime": 9.4159,
"eval_samples_per_second": 53.102,
"eval_steps_per_second": 26.551,
"step": 3000
},
{
"entropy": 0.23495229706168175,
"epoch": 2.7241457343290336,
"grad_norm": 1.484375,
"learning_rate": 4.4372348842814716e-07,
"loss": 0.2359,
"mean_token_accuracy": 0.9247433744370938,
"num_tokens": 22293562.0,
"step": 3010
},
{
"entropy": 0.22726391404867172,
"epoch": 2.7331975560081467,
"grad_norm": 1.4921875,
"learning_rate": 4.1539505657687495e-07,
"loss": 0.2329,
"mean_token_accuracy": 0.9276506796479225,
"num_tokens": 22364740.0,
"step": 3020
},
{
"entropy": 0.23861285336315632,
"epoch": 2.7422493776872594,
"grad_norm": 1.6171875,
"learning_rate": 3.8798180900632253e-07,
"loss": 0.2505,
"mean_token_accuracy": 0.9261965282261372,
"num_tokens": 22438839.0,
"step": 3030
},
{
"entropy": 0.22645489294081927,
"epoch": 2.7513011993663725,
"grad_norm": 1.4921875,
"learning_rate": 3.6148636326550743e-07,
"loss": 0.2293,
"mean_token_accuracy": 0.930086625367403,
"num_tokens": 22510078.0,
"step": 3040
},
{
"entropy": 0.24204493686556816,
"epoch": 2.7603530210454856,
"grad_norm": 1.7109375,
"learning_rate": 3.3591124926730557e-07,
"loss": 0.243,
"mean_token_accuracy": 0.9232821561396122,
"num_tokens": 22579081.0,
"step": 3050
},
{
"entropy": 0.2360864533111453,
"epoch": 2.7694048427245983,
"grad_norm": 1.5390625,
"learning_rate": 3.1125890904688206e-07,
"loss": 0.2394,
"mean_token_accuracy": 0.9256729304790496,
"num_tokens": 22652498.0,
"step": 3060
},
{
"entropy": 0.23557743560522795,
"epoch": 2.778456664403711,
"grad_norm": 1.53125,
"learning_rate": 2.8753169652851245e-07,
"loss": 0.229,
"mean_token_accuracy": 0.9265096105635167,
"num_tokens": 22730659.0,
"step": 3070
},
{
"entropy": 0.2428856560960412,
"epoch": 2.787508486082824,
"grad_norm": 1.6328125,
"learning_rate": 2.6473187730082004e-07,
"loss": 0.2425,
"mean_token_accuracy": 0.9218696370720864,
"num_tokens": 22803396.0,
"step": 3080
},
{
"entropy": 0.2498979650437832,
"epoch": 2.796560307761937,
"grad_norm": 1.703125,
"learning_rate": 2.42861628400447e-07,
"loss": 0.2496,
"mean_token_accuracy": 0.9203889586031437,
"num_tokens": 22879925.0,
"step": 3090
},
{
"entropy": 0.2429232547059655,
"epoch": 2.80561212944105,
"grad_norm": 1.453125,
"learning_rate": 2.2192303810418148e-07,
"loss": 0.2424,
"mean_token_accuracy": 0.9244124636054039,
"num_tokens": 22962210.0,
"step": 3100
},
{
"entropy": 0.2421200337819755,
"epoch": 2.814663951120163,
"grad_norm": 1.734375,
"learning_rate": 2.0191810572955052e-07,
"loss": 0.2568,
"mean_token_accuracy": 0.9226670287549495,
"num_tokens": 23038980.0,
"step": 3110
},
{
"entropy": 0.22173038199543954,
"epoch": 2.823715772799276,
"grad_norm": 1.53125,
"learning_rate": 1.8284874144393284e-07,
"loss": 0.2194,
"mean_token_accuracy": 0.930086762458086,
"num_tokens": 23105957.0,
"step": 3120
},
{
"entropy": 0.2354667537845671,
"epoch": 2.8327675944783888,
"grad_norm": 1.53125,
"learning_rate": 1.6471676608214581e-07,
"loss": 0.2325,
"mean_token_accuracy": 0.9262704968452453,
"num_tokens": 23176390.0,
"step": 3130
},
{
"entropy": 0.23669061083346604,
"epoch": 2.8418194161575014,
"grad_norm": 1.6640625,
"learning_rate": 1.4752391097260233e-07,
"loss": 0.2389,
"mean_token_accuracy": 0.9259525135159492,
"num_tokens": 23253673.0,
"step": 3140
},
{
"entropy": 0.23215005043894052,
"epoch": 2.8508712378366146,
"grad_norm": 1.5546875,
"learning_rate": 1.3127181777198073e-07,
"loss": 0.2297,
"mean_token_accuracy": 0.9277803264558315,
"num_tokens": 23326885.0,
"step": 3150
},
{
"entropy": 0.22989463973790408,
"epoch": 2.8599230595157277,
"grad_norm": 1.421875,
"learning_rate": 1.159620383084814e-07,
"loss": 0.2177,
"mean_token_accuracy": 0.9286665737628936,
"num_tokens": 23394737.0,
"step": 3160
},
{
"entropy": 0.2309438543394208,
"epoch": 2.8689748811948403,
"grad_norm": 1.359375,
"learning_rate": 1.0159603443364308e-07,
"loss": 0.2257,
"mean_token_accuracy": 0.9272095516324044,
"num_tokens": 23470142.0,
"step": 3170
},
{
"entropy": 0.24182884357869625,
"epoch": 2.8780267028739535,
"grad_norm": 2.25,
"learning_rate": 8.817517788276775e-08,
"loss": 0.26,
"mean_token_accuracy": 0.9233683370053768,
"num_tokens": 23546197.0,
"step": 3180
},
{
"entropy": 0.23957613073289394,
"epoch": 2.8870785245530666,
"grad_norm": 1.734375,
"learning_rate": 7.570075014392775e-08,
"loss": 0.2544,
"mean_token_accuracy": 0.9227040722966194,
"num_tokens": 23622223.0,
"step": 3190
},
{
"entropy": 0.22729014521464705,
"epoch": 2.8961303462321792,
"grad_norm": 1.4375,
"learning_rate": 6.417394233561692e-08,
"loss": 0.2228,
"mean_token_accuracy": 0.9275835871696472,
"num_tokens": 23692447.0,
"step": 3200
},
{
"epoch": 2.8961303462321792,
"eval_entropy": 0.24213379493355752,
"eval_loss": 0.28997713327407837,
"eval_mean_token_accuracy": 0.9096816182136536,
"eval_num_tokens": 23692447.0,
"eval_runtime": 9.4008,
"eval_samples_per_second": 53.187,
"eval_steps_per_second": 26.593,
"step": 3200
},
{
"entropy": 0.2413923056796193,
"epoch": 2.905182167911292,
"grad_norm": 1.6015625,
"learning_rate": 5.359585509300602e-08,
"loss": 0.2428,
"mean_token_accuracy": 0.9240728266537189,
"num_tokens": 23762382.0,
"step": 3210
},
{
"entropy": 0.23166503813117742,
"epoch": 2.914233989590405,
"grad_norm": 1.5234375,
"learning_rate": 4.3967498462855753e-08,
"loss": 0.2314,
"mean_token_accuracy": 0.9276820600032807,
"num_tokens": 23832138.0,
"step": 3220
},
{
"entropy": 0.24169802255928516,
"epoch": 2.923285811269518,
"grad_norm": 1.5859375,
"learning_rate": 3.528979180706715e-08,
"loss": 0.2445,
"mean_token_accuracy": 0.9228494085371495,
"num_tokens": 23907140.0,
"step": 3230
},
{
"entropy": 0.2262877920642495,
"epoch": 2.932337632948631,
"grad_norm": 1.5390625,
"learning_rate": 2.7563563714902985e-08,
"loss": 0.2268,
"mean_token_accuracy": 0.9297634929418563,
"num_tokens": 23984475.0,
"step": 3240
},
{
"entropy": 0.2313869906589389,
"epoch": 2.941389454627744,
"grad_norm": 1.28125,
"learning_rate": 2.0789551923862117e-08,
"loss": 0.2316,
"mean_token_accuracy": 0.9266681142151356,
"num_tokens": 24063400.0,
"step": 3250
},
{
"entropy": 0.210309446323663,
"epoch": 2.9504412763068566,
"grad_norm": 1.4375,
"learning_rate": 1.4968403249244757e-08,
"loss": 0.2027,
"mean_token_accuracy": 0.9348952963948249,
"num_tokens": 24134590.0,
"step": 3260
},
{
"entropy": 0.22816819995641707,
"epoch": 2.9594930979859697,
"grad_norm": 1.5078125,
"learning_rate": 1.0100673522385196e-08,
"loss": 0.2296,
"mean_token_accuracy": 0.9287994243204594,
"num_tokens": 24205896.0,
"step": 3270
},
{
"entropy": 0.24220311921089888,
"epoch": 2.9685449196650824,
"grad_norm": 1.2734375,
"learning_rate": 6.186827537582041e-09,
"loss": 0.2341,
"mean_token_accuracy": 0.9245953395962715,
"num_tokens": 24282840.0,
"step": 3280
},
{
"entropy": 0.21936016846448184,
"epoch": 2.9775967413441955,
"grad_norm": 1.8125,
"learning_rate": 3.2272390077159323e-09,
"loss": 0.2172,
"mean_token_accuracy": 0.9313432745635509,
"num_tokens": 24353416.0,
"step": 3290
},
{
"entropy": 0.22437022235244514,
"epoch": 2.9866485630233086,
"grad_norm": 1.34375,
"learning_rate": 1.2221905285658698e-09,
"loss": 0.2246,
"mean_token_accuracy": 0.9301521182060242,
"num_tokens": 24427072.0,
"step": 3300
},
{
"entropy": 0.22978594526648521,
"epoch": 2.9957003847024213,
"grad_norm": 1.671875,
"learning_rate": 1.7187355182524125e-10,
"loss": 0.2394,
"mean_token_accuracy": 0.9282013036310672,
"num_tokens": 24498467.0,
"step": 3310
}
],
"logging_steps": 10,
"max_steps": 3315,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.085422064633119e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}