{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8961303462321792, "eval_steps": 200, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5955325569957495, "epoch": 0.009051821679112922, "grad_norm": 9.375, "learning_rate": 1.8000000000000001e-06, "loss": 0.714, "mean_token_accuracy": 0.8531711064279079, "num_tokens": 73265.0, "step": 10 }, { "entropy": 0.601216921582818, "epoch": 0.018103643358225844, "grad_norm": 6.1875, "learning_rate": 3.8000000000000005e-06, "loss": 0.6599, "mean_token_accuracy": 0.8536688484251499, "num_tokens": 151945.0, "step": 20 }, { "entropy": 0.619553691893816, "epoch": 0.027155465037338764, "grad_norm": 6.84375, "learning_rate": 5.8e-06, "loss": 0.5775, "mean_token_accuracy": 0.863236166536808, "num_tokens": 226319.0, "step": 30 }, { "entropy": 0.5566520445048809, "epoch": 0.03620728671645169, "grad_norm": 3.5625, "learning_rate": 7.800000000000002e-06, "loss": 0.4768, "mean_token_accuracy": 0.8804129175841808, "num_tokens": 300605.0, "step": 40 }, { "entropy": 0.46170808989554646, "epoch": 0.04525910839556461, "grad_norm": 82.5, "learning_rate": 9.800000000000001e-06, "loss": 0.4137, "mean_token_accuracy": 0.8895155563950539, "num_tokens": 372784.0, "step": 50 }, { "entropy": 0.4425128545612097, "epoch": 0.05431093007467753, "grad_norm": 2.46875, "learning_rate": 1.18e-05, "loss": 0.4117, "mean_token_accuracy": 0.888019285351038, "num_tokens": 444839.0, "step": 60 }, { "entropy": 0.41746695823967456, "epoch": 0.06336275175379046, "grad_norm": 2.21875, "learning_rate": 1.38e-05, "loss": 0.3682, "mean_token_accuracy": 0.8940133817493916, "num_tokens": 518697.0, "step": 70 }, { "entropy": 0.4343178853392601, "epoch": 0.07241457343290338, "grad_norm": 2.046875, "learning_rate": 1.58e-05, "loss": 0.3757, "mean_token_accuracy": 0.8950759552419185, "num_tokens": 591590.0, "step": 80 }, { "entropy": 0.40939719304442407, "epoch": 0.0814663951120163, "grad_norm": 1.5625, "learning_rate": 1.7800000000000002e-05, "loss": 0.3417, "mean_token_accuracy": 0.9000873222947121, "num_tokens": 671925.0, "step": 90 }, { "entropy": 0.3810647170990705, "epoch": 0.09051821679112922, "grad_norm": 1.8984375, "learning_rate": 1.98e-05, "loss": 0.3389, "mean_token_accuracy": 0.8990982733666897, "num_tokens": 744979.0, "step": 100 }, { "entropy": 0.34255886916071177, "epoch": 0.09957003847024214, "grad_norm": 1.359375, "learning_rate": 1.9999613285893108e-05, "loss": 0.3095, "mean_token_accuracy": 0.901997297257185, "num_tokens": 824704.0, "step": 110 }, { "entropy": 0.3227020036429167, "epoch": 0.10862186014935506, "grad_norm": 1.8203125, "learning_rate": 1.9998276534787115e-05, "loss": 0.3199, "mean_token_accuracy": 0.8994096107780933, "num_tokens": 899168.0, "step": 120 }, { "entropy": 0.31191693879663945, "epoch": 0.11767368182846798, "grad_norm": 1.6796875, "learning_rate": 1.9995985100042836e-05, "loss": 0.3029, "mean_token_accuracy": 0.9058460839092731, "num_tokens": 974442.0, "step": 130 }, { "entropy": 0.3080141399055719, "epoch": 0.1267255035075809, "grad_norm": 1.9609375, "learning_rate": 1.9992739200457505e-05, "loss": 0.2907, "mean_token_accuracy": 0.9111274629831314, "num_tokens": 1054948.0, "step": 140 }, { "entropy": 0.311343339830637, "epoch": 0.13577732518669383, "grad_norm": 1.453125, "learning_rate": 1.998853914596526e-05, "loss": 0.2974, "mean_token_accuracy": 0.9076914019882679, "num_tokens": 1127040.0, "step": 150 }, { "entropy": 0.31157970502972604, "epoch": 0.14482914686580675, "grad_norm": 1.8125, "learning_rate": 1.998338533760755e-05, "loss": 0.3007, "mean_token_accuracy": 0.9092378221452236, "num_tokens": 1196429.0, "step": 160 }, { "entropy": 0.3010763289406896, "epoch": 0.15388096854491967, "grad_norm": 1.71875, "learning_rate": 1.9977278267494844e-05, "loss": 0.2976, "mean_token_accuracy": 0.9085726246237755, "num_tokens": 1275764.0, "step": 170 }, { "entropy": 0.31990424413234, "epoch": 0.1629327902240326, "grad_norm": 1.578125, "learning_rate": 1.9970218518759626e-05, "loss": 0.3079, "mean_token_accuracy": 0.9044359527528286, "num_tokens": 1353869.0, "step": 180 }, { "entropy": 0.29817488249391316, "epoch": 0.1719846119031455, "grad_norm": 1.765625, "learning_rate": 1.9962206765500744e-05, "loss": 0.2943, "mean_token_accuracy": 0.9098256938159466, "num_tokens": 1426066.0, "step": 190 }, { "entropy": 0.30092692840844393, "epoch": 0.18103643358225843, "grad_norm": 1.84375, "learning_rate": 1.995324377271901e-05, "loss": 0.3224, "mean_token_accuracy": 0.9066158257424831, "num_tokens": 1494682.0, "step": 200 }, { "epoch": 0.18103643358225843, "eval_entropy": 0.31176194101572036, "eval_loss": 0.29660511016845703, "eval_mean_token_accuracy": 0.9063085055351258, "eval_num_tokens": 1494682.0, "eval_runtime": 9.3564, "eval_samples_per_second": 53.439, "eval_steps_per_second": 26.72, "step": 200 }, { "entropy": 0.30449779201298954, "epoch": 0.19008825526137135, "grad_norm": 1.5546875, "learning_rate": 1.9943330396244186e-05, "loss": 0.2936, "mean_token_accuracy": 0.9093848384916783, "num_tokens": 1574246.0, "step": 210 }, { "entropy": 0.29794183522462847, "epoch": 0.19914007694048427, "grad_norm": 1.484375, "learning_rate": 1.993246758265324e-05, "loss": 0.3014, "mean_token_accuracy": 0.906050831079483, "num_tokens": 1649636.0, "step": 220 }, { "entropy": 0.31836145240813496, "epoch": 0.2081918986195972, "grad_norm": 1.375, "learning_rate": 1.992065636917998e-05, "loss": 0.3124, "mean_token_accuracy": 0.9049551673233509, "num_tokens": 1723726.0, "step": 230 }, { "entropy": 0.3080146433785558, "epoch": 0.2172437202987101, "grad_norm": 1.8984375, "learning_rate": 1.9907897883615997e-05, "loss": 0.3253, "mean_token_accuracy": 0.9040355876088142, "num_tokens": 1800283.0, "step": 240 }, { "entropy": 0.28388842958956956, "epoch": 0.22629554197782303, "grad_norm": 1.6015625, "learning_rate": 1.9894193344202993e-05, "loss": 0.2784, "mean_token_accuracy": 0.9126260556280613, "num_tokens": 1870003.0, "step": 250 }, { "entropy": 0.28803354129195213, "epoch": 0.23534736365693595, "grad_norm": 1.4140625, "learning_rate": 1.987954405951645e-05, "loss": 0.2902, "mean_token_accuracy": 0.9081695273518562, "num_tokens": 1942710.0, "step": 260 }, { "entropy": 0.2951167915016413, "epoch": 0.24439918533604887, "grad_norm": 1.5625, "learning_rate": 1.986395142834069e-05, "loss": 0.3018, "mean_token_accuracy": 0.9062638804316521, "num_tokens": 2016368.0, "step": 270 }, { "entropy": 0.3020996805280447, "epoch": 0.2534510070151618, "grad_norm": 1.9765625, "learning_rate": 1.984741693953529e-05, "loss": 0.308, "mean_token_accuracy": 0.9063209906220436, "num_tokens": 2095809.0, "step": 280 }, { "entropy": 0.3082812769338489, "epoch": 0.2625028286942747, "grad_norm": 1.8046875, "learning_rate": 1.9829942171892953e-05, "loss": 0.3137, "mean_token_accuracy": 0.9035673819482326, "num_tokens": 2166206.0, "step": 290 }, { "entropy": 0.3025300450623035, "epoch": 0.27155465037338766, "grad_norm": 1.859375, "learning_rate": 1.981152879398872e-05, "loss": 0.2972, "mean_token_accuracy": 0.9058074496686459, "num_tokens": 2235018.0, "step": 300 }, { "entropy": 0.2980365388095379, "epoch": 0.28060647205250056, "grad_norm": 1.640625, "learning_rate": 1.9792178564020676e-05, "loss": 0.2879, "mean_token_accuracy": 0.9077603787183761, "num_tokens": 2304709.0, "step": 310 }, { "entropy": 0.2872885692864656, "epoch": 0.2896582937316135, "grad_norm": 1.8984375, "learning_rate": 1.9771893329642042e-05, "loss": 0.2889, "mean_token_accuracy": 0.910741999745369, "num_tokens": 2380839.0, "step": 320 }, { "entropy": 0.2809945376589894, "epoch": 0.2987101154107264, "grad_norm": 1.78125, "learning_rate": 1.975067502778479e-05, "loss": 0.2852, "mean_token_accuracy": 0.9113192833960057, "num_tokens": 2447193.0, "step": 330 }, { "entropy": 0.30356528908014296, "epoch": 0.30776193708983934, "grad_norm": 1.7890625, "learning_rate": 1.9728525684474654e-05, "loss": 0.3216, "mean_token_accuracy": 0.9036834843456745, "num_tokens": 2523000.0, "step": 340 }, { "entropy": 0.30291730873286726, "epoch": 0.31681375876895224, "grad_norm": 2.53125, "learning_rate": 1.97054474146377e-05, "loss": 0.306, "mean_token_accuracy": 0.9072510033845902, "num_tokens": 2597529.0, "step": 350 }, { "entropy": 0.30994318779557944, "epoch": 0.3258655804480652, "grad_norm": 1.5, "learning_rate": 1.968144242189838e-05, "loss": 0.3241, "mean_token_accuracy": 0.9031497567892075, "num_tokens": 2676886.0, "step": 360 }, { "entropy": 0.2981043761596084, "epoch": 0.3349174021271781, "grad_norm": 1.390625, "learning_rate": 1.9656512998369105e-05, "loss": 0.2952, "mean_token_accuracy": 0.9080876216292382, "num_tokens": 2749137.0, "step": 370 }, { "entropy": 0.2890668235719204, "epoch": 0.343969223806291, "grad_norm": 1.78125, "learning_rate": 1.9630661524431408e-05, "loss": 0.2883, "mean_token_accuracy": 0.9096374675631523, "num_tokens": 2818103.0, "step": 380 }, { "entropy": 0.28160234112292526, "epoch": 0.3530210454854039, "grad_norm": 1.96875, "learning_rate": 1.960389046850863e-05, "loss": 0.2732, "mean_token_accuracy": 0.9154746599495411, "num_tokens": 2887552.0, "step": 390 }, { "entropy": 0.2931002199649811, "epoch": 0.36207286716451687, "grad_norm": 1.84375, "learning_rate": 1.9576202386830233e-05, "loss": 0.3161, "mean_token_accuracy": 0.9047551721334457, "num_tokens": 2966949.0, "step": 400 }, { "epoch": 0.36207286716451687, "eval_entropy": 0.2931007430553436, "eval_loss": 0.29184412956237793, "eval_mean_token_accuracy": 0.9072571034431458, "eval_num_tokens": 2966949.0, "eval_runtime": 9.3729, "eval_samples_per_second": 53.345, "eval_steps_per_second": 26.673, "step": 400 }, { "entropy": 0.307697238586843, "epoch": 0.37112468884362976, "grad_norm": 1.65625, "learning_rate": 1.9547599923187724e-05, "loss": 0.3194, "mean_token_accuracy": 0.9044582702219486, "num_tokens": 3039683.0, "step": 410 }, { "entropy": 0.2976506020873785, "epoch": 0.3801765105227427, "grad_norm": 1.984375, "learning_rate": 1.9518085808682207e-05, "loss": 0.3012, "mean_token_accuracy": 0.9044804252684117, "num_tokens": 3114377.0, "step": 420 }, { "entropy": 0.29503822289407255, "epoch": 0.3892283322018556, "grad_norm": 1.5390625, "learning_rate": 1.9487662861463593e-05, "loss": 0.2962, "mean_token_accuracy": 0.9089334838092327, "num_tokens": 3184012.0, "step": 430 }, { "entropy": 0.29528967328369615, "epoch": 0.39828015388096855, "grad_norm": 1.6328125, "learning_rate": 1.9456333986461535e-05, "loss": 0.2907, "mean_token_accuracy": 0.9082771897315979, "num_tokens": 3257132.0, "step": 440 }, { "entropy": 0.2871313957497478, "epoch": 0.4073319755600815, "grad_norm": 1.796875, "learning_rate": 1.9424102175108034e-05, "loss": 0.2906, "mean_token_accuracy": 0.9087129518389702, "num_tokens": 3333004.0, "step": 450 }, { "entropy": 0.2987273294478655, "epoch": 0.4163837972391944, "grad_norm": 1.515625, "learning_rate": 1.9390970505051803e-05, "loss": 0.3005, "mean_token_accuracy": 0.9063193209469318, "num_tokens": 3405646.0, "step": 460 }, { "entropy": 0.28730762992054226, "epoch": 0.42543561891830733, "grad_norm": 1.59375, "learning_rate": 1.935694213986441e-05, "loss": 0.2917, "mean_token_accuracy": 0.9086456030607224, "num_tokens": 3474841.0, "step": 470 }, { "entropy": 0.29593063089996574, "epoch": 0.4344874405974202, "grad_norm": 1.5625, "learning_rate": 1.9322020328738183e-05, "loss": 0.2909, "mean_token_accuracy": 0.9100452527403832, "num_tokens": 3550524.0, "step": 480 }, { "entropy": 0.29435052666813133, "epoch": 0.4435392622765332, "grad_norm": 1.609375, "learning_rate": 1.928620840617598e-05, "loss": 0.2988, "mean_token_accuracy": 0.9083921477198601, "num_tokens": 3622971.0, "step": 490 }, { "entropy": 0.2887372709810734, "epoch": 0.45259108395564607, "grad_norm": 1.9921875, "learning_rate": 1.9249509791672802e-05, "loss": 0.2819, "mean_token_accuracy": 0.9114750146865844, "num_tokens": 3692024.0, "step": 500 }, { "entropy": 0.2885109892114997, "epoch": 0.461642905634759, "grad_norm": 1.5703125, "learning_rate": 1.921192798938925e-05, "loss": 0.2758, "mean_token_accuracy": 0.9127115234732628, "num_tokens": 3762911.0, "step": 510 }, { "entropy": 0.29927421547472477, "epoch": 0.4706947273138719, "grad_norm": 1.8984375, "learning_rate": 1.917346658781697e-05, "loss": 0.3073, "mean_token_accuracy": 0.9056219220161438, "num_tokens": 3832948.0, "step": 520 }, { "entropy": 0.28745833188295367, "epoch": 0.47974654899298486, "grad_norm": 1.5078125, "learning_rate": 1.9134129259435973e-05, "loss": 0.3035, "mean_token_accuracy": 0.9091703072190285, "num_tokens": 3909890.0, "step": 530 }, { "entropy": 0.2832237346097827, "epoch": 0.48879837067209775, "grad_norm": 1.4375, "learning_rate": 1.9093919760363996e-05, "loss": 0.2625, "mean_token_accuracy": 0.9148759163916111, "num_tokens": 3984217.0, "step": 540 }, { "entropy": 0.29212499912828205, "epoch": 0.4978501923512107, "grad_norm": 1.671875, "learning_rate": 1.905284192999783e-05, "loss": 0.2967, "mean_token_accuracy": 0.906308326870203, "num_tokens": 4063305.0, "step": 550 }, { "entropy": 0.28526828065514565, "epoch": 0.5069020140303236, "grad_norm": 1.453125, "learning_rate": 1.9010899690646723e-05, "loss": 0.2929, "mean_token_accuracy": 0.9091045394539833, "num_tokens": 4135755.0, "step": 560 }, { "entropy": 0.29713739454746246, "epoch": 0.5159538357094365, "grad_norm": 1.609375, "learning_rate": 1.896809704715787e-05, "loss": 0.3054, "mean_token_accuracy": 0.9077155306935311, "num_tokens": 4215297.0, "step": 570 }, { "entropy": 0.29641823247075083, "epoch": 0.5250056573885494, "grad_norm": 1.3984375, "learning_rate": 1.8924438086533986e-05, "loss": 0.3045, "mean_token_accuracy": 0.9068859592080116, "num_tokens": 4287199.0, "step": 580 }, { "entropy": 0.28421772718429567, "epoch": 0.5340574790676623, "grad_norm": 1.8515625, "learning_rate": 1.8879926977543086e-05, "loss": 0.2828, "mean_token_accuracy": 0.9111842639744282, "num_tokens": 4358560.0, "step": 590 }, { "entropy": 0.2928624337539077, "epoch": 0.5431093007467753, "grad_norm": 1.625, "learning_rate": 1.8834567970320413e-05, "loss": 0.2985, "mean_token_accuracy": 0.9082511819899082, "num_tokens": 4431086.0, "step": 600 }, { "epoch": 0.5431093007467753, "eval_entropy": 0.29717582327127456, "eval_loss": 0.2887883186340332, "eval_mean_token_accuracy": 0.9083560631275177, "eval_num_tokens": 4431086.0, "eval_runtime": 9.3798, "eval_samples_per_second": 53.306, "eval_steps_per_second": 26.653, "step": 600 }, { "entropy": 0.2891200602054596, "epoch": 0.5521611224258882, "grad_norm": 1.203125, "learning_rate": 1.8788365395962613e-05, "loss": 0.2879, "mean_token_accuracy": 0.9087440736591816, "num_tokens": 4506892.0, "step": 610 }, { "entropy": 0.3021440252661705, "epoch": 0.5612129441050011, "grad_norm": 1.59375, "learning_rate": 1.8741323666114207e-05, "loss": 0.3107, "mean_token_accuracy": 0.9060862340033055, "num_tokens": 4581533.0, "step": 620 }, { "entropy": 0.2959118351340294, "epoch": 0.570264765784114, "grad_norm": 2.078125, "learning_rate": 1.8693447272546313e-05, "loss": 0.2977, "mean_token_accuracy": 0.9070049889385701, "num_tokens": 4653692.0, "step": 630 }, { "entropy": 0.27470533456653357, "epoch": 0.579316587463227, "grad_norm": 1.5234375, "learning_rate": 1.8644740786727763e-05, "loss": 0.2725, "mean_token_accuracy": 0.9146642610430717, "num_tokens": 4721773.0, "step": 640 }, { "entropy": 0.3039343884214759, "epoch": 0.5883684091423399, "grad_norm": 1.8125, "learning_rate": 1.859520885938861e-05, "loss": 0.3184, "mean_token_accuracy": 0.9047698460519313, "num_tokens": 4802409.0, "step": 650 }, { "entropy": 0.3038432693108916, "epoch": 0.5974202308214528, "grad_norm": 1.609375, "learning_rate": 1.854485622007603e-05, "loss": 0.302, "mean_token_accuracy": 0.9063945829868316, "num_tokens": 4876537.0, "step": 660 }, { "entropy": 0.27984238266944883, "epoch": 0.6064720525005657, "grad_norm": 1.6953125, "learning_rate": 1.8493687676702743e-05, "loss": 0.2865, "mean_token_accuracy": 0.9099802240729332, "num_tokens": 4947515.0, "step": 670 }, { "entropy": 0.28516580928117036, "epoch": 0.6155238741796787, "grad_norm": 1.8125, "learning_rate": 1.8441708115087917e-05, "loss": 0.2864, "mean_token_accuracy": 0.9089432120323181, "num_tokens": 5025785.0, "step": 680 }, { "entropy": 0.2929608277976513, "epoch": 0.6245756958587916, "grad_norm": 1.4921875, "learning_rate": 1.8388922498490653e-05, "loss": 0.3068, "mean_token_accuracy": 0.9048882365226746, "num_tokens": 5102385.0, "step": 690 }, { "entropy": 0.30293994322419165, "epoch": 0.6336275175379045, "grad_norm": 1.421875, "learning_rate": 1.8335335867136064e-05, "loss": 0.3103, "mean_token_accuracy": 0.9059915870428086, "num_tokens": 5181243.0, "step": 700 }, { "entropy": 0.311605279520154, "epoch": 0.6426793392170175, "grad_norm": 1.5078125, "learning_rate": 1.8280953337734016e-05, "loss": 0.3232, "mean_token_accuracy": 0.9042202472686768, "num_tokens": 5256020.0, "step": 710 }, { "entropy": 0.28142720870673654, "epoch": 0.6517311608961304, "grad_norm": 2.1875, "learning_rate": 1.8225780102990563e-05, "loss": 0.268, "mean_token_accuracy": 0.9141685187816619, "num_tokens": 5329019.0, "step": 720 }, { "entropy": 0.29012490045279266, "epoch": 0.6607829825752433, "grad_norm": 1.578125, "learning_rate": 1.8169821431112104e-05, "loss": 0.2935, "mean_token_accuracy": 0.9081948816776275, "num_tokens": 5407811.0, "step": 730 }, { "entropy": 0.2991213478147984, "epoch": 0.6698348042543562, "grad_norm": 1.6875, "learning_rate": 1.8113082665302366e-05, "loss": 0.2983, "mean_token_accuracy": 0.9063392855226994, "num_tokens": 5477168.0, "step": 740 }, { "entropy": 0.29746117200702427, "epoch": 0.6788866259334692, "grad_norm": 1.7578125, "learning_rate": 1.8055569223252215e-05, "loss": 0.2978, "mean_token_accuracy": 0.9071070611476898, "num_tokens": 5555941.0, "step": 750 }, { "entropy": 0.26622334159910677, "epoch": 0.687938447612582, "grad_norm": 1.625, "learning_rate": 1.799728659662232e-05, "loss": 0.2602, "mean_token_accuracy": 0.9196423992514611, "num_tokens": 5626518.0, "step": 760 }, { "entropy": 0.2931471846997738, "epoch": 0.6969902692916949, "grad_norm": 1.609375, "learning_rate": 1.793824035051882e-05, "loss": 0.3079, "mean_token_accuracy": 0.9071181505918503, "num_tokens": 5699632.0, "step": 770 }, { "entropy": 0.2865557339042425, "epoch": 0.7060420909708078, "grad_norm": 1.5078125, "learning_rate": 1.787843612296191e-05, "loss": 0.2927, "mean_token_accuracy": 0.9099504724144936, "num_tokens": 5766441.0, "step": 780 }, { "entropy": 0.2953029813244939, "epoch": 0.7150939126499208, "grad_norm": 1.6328125, "learning_rate": 1.781787962434751e-05, "loss": 0.2973, "mean_token_accuracy": 0.906347993761301, "num_tokens": 5840121.0, "step": 790 }, { "entropy": 0.2896912330761552, "epoch": 0.7241457343290337, "grad_norm": 1.453125, "learning_rate": 1.7756576636902013e-05, "loss": 0.287, "mean_token_accuracy": 0.9087827295064926, "num_tokens": 5913798.0, "step": 800 }, { "epoch": 0.7241457343290337, "eval_entropy": 0.2926266292333603, "eval_loss": 0.28681185841560364, "eval_mean_token_accuracy": 0.9089032611846923, "eval_num_tokens": 5913798.0, "eval_runtime": 9.3648, "eval_samples_per_second": 53.391, "eval_steps_per_second": 26.696, "step": 800 }, { "entropy": 0.3035820659250021, "epoch": 0.7331975560081466, "grad_norm": 1.28125, "learning_rate": 1.769453301413016e-05, "loss": 0.3031, "mean_token_accuracy": 0.9059673763811589, "num_tokens": 5995879.0, "step": 810 }, { "entropy": 0.3072130227461457, "epoch": 0.7422493776872595, "grad_norm": 1.7890625, "learning_rate": 1.7631754680256118e-05, "loss": 0.3046, "mean_token_accuracy": 0.9036270663142204, "num_tokens": 6068519.0, "step": 820 }, { "entropy": 0.2819986244663596, "epoch": 0.7513011993663725, "grad_norm": 1.5546875, "learning_rate": 1.7568247629657816e-05, "loss": 0.2865, "mean_token_accuracy": 0.9112287394702434, "num_tokens": 6139028.0, "step": 830 }, { "entropy": 0.29264403488487006, "epoch": 0.7603530210454854, "grad_norm": 1.828125, "learning_rate": 1.750401792629457e-05, "loss": 0.3128, "mean_token_accuracy": 0.9057952709496021, "num_tokens": 6211903.0, "step": 840 }, { "entropy": 0.2960445500910282, "epoch": 0.7694048427245983, "grad_norm": 1.34375, "learning_rate": 1.7439071703128068e-05, "loss": 0.3048, "mean_token_accuracy": 0.9075722090899945, "num_tokens": 6289443.0, "step": 850 }, { "entropy": 0.2825075998902321, "epoch": 0.7784566644037112, "grad_norm": 1.5078125, "learning_rate": 1.7373415161536752e-05, "loss": 0.2922, "mean_token_accuracy": 0.907506238669157, "num_tokens": 6370438.0, "step": 860 }, { "entropy": 0.3036248629912734, "epoch": 0.7875084860828242, "grad_norm": 2.34375, "learning_rate": 1.73070545707237e-05, "loss": 0.32, "mean_token_accuracy": 0.9044312626123429, "num_tokens": 6442859.0, "step": 870 }, { "entropy": 0.30435681212693455, "epoch": 0.7965603077619371, "grad_norm": 1.4921875, "learning_rate": 1.7239996267118003e-05, "loss": 0.2982, "mean_token_accuracy": 0.9066942445933819, "num_tokens": 6520055.0, "step": 880 }, { "entropy": 0.3001150920987129, "epoch": 0.80561212944105, "grad_norm": 1.8046875, "learning_rate": 1.717224665376973e-05, "loss": 0.3052, "mean_token_accuracy": 0.904937519133091, "num_tokens": 6596737.0, "step": 890 }, { "entropy": 0.303386352583766, "epoch": 0.814663951120163, "grad_norm": 1.703125, "learning_rate": 1.7103812199738538e-05, "loss": 0.3095, "mean_token_accuracy": 0.9060303725302219, "num_tokens": 6677638.0, "step": 900 }, { "entropy": 0.28049613647162913, "epoch": 0.8237157727992759, "grad_norm": 1.59375, "learning_rate": 1.703469943947597e-05, "loss": 0.2875, "mean_token_accuracy": 0.911004551500082, "num_tokens": 6749581.0, "step": 910 }, { "entropy": 0.31584621611982583, "epoch": 0.8327675944783888, "grad_norm": 1.6484375, "learning_rate": 1.6964914972201522e-05, "loss": 0.3265, "mean_token_accuracy": 0.9020559079945087, "num_tokens": 6826238.0, "step": 920 }, { "entropy": 0.27345783039927485, "epoch": 0.8418194161575017, "grad_norm": 1.7734375, "learning_rate": 1.6894465461272513e-05, "loss": 0.2763, "mean_token_accuracy": 0.913234294205904, "num_tokens": 6893946.0, "step": 930 }, { "entropy": 0.2751306457445025, "epoch": 0.8508712378366147, "grad_norm": 1.3203125, "learning_rate": 1.6823357633547832e-05, "loss": 0.2754, "mean_token_accuracy": 0.9122063621878624, "num_tokens": 6970697.0, "step": 940 }, { "entropy": 0.2728879269212484, "epoch": 0.8599230595157276, "grad_norm": 1.6640625, "learning_rate": 1.6751598278745636e-05, "loss": 0.277, "mean_token_accuracy": 0.9147410795092583, "num_tokens": 7037829.0, "step": 950 }, { "entropy": 0.2938439719378948, "epoch": 0.8689748811948405, "grad_norm": 1.703125, "learning_rate": 1.6679194248795018e-05, "loss": 0.3092, "mean_token_accuracy": 0.9080867692828178, "num_tokens": 7113672.0, "step": 960 }, { "entropy": 0.2854067673906684, "epoch": 0.8780267028739533, "grad_norm": 1.65625, "learning_rate": 1.660615245718177e-05, "loss": 0.277, "mean_token_accuracy": 0.9124735839664936, "num_tokens": 7183355.0, "step": 970 }, { "entropy": 0.2932927643880248, "epoch": 0.8870785245530663, "grad_norm": 1.75, "learning_rate": 1.6532479878288237e-05, "loss": 0.3118, "mean_token_accuracy": 0.9067716941237449, "num_tokens": 7258420.0, "step": 980 }, { "entropy": 0.27898168824613095, "epoch": 0.8961303462321792, "grad_norm": 1.3125, "learning_rate": 1.645818354672738e-05, "loss": 0.2646, "mean_token_accuracy": 0.9143994279205799, "num_tokens": 7327278.0, "step": 990 }, { "entropy": 0.2950664022937417, "epoch": 0.9051821679112921, "grad_norm": 1.828125, "learning_rate": 1.6383270556671067e-05, "loss": 0.2992, "mean_token_accuracy": 0.9097571104764939, "num_tokens": 7398297.0, "step": 1000 }, { "epoch": 0.9051821679112921, "eval_entropy": 0.2895762438774109, "eval_loss": 0.2849172353744507, "eval_mean_token_accuracy": 0.9092698242664338, "eval_num_tokens": 7398297.0, "eval_runtime": 9.4134, "eval_samples_per_second": 53.116, "eval_steps_per_second": 26.558, "step": 1000 }, { "entropy": 0.2860976686701179, "epoch": 0.914233989590405, "grad_norm": 1.578125, "learning_rate": 1.6307748061172687e-05, "loss": 0.2902, "mean_token_accuracy": 0.9105670429766178, "num_tokens": 7469920.0, "step": 1010 }, { "entropy": 0.3092665681615472, "epoch": 0.923285811269518, "grad_norm": 1.4921875, "learning_rate": 1.6231623271484158e-05, "loss": 0.3148, "mean_token_accuracy": 0.904122719168663, "num_tokens": 7541832.0, "step": 1020 }, { "entropy": 0.3033360539004207, "epoch": 0.9323376329486309, "grad_norm": 1.3828125, "learning_rate": 1.615490345636734e-05, "loss": 0.3108, "mean_token_accuracy": 0.904674070328474, "num_tokens": 7618017.0, "step": 1030 }, { "entropy": 0.2807492554187775, "epoch": 0.9413894546277438, "grad_norm": 1.5234375, "learning_rate": 1.6077595941399997e-05, "loss": 0.2828, "mean_token_accuracy": 0.9098519176244736, "num_tokens": 7693714.0, "step": 1040 }, { "entropy": 0.2958640310913324, "epoch": 0.9504412763068567, "grad_norm": 4.0, "learning_rate": 1.5999708108276297e-05, "loss": 0.3073, "mean_token_accuracy": 0.9051192864775658, "num_tokens": 7770364.0, "step": 1050 }, { "entropy": 0.305174994841218, "epoch": 0.9594930979859697, "grad_norm": 1.71875, "learning_rate": 1.5921247394102e-05, "loss": 0.3091, "mean_token_accuracy": 0.9064687371253968, "num_tokens": 7847319.0, "step": 1060 }, { "entropy": 0.30125368386507034, "epoch": 0.9685449196650826, "grad_norm": 1.6171875, "learning_rate": 1.584222129068429e-05, "loss": 0.2968, "mean_token_accuracy": 0.9080850504338741, "num_tokens": 7919917.0, "step": 1070 }, { "entropy": 0.2805282440036535, "epoch": 0.9775967413441955, "grad_norm": 1.6875, "learning_rate": 1.5762637343816455e-05, "loss": 0.281, "mean_token_accuracy": 0.9120527848601341, "num_tokens": 7994892.0, "step": 1080 }, { "entropy": 0.26937505435198544, "epoch": 0.9866485630233084, "grad_norm": 1.4296875, "learning_rate": 1.5682503152557362e-05, "loss": 0.2869, "mean_token_accuracy": 0.9115587025880814, "num_tokens": 8067021.0, "step": 1090 }, { "entropy": 0.29133475106209517, "epoch": 0.9957003847024214, "grad_norm": 1.8671875, "learning_rate": 1.5601826368505863e-05, "loss": 0.2947, "mean_token_accuracy": 0.9103871814906597, "num_tokens": 8144631.0, "step": 1100 }, { "entropy": 0.29290189173741216, "epoch": 1.0045259108395566, "grad_norm": 1.484375, "learning_rate": 1.5520614695070185e-05, "loss": 0.29, "mean_token_accuracy": 0.9142394998134711, "num_tokens": 8216020.0, "step": 1110 }, { "entropy": 0.24365324322134257, "epoch": 1.0135777325186694, "grad_norm": 1.7890625, "learning_rate": 1.5438875886732376e-05, "loss": 0.2388, "mean_token_accuracy": 0.9264398336410522, "num_tokens": 8288245.0, "step": 1120 }, { "entropy": 0.2383655753917992, "epoch": 1.0226295541977823, "grad_norm": 1.90625, "learning_rate": 1.5356617748307857e-05, "loss": 0.2563, "mean_token_accuracy": 0.9211653597652912, "num_tokens": 8358954.0, "step": 1130 }, { "entropy": 0.24438943453133105, "epoch": 1.0316813758768952, "grad_norm": 1.203125, "learning_rate": 1.52738481342002e-05, "loss": 0.247, "mean_token_accuracy": 0.922533193230629, "num_tokens": 8430861.0, "step": 1140 }, { "entropy": 0.23646295368671416, "epoch": 1.0407331975560081, "grad_norm": 1.8671875, "learning_rate": 1.519057494765113e-05, "loss": 0.2363, "mean_token_accuracy": 0.9258262030780315, "num_tokens": 8502349.0, "step": 1150 }, { "entropy": 0.23925476390868425, "epoch": 1.049785019235121, "grad_norm": 1.90625, "learning_rate": 1.5106806139985902e-05, "loss": 0.2321, "mean_token_accuracy": 0.9250497639179229, "num_tokens": 8570295.0, "step": 1160 }, { "entropy": 0.23410421870648862, "epoch": 1.058836840914234, "grad_norm": 1.859375, "learning_rate": 1.5022549709854064e-05, "loss": 0.2314, "mean_token_accuracy": 0.9263376846909523, "num_tokens": 8642950.0, "step": 1170 }, { "entropy": 0.2557803673669696, "epoch": 1.0678886625933468, "grad_norm": 1.453125, "learning_rate": 1.4937813702465706e-05, "loss": 0.2403, "mean_token_accuracy": 0.9217644087970257, "num_tokens": 8717090.0, "step": 1180 }, { "entropy": 0.2509741667658091, "epoch": 1.07694048427246, "grad_norm": 1.8515625, "learning_rate": 1.4852606208823268e-05, "loss": 0.2604, "mean_token_accuracy": 0.918582696467638, "num_tokens": 8789212.0, "step": 1190 }, { "entropy": 0.24822968104854226, "epoch": 1.0859923059515728, "grad_norm": 1.6796875, "learning_rate": 1.4766935364948968e-05, "loss": 0.2467, "mean_token_accuracy": 0.9217202328145504, "num_tokens": 8864678.0, "step": 1200 }, { "epoch": 1.0859923059515728, "eval_entropy": 0.25971108758449557, "eval_loss": 0.2871861159801483, "eval_mean_token_accuracy": 0.9092267100811005, "eval_num_tokens": 8864678.0, "eval_runtime": 9.3693, "eval_samples_per_second": 53.366, "eval_steps_per_second": 26.683, "step": 1200 }, { "entropy": 0.2364829015918076, "epoch": 1.0950441276306857, "grad_norm": 1.3515625, "learning_rate": 1.4680809351107938e-05, "loss": 0.2305, "mean_token_accuracy": 0.9277058839797974, "num_tokens": 8940159.0, "step": 1210 }, { "entropy": 0.2585814634338021, "epoch": 1.1040959493097986, "grad_norm": 1.4609375, "learning_rate": 1.4594236391027136e-05, "loss": 0.2741, "mean_token_accuracy": 0.9160600006580353, "num_tokens": 9015281.0, "step": 1220 }, { "entropy": 0.254960492067039, "epoch": 1.1131477709889115, "grad_norm": 2.03125, "learning_rate": 1.4507224751110098e-05, "loss": 0.2538, "mean_token_accuracy": 0.9213059425354004, "num_tokens": 9087327.0, "step": 1230 }, { "entropy": 0.24510137867182494, "epoch": 1.1221995926680244, "grad_norm": 2.0, "learning_rate": 1.4419782739647622e-05, "loss": 0.254, "mean_token_accuracy": 0.9214532896876335, "num_tokens": 9169314.0, "step": 1240 }, { "entropy": 0.24978135284036398, "epoch": 1.1312514143471373, "grad_norm": 1.5625, "learning_rate": 1.4331918706024466e-05, "loss": 0.2427, "mean_token_accuracy": 0.9217463575303555, "num_tokens": 9242147.0, "step": 1250 }, { "entropy": 0.24323475174605846, "epoch": 1.1403032360262504, "grad_norm": 1.46875, "learning_rate": 1.4243641039922085e-05, "loss": 0.2458, "mean_token_accuracy": 0.9232472665607929, "num_tokens": 9315385.0, "step": 1260 }, { "entropy": 0.2486905450001359, "epoch": 1.1493550577053633, "grad_norm": 1.296875, "learning_rate": 1.4154958170517567e-05, "loss": 0.2605, "mean_token_accuracy": 0.9212081745266915, "num_tokens": 9390697.0, "step": 1270 }, { "entropy": 0.2332840071991086, "epoch": 1.1584068793844762, "grad_norm": 1.609375, "learning_rate": 1.4065878565678763e-05, "loss": 0.2445, "mean_token_accuracy": 0.9262549884617328, "num_tokens": 9460002.0, "step": 1280 }, { "entropy": 0.23821364771574735, "epoch": 1.167458701063589, "grad_norm": 1.6875, "learning_rate": 1.3976410731155731e-05, "loss": 0.2413, "mean_token_accuracy": 0.9262685626745224, "num_tokens": 9527692.0, "step": 1290 }, { "entropy": 0.2529443813487887, "epoch": 1.176510522742702, "grad_norm": 1.6484375, "learning_rate": 1.3886563209768574e-05, "loss": 0.2557, "mean_token_accuracy": 0.9197916373610496, "num_tokens": 9603314.0, "step": 1300 }, { "entropy": 0.2601372007280588, "epoch": 1.1855623444218149, "grad_norm": 2.109375, "learning_rate": 1.379634458059173e-05, "loss": 0.2739, "mean_token_accuracy": 0.917241058498621, "num_tokens": 9677672.0, "step": 1310 }, { "entropy": 0.254310567304492, "epoch": 1.1946141661009277, "grad_norm": 2.25, "learning_rate": 1.3705763458134789e-05, "loss": 0.2536, "mean_token_accuracy": 0.9209688879549504, "num_tokens": 9754592.0, "step": 1320 }, { "entropy": 0.24703464321792126, "epoch": 1.2036659877800409, "grad_norm": 1.6171875, "learning_rate": 1.3614828491519953e-05, "loss": 0.2403, "mean_token_accuracy": 0.9250944316387176, "num_tokens": 9831409.0, "step": 1330 }, { "entropy": 0.24839757550507785, "epoch": 1.2127178094591538, "grad_norm": 1.3046875, "learning_rate": 1.3523548363656174e-05, "loss": 0.2503, "mean_token_accuracy": 0.9210891291499138, "num_tokens": 9907152.0, "step": 1340 }, { "entropy": 0.24397108815610408, "epoch": 1.2217696311382666, "grad_norm": 1.5390625, "learning_rate": 1.343193179041005e-05, "loss": 0.2461, "mean_token_accuracy": 0.9231149226427078, "num_tokens": 9980501.0, "step": 1350 }, { "entropy": 0.24823267050087452, "epoch": 1.2308214528173795, "grad_norm": 1.59375, "learning_rate": 1.3339987519773623e-05, "loss": 0.2627, "mean_token_accuracy": 0.920201038569212, "num_tokens": 10050393.0, "step": 1360 }, { "entropy": 0.2534356275573373, "epoch": 1.2398732744964924, "grad_norm": 1.5625, "learning_rate": 1.3247724331029045e-05, "loss": 0.2457, "mean_token_accuracy": 0.9231408350169659, "num_tokens": 10119599.0, "step": 1370 }, { "entropy": 0.25433759707957504, "epoch": 1.2489250961756053, "grad_norm": 1.546875, "learning_rate": 1.3155151033910319e-05, "loss": 0.2537, "mean_token_accuracy": 0.9176562003791332, "num_tokens": 10191402.0, "step": 1380 }, { "entropy": 0.2557266032323241, "epoch": 1.2579769178547182, "grad_norm": 1.5390625, "learning_rate": 1.3062276467762085e-05, "loss": 0.2606, "mean_token_accuracy": 0.9184790156781674, "num_tokens": 10267960.0, "step": 1390 }, { "entropy": 0.26457452643662693, "epoch": 1.267028739533831, "grad_norm": 1.453125, "learning_rate": 1.29691095006956e-05, "loss": 0.262, "mean_token_accuracy": 0.9168847225606441, "num_tokens": 10349616.0, "step": 1400 }, { "epoch": 1.267028739533831, "eval_entropy": 0.2544795420765877, "eval_loss": 0.28732097148895264, "eval_mean_token_accuracy": 0.9092240772247314, "eval_num_tokens": 10349616.0, "eval_runtime": 9.3693, "eval_samples_per_second": 53.366, "eval_steps_per_second": 26.683, "step": 1400 }, { "entropy": 0.25217979392036793, "epoch": 1.276080561212944, "grad_norm": 1.5625, "learning_rate": 1.2875659028741973e-05, "loss": 0.2594, "mean_token_accuracy": 0.9182466574013233, "num_tokens": 10424645.0, "step": 1410 }, { "entropy": 0.2577937442809343, "epoch": 1.2851323828920571, "grad_norm": 2.046875, "learning_rate": 1.2781933975002731e-05, "loss": 0.2518, "mean_token_accuracy": 0.9220298327505588, "num_tokens": 10497343.0, "step": 1420 }, { "entropy": 0.26858933065086604, "epoch": 1.29418420457117, "grad_norm": 2.296875, "learning_rate": 1.2687943288797784e-05, "loss": 0.2842, "mean_token_accuracy": 0.9142478197813034, "num_tokens": 10566325.0, "step": 1430 }, { "entropy": 0.24459188301116228, "epoch": 1.303236026250283, "grad_norm": 1.765625, "learning_rate": 1.2593695944810913e-05, "loss": 0.2456, "mean_token_accuracy": 0.9239174589514733, "num_tokens": 10637542.0, "step": 1440 }, { "entropy": 0.24134990572929382, "epoch": 1.3122878479293958, "grad_norm": 1.5234375, "learning_rate": 1.2499200942232827e-05, "loss": 0.2344, "mean_token_accuracy": 0.9229865886271, "num_tokens": 10714625.0, "step": 1450 }, { "entropy": 0.2523044439032674, "epoch": 1.3213396696085087, "grad_norm": 1.828125, "learning_rate": 1.2404467303901867e-05, "loss": 0.2564, "mean_token_accuracy": 0.9190959706902504, "num_tokens": 10786171.0, "step": 1460 }, { "entropy": 0.2603725749999285, "epoch": 1.3303914912876216, "grad_norm": 1.625, "learning_rate": 1.2309504075442462e-05, "loss": 0.2692, "mean_token_accuracy": 0.9169292628765107, "num_tokens": 10859432.0, "step": 1470 }, { "entropy": 0.2403477132320404, "epoch": 1.3394433129667345, "grad_norm": 1.6328125, "learning_rate": 1.2214320324401419e-05, "loss": 0.2327, "mean_token_accuracy": 0.9255956873297692, "num_tokens": 10931450.0, "step": 1480 }, { "entropy": 0.25164176877588035, "epoch": 1.3484951346458476, "grad_norm": 1.421875, "learning_rate": 1.2118925139382106e-05, "loss": 0.2553, "mean_token_accuracy": 0.9207783795893192, "num_tokens": 11005579.0, "step": 1490 }, { "entropy": 0.2500301007181406, "epoch": 1.3575469563249605, "grad_norm": 1.4921875, "learning_rate": 1.2023327629176613e-05, "loss": 0.2567, "mean_token_accuracy": 0.9211388893425465, "num_tokens": 11081729.0, "step": 1500 }, { "entropy": 0.2503871817141771, "epoch": 1.3665987780040734, "grad_norm": 1.875, "learning_rate": 1.1927536921896032e-05, "loss": 0.2481, "mean_token_accuracy": 0.9220583327114582, "num_tokens": 11150400.0, "step": 1510 }, { "entropy": 0.24495826996862888, "epoch": 1.3756505996831863, "grad_norm": 2.328125, "learning_rate": 1.1831562164098832e-05, "loss": 0.2561, "mean_token_accuracy": 0.9218058377504349, "num_tokens": 11227357.0, "step": 1520 }, { "entropy": 0.22247098237276078, "epoch": 1.3847024213622992, "grad_norm": 1.65625, "learning_rate": 1.1735412519917514e-05, "loss": 0.2199, "mean_token_accuracy": 0.9295127160847187, "num_tokens": 11302017.0, "step": 1530 }, { "entropy": 0.25086742732673883, "epoch": 1.393754243041412, "grad_norm": 2.3125, "learning_rate": 1.1639097170183578e-05, "loss": 0.2555, "mean_token_accuracy": 0.9186649046838283, "num_tokens": 11375628.0, "step": 1540 }, { "entropy": 0.24195160605013372, "epoch": 1.402806064720525, "grad_norm": 2.34375, "learning_rate": 1.1542625311550882e-05, "loss": 0.2439, "mean_token_accuracy": 0.9234603866934776, "num_tokens": 11454478.0, "step": 1550 }, { "entropy": 0.24275779630988836, "epoch": 1.411857886399638, "grad_norm": 1.390625, "learning_rate": 1.1446006155617518e-05, "loss": 0.2388, "mean_token_accuracy": 0.9236106254160404, "num_tokens": 11529561.0, "step": 1560 }, { "entropy": 0.26370916329324245, "epoch": 1.4209097080787507, "grad_norm": 1.9609375, "learning_rate": 1.1349248928046222e-05, "loss": 0.2584, "mean_token_accuracy": 0.9196252316236496, "num_tokens": 11602833.0, "step": 1570 }, { "entropy": 0.24912301748991011, "epoch": 1.4299615297578638, "grad_norm": 1.4609375, "learning_rate": 1.1252362867683482e-05, "loss": 0.2488, "mean_token_accuracy": 0.9228829652070999, "num_tokens": 11684248.0, "step": 1580 }, { "entropy": 0.2414556547999382, "epoch": 1.4390133514369767, "grad_norm": 1.953125, "learning_rate": 1.1155357225677367e-05, "loss": 0.2412, "mean_token_accuracy": 0.923858293145895, "num_tokens": 11759520.0, "step": 1590 }, { "entropy": 0.25389058981090784, "epoch": 1.4480651731160896, "grad_norm": 2.546875, "learning_rate": 1.1058241264594169e-05, "loss": 0.2564, "mean_token_accuracy": 0.9178573161363601, "num_tokens": 11834379.0, "step": 1600 }, { "epoch": 1.4480651731160896, "eval_entropy": 0.25336209374666213, "eval_loss": 0.28672701120376587, "eval_mean_token_accuracy": 0.9094241366386414, "eval_num_tokens": 11834379.0, "eval_runtime": 9.3897, "eval_samples_per_second": 53.25, "eval_steps_per_second": 26.625, "step": 1600 }, { "entropy": 0.24825670775026082, "epoch": 1.4571169947952025, "grad_norm": 1.640625, "learning_rate": 1.0961024257533984e-05, "loss": 0.2608, "mean_token_accuracy": 0.9209218248724937, "num_tokens": 11911969.0, "step": 1610 }, { "entropy": 0.2427467254921794, "epoch": 1.4661688164743154, "grad_norm": 1.96875, "learning_rate": 1.0863715487245257e-05, "loss": 0.2358, "mean_token_accuracy": 0.9233093105256558, "num_tokens": 11988285.0, "step": 1620 }, { "entropy": 0.24526806455105543, "epoch": 1.4752206381534283, "grad_norm": 1.4609375, "learning_rate": 1.0766324245238435e-05, "loss": 0.2462, "mean_token_accuracy": 0.9207368507981301, "num_tokens": 12069373.0, "step": 1630 }, { "entropy": 0.24581417106091977, "epoch": 1.4842724598325412, "grad_norm": 1.53125, "learning_rate": 1.0668859830898764e-05, "loss": 0.2436, "mean_token_accuracy": 0.9223100118339062, "num_tokens": 12142462.0, "step": 1640 }, { "entropy": 0.23745538275688888, "epoch": 1.4933242815116543, "grad_norm": 1.5, "learning_rate": 1.0571331550598327e-05, "loss": 0.2341, "mean_token_accuracy": 0.9265984818339348, "num_tokens": 12216014.0, "step": 1650 }, { "entropy": 0.2431069084443152, "epoch": 1.502376103190767, "grad_norm": 1.5703125, "learning_rate": 1.0473748716807446e-05, "loss": 0.2389, "mean_token_accuracy": 0.9258911445736885, "num_tokens": 12288450.0, "step": 1660 }, { "entropy": 0.23318624114617706, "epoch": 1.51142792486988, "grad_norm": 1.765625, "learning_rate": 1.0376120647205475e-05, "loss": 0.2375, "mean_token_accuracy": 0.9254320353269577, "num_tokens": 12357801.0, "step": 1670 }, { "entropy": 0.24539397489279507, "epoch": 1.520479746548993, "grad_norm": 1.5078125, "learning_rate": 1.0278456663791087e-05, "loss": 0.2476, "mean_token_accuracy": 0.9217661775648593, "num_tokens": 12431559.0, "step": 1680 }, { "entropy": 0.243877131305635, "epoch": 1.5295315682281059, "grad_norm": 1.375, "learning_rate": 1.0180766091992196e-05, "loss": 0.2419, "mean_token_accuracy": 0.9216916620731354, "num_tokens": 12507021.0, "step": 1690 }, { "entropy": 0.26184606496244667, "epoch": 1.538583389907219, "grad_norm": 1.5859375, "learning_rate": 1.0083058259775496e-05, "loss": 0.277, "mean_token_accuracy": 0.9145717203617096, "num_tokens": 12580160.0, "step": 1700 }, { "entropy": 0.2375843895599246, "epoch": 1.5476352115863317, "grad_norm": 1.625, "learning_rate": 9.985342496755785e-06, "loss": 0.2382, "mean_token_accuracy": 0.9251136861741542, "num_tokens": 12656225.0, "step": 1710 }, { "entropy": 0.2448040470480919, "epoch": 1.5566870332654448, "grad_norm": 1.4453125, "learning_rate": 9.887628133305139e-06, "loss": 0.2513, "mean_token_accuracy": 0.9209414727985858, "num_tokens": 12733650.0, "step": 1720 }, { "entropy": 0.2508842507377267, "epoch": 1.5657388549445574, "grad_norm": 1.4765625, "learning_rate": 9.78992449966199e-06, "loss": 0.2574, "mean_token_accuracy": 0.9201901033520699, "num_tokens": 12807398.0, "step": 1730 }, { "entropy": 0.26807491648942233, "epoch": 1.5747906766236706, "grad_norm": 1.421875, "learning_rate": 9.69224092504024e-06, "loss": 0.2916, "mean_token_accuracy": 0.9138583980500699, "num_tokens": 12887795.0, "step": 1740 }, { "entropy": 0.24304722100496293, "epoch": 1.5838424983027835, "grad_norm": 1.515625, "learning_rate": 9.594586736738463e-06, "loss": 0.2368, "mean_token_accuracy": 0.9235754661262036, "num_tokens": 12955809.0, "step": 1750 }, { "entropy": 0.24434948712587357, "epoch": 1.5928943199818963, "grad_norm": 1.6171875, "learning_rate": 9.496971259249275e-06, "loss": 0.2446, "mean_token_accuracy": 0.9219508893787861, "num_tokens": 13029267.0, "step": 1760 }, { "entropy": 0.2473485903814435, "epoch": 1.6019461416610092, "grad_norm": 1.7109375, "learning_rate": 9.399403813369009e-06, "loss": 0.2418, "mean_token_accuracy": 0.9251487784087657, "num_tokens": 13101356.0, "step": 1770 }, { "entropy": 0.23964370582252742, "epoch": 1.6109979633401221, "grad_norm": 1.6640625, "learning_rate": 9.301893715307697e-06, "loss": 0.2413, "mean_token_accuracy": 0.9235708922147751, "num_tokens": 13175685.0, "step": 1780 }, { "entropy": 0.23657145369797944, "epoch": 1.6200497850192352, "grad_norm": 1.734375, "learning_rate": 9.204450275799533e-06, "loss": 0.2399, "mean_token_accuracy": 0.9240031912922859, "num_tokens": 13245711.0, "step": 1790 }, { "entropy": 0.2463653065264225, "epoch": 1.629101606698348, "grad_norm": 1.8046875, "learning_rate": 9.10708279921383e-06, "loss": 0.2595, "mean_token_accuracy": 0.9202878102660179, "num_tokens": 13320025.0, "step": 1800 }, { "epoch": 1.629101606698348, "eval_entropy": 0.25300506711006165, "eval_loss": 0.2863253653049469, "eval_mean_token_accuracy": 0.9098271555900573, "eval_num_tokens": 13320025.0, "eval_runtime": 9.4073, "eval_samples_per_second": 53.15, "eval_steps_per_second": 26.575, "step": 1800 }, { "entropy": 0.254715484008193, "epoch": 1.638153428377461, "grad_norm": 1.7109375, "learning_rate": 9.009800582666592e-06, "loss": 0.2788, "mean_token_accuracy": 0.9186265878379345, "num_tokens": 13401009.0, "step": 1810 }, { "entropy": 0.24843050315976142, "epoch": 1.647205250056574, "grad_norm": 1.7734375, "learning_rate": 8.912612915132781e-06, "loss": 0.2515, "mean_token_accuracy": 0.922213239222765, "num_tokens": 13475772.0, "step": 1820 }, { "entropy": 0.2353765547275543, "epoch": 1.6562570717356868, "grad_norm": 1.53125, "learning_rate": 8.815529076559373e-06, "loss": 0.2308, "mean_token_accuracy": 0.9247016876935958, "num_tokens": 13547929.0, "step": 1830 }, { "entropy": 0.2633091388270259, "epoch": 1.6653088934147997, "grad_norm": 1.8125, "learning_rate": 8.718558336979247e-06, "loss": 0.2669, "mean_token_accuracy": 0.9184009425342083, "num_tokens": 13622526.0, "step": 1840 }, { "entropy": 0.25728757921606304, "epoch": 1.6743607150939126, "grad_norm": 1.484375, "learning_rate": 8.621709955626046e-06, "loss": 0.2628, "mean_token_accuracy": 0.9189327403903007, "num_tokens": 13700063.0, "step": 1850 }, { "entropy": 0.24874310288578272, "epoch": 1.6834125367730257, "grad_norm": 1.46875, "learning_rate": 8.524993180050058e-06, "loss": 0.2633, "mean_token_accuracy": 0.9198607362806797, "num_tokens": 13776033.0, "step": 1860 }, { "entropy": 0.24581483229994774, "epoch": 1.6924643584521384, "grad_norm": 1.9453125, "learning_rate": 8.428417245235224e-06, "loss": 0.2483, "mean_token_accuracy": 0.9217715479433537, "num_tokens": 13842019.0, "step": 1870 }, { "entropy": 0.25347451251000164, "epoch": 1.7015161801312515, "grad_norm": 1.578125, "learning_rate": 8.331991372717326e-06, "loss": 0.2484, "mean_token_accuracy": 0.9214810349047184, "num_tokens": 13912940.0, "step": 1880 }, { "entropy": 0.23695338489487766, "epoch": 1.7105680018103642, "grad_norm": 1.4453125, "learning_rate": 8.235724769703466e-06, "loss": 0.2362, "mean_token_accuracy": 0.9258873045444489, "num_tokens": 13988716.0, "step": 1890 }, { "entropy": 0.2509100193157792, "epoch": 1.7196198234894773, "grad_norm": 1.7734375, "learning_rate": 8.139626628192944e-06, "loss": 0.2579, "mean_token_accuracy": 0.9192474693059921, "num_tokens": 14060707.0, "step": 1900 }, { "entropy": 0.25328262001276014, "epoch": 1.7286716451685902, "grad_norm": 2.234375, "learning_rate": 8.04370612409953e-06, "loss": 0.2518, "mean_token_accuracy": 0.9190532967448235, "num_tokens": 14141352.0, "step": 1910 }, { "entropy": 0.2612711830995977, "epoch": 1.737723466847703, "grad_norm": 1.828125, "learning_rate": 7.947972416375316e-06, "loss": 0.2639, "mean_token_accuracy": 0.9177913695573807, "num_tokens": 14220408.0, "step": 1920 }, { "entropy": 0.24010583832859994, "epoch": 1.7467752885268162, "grad_norm": 1.9609375, "learning_rate": 7.852434646136191e-06, "loss": 0.2445, "mean_token_accuracy": 0.9230381302535534, "num_tokens": 14290387.0, "step": 1930 }, { "entropy": 0.2533594885841012, "epoch": 1.7558271102059289, "grad_norm": 1.703125, "learning_rate": 7.757101935788973e-06, "loss": 0.2524, "mean_token_accuracy": 0.9200525276362896, "num_tokens": 14362019.0, "step": 1940 }, { "entropy": 0.2543038886040449, "epoch": 1.764878931885042, "grad_norm": 1.7578125, "learning_rate": 7.661983388160374e-06, "loss": 0.2682, "mean_token_accuracy": 0.9186428181827069, "num_tokens": 14434869.0, "step": 1950 }, { "entropy": 0.25175454616546633, "epoch": 1.7739307535641546, "grad_norm": 1.8125, "learning_rate": 7.567088085627834e-06, "loss": 0.24, "mean_token_accuracy": 0.9222409397363662, "num_tokens": 14514468.0, "step": 1960 }, { "entropy": 0.24533636067062617, "epoch": 1.7829825752432678, "grad_norm": 1.515625, "learning_rate": 7.4724250892522545e-06, "loss": 0.2563, "mean_token_accuracy": 0.9214011885225772, "num_tokens": 14582295.0, "step": 1970 }, { "entropy": 0.24225753750652074, "epoch": 1.7920343969223806, "grad_norm": 1.390625, "learning_rate": 7.3780034379128305e-06, "loss": 0.2397, "mean_token_accuracy": 0.9227394610643387, "num_tokens": 14658863.0, "step": 1980 }, { "entropy": 0.26012470331043, "epoch": 1.8010862186014935, "grad_norm": 2.015625, "learning_rate": 7.283832147443985e-06, "loss": 0.2671, "mean_token_accuracy": 0.9166965551674366, "num_tokens": 14734663.0, "step": 1990 }, { "entropy": 0.2509179048240185, "epoch": 1.8101380402806064, "grad_norm": 1.5546875, "learning_rate": 7.1899202097744595e-06, "loss": 0.2546, "mean_token_accuracy": 0.9195629328489303, "num_tokens": 14807350.0, "step": 2000 }, { "epoch": 1.8101380402806064, "eval_entropy": 0.2543384581208229, "eval_loss": 0.2856670618057251, "eval_mean_token_accuracy": 0.9101451654434204, "eval_num_tokens": 14807350.0, "eval_runtime": 9.3922, "eval_samples_per_second": 53.236, "eval_steps_per_second": 26.618, "step": 2000 }, { "entropy": 0.24845433719456195, "epoch": 1.8191898619597193, "grad_norm": 1.53125, "learning_rate": 7.0962765920687434e-06, "loss": 0.2626, "mean_token_accuracy": 0.9198015905916691, "num_tokens": 14881559.0, "step": 2010 }, { "entropy": 0.23361015133559704, "epoch": 1.8282416836388324, "grad_norm": 1.6015625, "learning_rate": 7.002910235870851e-06, "loss": 0.2342, "mean_token_accuracy": 0.9258812077343463, "num_tokens": 14951806.0, "step": 2020 }, { "entropy": 0.2408413586206734, "epoch": 1.837293505317945, "grad_norm": 1.328125, "learning_rate": 6.909830056250527e-06, "loss": 0.2379, "mean_token_accuracy": 0.9247417151927948, "num_tokens": 15022622.0, "step": 2030 }, { "entropy": 0.2619917577132583, "epoch": 1.8463453269970582, "grad_norm": 1.609375, "learning_rate": 6.817044940951992e-06, "loss": 0.2762, "mean_token_accuracy": 0.9160731554031372, "num_tokens": 15095864.0, "step": 2040 }, { "entropy": 0.2494908979162574, "epoch": 1.8553971486761711, "grad_norm": 1.515625, "learning_rate": 6.7245637495453135e-06, "loss": 0.2413, "mean_token_accuracy": 0.9239408574998379, "num_tokens": 15166717.0, "step": 2050 }, { "entropy": 0.2488134613260627, "epoch": 1.864448970355284, "grad_norm": 2.203125, "learning_rate": 6.632395312580428e-06, "loss": 0.2513, "mean_token_accuracy": 0.9206522315740585, "num_tokens": 15240239.0, "step": 2060 }, { "entropy": 0.255818460509181, "epoch": 1.873500792034397, "grad_norm": 1.484375, "learning_rate": 6.540548430743981e-06, "loss": 0.2857, "mean_token_accuracy": 0.9164819419384003, "num_tokens": 15323647.0, "step": 2070 }, { "entropy": 0.23680712506175042, "epoch": 1.8825526137135098, "grad_norm": 1.7421875, "learning_rate": 6.449031874018978e-06, "loss": 0.2374, "mean_token_accuracy": 0.926248911768198, "num_tokens": 15395170.0, "step": 2080 }, { "entropy": 0.2399167947471142, "epoch": 1.891604435392623, "grad_norm": 1.75, "learning_rate": 6.357854380847397e-06, "loss": 0.2409, "mean_token_accuracy": 0.922540470957756, "num_tokens": 15466907.0, "step": 2090 }, { "entropy": 0.25149166863411665, "epoch": 1.9006562570717356, "grad_norm": 1.953125, "learning_rate": 6.267024657295784e-06, "loss": 0.2517, "mean_token_accuracy": 0.9214697033166885, "num_tokens": 15544889.0, "step": 2100 }, { "entropy": 0.24887168370187282, "epoch": 1.9097080787508487, "grad_norm": 1.5546875, "learning_rate": 6.176551376223972e-06, "loss": 0.2539, "mean_token_accuracy": 0.9207740597426891, "num_tokens": 15619482.0, "step": 2110 }, { "entropy": 0.256895999237895, "epoch": 1.9187599004299616, "grad_norm": 1.515625, "learning_rate": 6.086443176456951e-06, "loss": 0.2654, "mean_token_accuracy": 0.9176653556525707, "num_tokens": 15693593.0, "step": 2120 }, { "entropy": 0.24922299664467573, "epoch": 1.9278117221090745, "grad_norm": 1.875, "learning_rate": 5.996708661959979e-06, "loss": 0.2437, "mean_token_accuracy": 0.9216303117573261, "num_tokens": 15767913.0, "step": 2130 }, { "entropy": 0.22873957753181456, "epoch": 1.9368635437881874, "grad_norm": 1.5078125, "learning_rate": 5.907356401017046e-06, "loss": 0.2189, "mean_token_accuracy": 0.9284776009619236, "num_tokens": 15841332.0, "step": 2140 }, { "entropy": 0.2649971529841423, "epoch": 1.9459153654673003, "grad_norm": 1.5078125, "learning_rate": 5.818394925412738e-06, "loss": 0.2855, "mean_token_accuracy": 0.9157008796930313, "num_tokens": 15914905.0, "step": 2150 }, { "entropy": 0.24455339908599855, "epoch": 1.9549671871464134, "grad_norm": 1.6484375, "learning_rate": 5.729832729617567e-06, "loss": 0.2443, "mean_token_accuracy": 0.9228208027780056, "num_tokens": 15990500.0, "step": 2160 }, { "entropy": 0.24984920993447304, "epoch": 1.964019008825526, "grad_norm": 2.390625, "learning_rate": 5.641678269976879e-06, "loss": 0.2478, "mean_token_accuracy": 0.9218893505632877, "num_tokens": 16062684.0, "step": 2170 }, { "entropy": 0.24794082455337046, "epoch": 1.9730708305046392, "grad_norm": 1.546875, "learning_rate": 5.5539399639034145e-06, "loss": 0.2507, "mean_token_accuracy": 0.9215313449501992, "num_tokens": 16133340.0, "step": 2180 }, { "entropy": 0.24754474610090255, "epoch": 1.9821226521837518, "grad_norm": 1.890625, "learning_rate": 5.466626189073563e-06, "loss": 0.2466, "mean_token_accuracy": 0.9224697306752205, "num_tokens": 16207210.0, "step": 2190 }, { "entropy": 0.2443618050776422, "epoch": 1.991174473862865, "grad_norm": 1.5234375, "learning_rate": 5.3797452826274245e-06, "loss": 0.2462, "mean_token_accuracy": 0.921653438359499, "num_tokens": 16282613.0, "step": 2200 }, { "epoch": 1.991174473862865, "eval_entropy": 0.25493055218458177, "eval_loss": 0.28528815507888794, "eval_mean_token_accuracy": 0.9100957527160645, "eval_num_tokens": 16282613.0, "eval_runtime": 9.3856, "eval_samples_per_second": 53.273, "eval_steps_per_second": 26.636, "step": 2200 }, { "entropy": 0.2419594947535258, "epoch": 2.0, "grad_norm": 1.640625, "learning_rate": 5.293305540372744e-06, "loss": 0.2497, "mean_token_accuracy": 0.9222977512922043, "num_tokens": 16353095.0, "step": 2210 }, { "entropy": 0.2307557038962841, "epoch": 2.009051821679113, "grad_norm": 1.4296875, "learning_rate": 5.2073152159927674e-06, "loss": 0.2129, "mean_token_accuracy": 0.9317504949867725, "num_tokens": 16430491.0, "step": 2220 }, { "entropy": 0.2438331985846162, "epoch": 2.018103643358226, "grad_norm": 1.3125, "learning_rate": 5.121782520258171e-06, "loss": 0.249, "mean_token_accuracy": 0.9242358595132828, "num_tokens": 16508058.0, "step": 2230 }, { "entropy": 0.21921782195568085, "epoch": 2.027155465037339, "grad_norm": 1.328125, "learning_rate": 5.036715620243039e-06, "loss": 0.2053, "mean_token_accuracy": 0.9350384041666985, "num_tokens": 16575440.0, "step": 2240 }, { "entropy": 0.23980374177917838, "epoch": 2.0362072867164516, "grad_norm": 1.4609375, "learning_rate": 4.952122638545035e-06, "loss": 0.2313, "mean_token_accuracy": 0.927023620903492, "num_tokens": 16652627.0, "step": 2250 }, { "entropy": 0.22007083408534528, "epoch": 2.0452591083955647, "grad_norm": 1.3671875, "learning_rate": 4.8680116525098056e-06, "loss": 0.2198, "mean_token_accuracy": 0.9333253562450409, "num_tokens": 16718816.0, "step": 2260 }, { "entropy": 0.24282212648540735, "epoch": 2.0543109300746774, "grad_norm": 1.5, "learning_rate": 4.784390693459753e-06, "loss": 0.2399, "mean_token_accuracy": 0.9239346958696842, "num_tokens": 16789759.0, "step": 2270 }, { "entropy": 0.23785702120512725, "epoch": 2.0633627517537905, "grad_norm": 1.453125, "learning_rate": 4.701267745927113e-06, "loss": 0.2506, "mean_token_accuracy": 0.9241972677409649, "num_tokens": 16867039.0, "step": 2280 }, { "entropy": 0.24605063777416944, "epoch": 2.0724145734329036, "grad_norm": 1.46875, "learning_rate": 4.618650746891599e-06, "loss": 0.2507, "mean_token_accuracy": 0.9215324610471726, "num_tokens": 16941361.0, "step": 2290 }, { "entropy": 0.22479961309581994, "epoch": 2.0814663951120163, "grad_norm": 1.359375, "learning_rate": 4.536547585022518e-06, "loss": 0.2206, "mean_token_accuracy": 0.9283983618021011, "num_tokens": 17012141.0, "step": 2300 }, { "entropy": 0.24391699638217687, "epoch": 2.0905182167911294, "grad_norm": 2.15625, "learning_rate": 4.454966099925531e-06, "loss": 0.25, "mean_token_accuracy": 0.9209645003080368, "num_tokens": 17087024.0, "step": 2310 }, { "entropy": 0.22605140786617994, "epoch": 2.099570038470242, "grad_norm": 1.6328125, "learning_rate": 4.3739140813940765e-06, "loss": 0.2181, "mean_token_accuracy": 0.9297716915607452, "num_tokens": 17158091.0, "step": 2320 }, { "entropy": 0.23818891448900104, "epoch": 2.108621860149355, "grad_norm": 1.5078125, "learning_rate": 4.293399268665581e-06, "loss": 0.2398, "mean_token_accuracy": 0.9242137163877487, "num_tokens": 17229953.0, "step": 2330 }, { "entropy": 0.2529011068865657, "epoch": 2.117673681828468, "grad_norm": 1.4140625, "learning_rate": 4.21342934968247e-06, "loss": 0.2577, "mean_token_accuracy": 0.9201614983379841, "num_tokens": 17301299.0, "step": 2340 }, { "entropy": 0.2255427474156022, "epoch": 2.126725503507581, "grad_norm": 1.7109375, "learning_rate": 4.134011960358094e-06, "loss": 0.2255, "mean_token_accuracy": 0.9295220628380776, "num_tokens": 17377435.0, "step": 2350 }, { "entropy": 0.22926049511879681, "epoch": 2.1357773251866936, "grad_norm": 2.09375, "learning_rate": 4.055154683847588e-06, "loss": 0.2312, "mean_token_accuracy": 0.9286687098443508, "num_tokens": 17449350.0, "step": 2360 }, { "entropy": 0.22778470665216446, "epoch": 2.1448291468658067, "grad_norm": 1.4765625, "learning_rate": 3.976865049823845e-06, "loss": 0.2281, "mean_token_accuracy": 0.9293324284255504, "num_tokens": 17525880.0, "step": 2370 }, { "entropy": 0.23943039821460843, "epoch": 2.15388096854492, "grad_norm": 1.390625, "learning_rate": 3.899150533758489e-06, "loss": 0.2509, "mean_token_accuracy": 0.9246469952166081, "num_tokens": 17602733.0, "step": 2380 }, { "entropy": 0.23977632280439137, "epoch": 2.1629327902240325, "grad_norm": 1.3515625, "learning_rate": 3.822018556208128e-06, "loss": 0.2399, "mean_token_accuracy": 0.9248318992555141, "num_tokens": 17676158.0, "step": 2390 }, { "entropy": 0.2481432169675827, "epoch": 2.1719846119031456, "grad_norm": 1.7890625, "learning_rate": 3.7454764821057754e-06, "loss": 0.2396, "mean_token_accuracy": 0.9235868014395237, "num_tokens": 17750465.0, "step": 2400 }, { "epoch": 2.1719846119031456, "eval_entropy": 0.24248164582252502, "eval_loss": 0.2894599735736847, "eval_mean_token_accuracy": 0.9097159428596496, "eval_num_tokens": 17750465.0, "eval_runtime": 9.4339, "eval_samples_per_second": 53.0, "eval_steps_per_second": 26.5, "step": 2400 }, { "entropy": 0.23743325080722572, "epoch": 2.1810364335822583, "grad_norm": 1.484375, "learning_rate": 3.669531620057628e-06, "loss": 0.2311, "mean_token_accuracy": 0.9258677743375301, "num_tokens": 17824288.0, "step": 2410 }, { "entropy": 0.23670416846871375, "epoch": 2.1900882552613714, "grad_norm": 1.4609375, "learning_rate": 3.5941912216451812e-06, "loss": 0.2354, "mean_token_accuracy": 0.9266407683491706, "num_tokens": 17902584.0, "step": 2420 }, { "entropy": 0.235261020809412, "epoch": 2.199140076940484, "grad_norm": 1.515625, "learning_rate": 3.5194624807328514e-06, "loss": 0.2381, "mean_token_accuracy": 0.9261479564011097, "num_tokens": 17982974.0, "step": 2430 }, { "entropy": 0.23788492735475303, "epoch": 2.208191898619597, "grad_norm": 1.4296875, "learning_rate": 3.4453525327810277e-06, "loss": 0.2399, "mean_token_accuracy": 0.9252388395369053, "num_tokens": 18052254.0, "step": 2440 }, { "entropy": 0.21874225055798888, "epoch": 2.2172437202987103, "grad_norm": 1.578125, "learning_rate": 3.371868454164775e-06, "loss": 0.2098, "mean_token_accuracy": 0.9329733081161976, "num_tokens": 18120065.0, "step": 2450 }, { "entropy": 0.24790667220950127, "epoch": 2.226295541977823, "grad_norm": 1.890625, "learning_rate": 3.299017261498136e-06, "loss": 0.2539, "mean_token_accuracy": 0.9227528609335423, "num_tokens": 18193562.0, "step": 2460 }, { "entropy": 0.24773352714255453, "epoch": 2.235347363656936, "grad_norm": 1.5625, "learning_rate": 3.226805910964156e-06, "loss": 0.2573, "mean_token_accuracy": 0.9219558417797089, "num_tokens": 18267919.0, "step": 2470 }, { "entropy": 0.22798624727874994, "epoch": 2.2443991853360488, "grad_norm": 1.375, "learning_rate": 3.1552412976506565e-06, "loss": 0.2228, "mean_token_accuracy": 0.9276571467518806, "num_tokens": 18342460.0, "step": 2480 }, { "entropy": 0.21947569595649838, "epoch": 2.253451007015162, "grad_norm": 1.5234375, "learning_rate": 3.084330254891883e-06, "loss": 0.2231, "mean_token_accuracy": 0.9298080869019032, "num_tokens": 18412694.0, "step": 2490 }, { "entropy": 0.2307950984686613, "epoch": 2.2625028286942745, "grad_norm": 1.6796875, "learning_rate": 3.0140795536160127e-06, "loss": 0.227, "mean_token_accuracy": 0.9273362122476101, "num_tokens": 18482616.0, "step": 2500 }, { "entropy": 0.23491790611296892, "epoch": 2.2715546503733877, "grad_norm": 1.4296875, "learning_rate": 2.944495901698631e-06, "loss": 0.2394, "mean_token_accuracy": 0.9233520910143852, "num_tokens": 18554093.0, "step": 2510 }, { "entropy": 0.21908750645816327, "epoch": 2.2806064720525008, "grad_norm": 1.2109375, "learning_rate": 2.8755859433222422e-06, "loss": 0.2128, "mean_token_accuracy": 0.9334690175950527, "num_tokens": 18628041.0, "step": 2520 }, { "entropy": 0.24107547104358673, "epoch": 2.2896582937316134, "grad_norm": 1.6015625, "learning_rate": 2.8073562583418336e-06, "loss": 0.2431, "mean_token_accuracy": 0.9256007336080074, "num_tokens": 18703682.0, "step": 2530 }, { "entropy": 0.23574529979377984, "epoch": 2.2987101154107266, "grad_norm": 1.71875, "learning_rate": 2.739813361656616e-06, "loss": 0.2287, "mean_token_accuracy": 0.9242596134543419, "num_tokens": 18779322.0, "step": 2540 }, { "entropy": 0.2339877954684198, "epoch": 2.3077619370898392, "grad_norm": 1.7265625, "learning_rate": 2.672963702587943e-06, "loss": 0.2377, "mean_token_accuracy": 0.9261183701455593, "num_tokens": 18858427.0, "step": 2550 }, { "entropy": 0.23651442099362613, "epoch": 2.3168137587689523, "grad_norm": 1.9609375, "learning_rate": 2.6068136642635024e-06, "loss": 0.2379, "mean_token_accuracy": 0.9261867627501488, "num_tokens": 18932723.0, "step": 2560 }, { "entropy": 0.23850015196949242, "epoch": 2.325865580448065, "grad_norm": 1.890625, "learning_rate": 2.541369563007806e-06, "loss": 0.2534, "mean_token_accuracy": 0.9260534539818763, "num_tokens": 19010914.0, "step": 2570 }, { "entropy": 0.23194469464942813, "epoch": 2.334917402127178, "grad_norm": 1.671875, "learning_rate": 2.476637647739115e-06, "loss": 0.2381, "mean_token_accuracy": 0.926636103540659, "num_tokens": 19085371.0, "step": 2580 }, { "entropy": 0.22836068961769343, "epoch": 2.3439692238062912, "grad_norm": 1.7265625, "learning_rate": 2.412624099372719e-06, "loss": 0.2321, "mean_token_accuracy": 0.9283419884741306, "num_tokens": 19151951.0, "step": 2590 }, { "entropy": 0.2302293201908469, "epoch": 2.353021045485404, "grad_norm": 1.421875, "learning_rate": 2.349335030230785e-06, "loss": 0.2302, "mean_token_accuracy": 0.9274107903242111, "num_tokens": 19227125.0, "step": 2600 }, { "epoch": 2.353021045485404, "eval_entropy": 0.2420771769285202, "eval_loss": 0.2899412512779236, "eval_mean_token_accuracy": 0.9096628496646881, "eval_num_tokens": 19227125.0, "eval_runtime": 9.3687, "eval_samples_per_second": 53.369, "eval_steps_per_second": 26.685, "step": 2600 }, { "entropy": 0.234050558693707, "epoch": 2.362072867164517, "grad_norm": 1.5703125, "learning_rate": 2.2867764834587003e-06, "loss": 0.2332, "mean_token_accuracy": 0.9264360308647156, "num_tokens": 19301670.0, "step": 2610 }, { "entropy": 0.23359497357159853, "epoch": 2.3711246888436297, "grad_norm": 1.6171875, "learning_rate": 2.224954432448071e-06, "loss": 0.2359, "mean_token_accuracy": 0.9275781489908695, "num_tokens": 19373066.0, "step": 2620 }, { "entropy": 0.24756205026060343, "epoch": 2.380176510522743, "grad_norm": 1.6015625, "learning_rate": 2.163874780266323e-06, "loss": 0.259, "mean_token_accuracy": 0.9211221620440483, "num_tokens": 19453654.0, "step": 2630 }, { "entropy": 0.2272123709321022, "epoch": 2.3892283322018555, "grad_norm": 2.0625, "learning_rate": 2.103543359093071e-06, "loss": 0.2223, "mean_token_accuracy": 0.9283259101212025, "num_tokens": 19527264.0, "step": 2640 }, { "entropy": 0.22003255859017373, "epoch": 2.3982801538809686, "grad_norm": 1.375, "learning_rate": 2.043965929663224e-06, "loss": 0.2235, "mean_token_accuracy": 0.9287537440657616, "num_tokens": 19604492.0, "step": 2650 }, { "entropy": 0.22481423607096077, "epoch": 2.4073319755600817, "grad_norm": 1.6796875, "learning_rate": 1.985148180716928e-06, "loss": 0.2314, "mean_token_accuracy": 0.9280698530375957, "num_tokens": 19679416.0, "step": 2660 }, { "entropy": 0.22615873701870443, "epoch": 2.4163837972391944, "grad_norm": 1.390625, "learning_rate": 1.927095728456364e-06, "loss": 0.2222, "mean_token_accuracy": 0.9290170624852181, "num_tokens": 19753645.0, "step": 2670 }, { "entropy": 0.2225392703898251, "epoch": 2.4254356189183075, "grad_norm": 1.75, "learning_rate": 1.8698141160095162e-06, "loss": 0.2241, "mean_token_accuracy": 0.9286721229553223, "num_tokens": 19828790.0, "step": 2680 }, { "entropy": 0.2204814150929451, "epoch": 2.43448744059742, "grad_norm": 1.3515625, "learning_rate": 1.8133088129008459e-06, "loss": 0.2182, "mean_token_accuracy": 0.9296285167336464, "num_tokens": 19896900.0, "step": 2690 }, { "entropy": 0.22337221689522266, "epoch": 2.4435392622765333, "grad_norm": 1.5703125, "learning_rate": 1.7575852145290717e-06, "loss": 0.2232, "mean_token_accuracy": 0.928014337271452, "num_tokens": 19968242.0, "step": 2700 }, { "entropy": 0.2179032789543271, "epoch": 2.452591083955646, "grad_norm": 1.5234375, "learning_rate": 1.7026486416519682e-06, "loss": 0.211, "mean_token_accuracy": 0.9319535449147225, "num_tokens": 20041305.0, "step": 2710 }, { "entropy": 0.2261628670617938, "epoch": 2.461642905634759, "grad_norm": 1.5390625, "learning_rate": 1.6485043398783295e-06, "loss": 0.235, "mean_token_accuracy": 0.9279113605618476, "num_tokens": 20116751.0, "step": 2720 }, { "entropy": 0.24026810871437193, "epoch": 2.4706947273138717, "grad_norm": 1.4921875, "learning_rate": 1.5951574791670754e-06, "loss": 0.2505, "mean_token_accuracy": 0.9243397124111652, "num_tokens": 20193209.0, "step": 2730 }, { "entropy": 0.22616808880120515, "epoch": 2.479746548992985, "grad_norm": 1.4375, "learning_rate": 1.5426131533336164e-06, "loss": 0.2227, "mean_token_accuracy": 0.9292748935520649, "num_tokens": 20267532.0, "step": 2740 }, { "entropy": 0.22781651541590692, "epoch": 2.4887983706720975, "grad_norm": 1.4140625, "learning_rate": 1.490876379563464e-06, "loss": 0.2277, "mean_token_accuracy": 0.928032499551773, "num_tokens": 20341347.0, "step": 2750 }, { "entropy": 0.22965652998536826, "epoch": 2.4978501923512106, "grad_norm": 1.3203125, "learning_rate": 1.4399520979331639e-06, "loss": 0.217, "mean_token_accuracy": 0.9308909751474858, "num_tokens": 20418634.0, "step": 2760 }, { "entropy": 0.23352561388164758, "epoch": 2.5069020140303238, "grad_norm": 1.6484375, "learning_rate": 1.3898451709385995e-06, "loss": 0.2371, "mean_token_accuracy": 0.9257538385689259, "num_tokens": 20492992.0, "step": 2770 }, { "entropy": 0.22655329555273057, "epoch": 2.5159538357094364, "grad_norm": 1.546875, "learning_rate": 1.3405603830306868e-06, "loss": 0.228, "mean_token_accuracy": 0.9283955104649066, "num_tokens": 20567899.0, "step": 2780 }, { "entropy": 0.23698492981493474, "epoch": 2.5250056573885495, "grad_norm": 1.2890625, "learning_rate": 1.2921024401585436e-06, "loss": 0.2534, "mean_token_accuracy": 0.9233708687126636, "num_tokens": 20643853.0, "step": 2790 }, { "entropy": 0.23318157717585564, "epoch": 2.534057479067662, "grad_norm": 1.3515625, "learning_rate": 1.2444759693201391e-06, "loss": 0.2374, "mean_token_accuracy": 0.9264273457229137, "num_tokens": 20719127.0, "step": 2800 }, { "epoch": 2.534057479067662, "eval_entropy": 0.2420763995051384, "eval_loss": 0.29005327820777893, "eval_mean_token_accuracy": 0.9095665421485901, "eval_num_tokens": 20719127.0, "eval_runtime": 9.3994, "eval_samples_per_second": 53.195, "eval_steps_per_second": 26.598, "step": 2800 }, { "entropy": 0.24478118922561407, "epoch": 2.5431093007467753, "grad_norm": 1.5703125, "learning_rate": 1.197685518120485e-06, "loss": 0.2534, "mean_token_accuracy": 0.9210445381700992, "num_tokens": 20791473.0, "step": 2810 }, { "entropy": 0.23202836168929936, "epoch": 2.552161122425888, "grad_norm": 1.3828125, "learning_rate": 1.1517355543373988e-06, "loss": 0.2284, "mean_token_accuracy": 0.9267525814473629, "num_tokens": 20867883.0, "step": 2820 }, { "entropy": 0.22680169045925141, "epoch": 2.561212944105001, "grad_norm": 1.4375, "learning_rate": 1.1066304654949245e-06, "loss": 0.2154, "mean_token_accuracy": 0.9293528974056244, "num_tokens": 20939137.0, "step": 2830 }, { "entropy": 0.23053370881825686, "epoch": 2.5702647657841142, "grad_norm": 1.6328125, "learning_rate": 1.062374558444358e-06, "loss": 0.2331, "mean_token_accuracy": 0.9270932763814926, "num_tokens": 21019630.0, "step": 2840 }, { "entropy": 0.23330926056951284, "epoch": 2.579316587463227, "grad_norm": 1.421875, "learning_rate": 1.0189720589530372e-06, "loss": 0.2369, "mean_token_accuracy": 0.9256651438772678, "num_tokens": 21097041.0, "step": 2850 }, { "entropy": 0.22963873716071248, "epoch": 2.58836840914234, "grad_norm": 1.4296875, "learning_rate": 9.764271113008183e-07, "loss": 0.2222, "mean_token_accuracy": 0.9277059838175774, "num_tokens": 21174887.0, "step": 2860 }, { "entropy": 0.23152102306485176, "epoch": 2.5974202308214527, "grad_norm": 1.6484375, "learning_rate": 9.347437778843938e-07, "loss": 0.2234, "mean_token_accuracy": 0.9289267487823963, "num_tokens": 21251344.0, "step": 2870 }, { "entropy": 0.24004797209054232, "epoch": 2.606472052500566, "grad_norm": 1.578125, "learning_rate": 8.939260388293569e-07, "loss": 0.2478, "mean_token_accuracy": 0.9238497324287891, "num_tokens": 21324495.0, "step": 2880 }, { "entropy": 0.22599442386999727, "epoch": 2.6155238741796785, "grad_norm": 1.515625, "learning_rate": 8.539777916101888e-07, "loss": 0.2228, "mean_token_accuracy": 0.9293952472507954, "num_tokens": 21398424.0, "step": 2890 }, { "entropy": 0.23568667601794005, "epoch": 2.6245756958587916, "grad_norm": 1.34375, "learning_rate": 8.149028506780964e-07, "loss": 0.2372, "mean_token_accuracy": 0.9261699497699738, "num_tokens": 21471703.0, "step": 2900 }, { "entropy": 0.23762993402779103, "epoch": 2.6336275175379047, "grad_norm": 1.5546875, "learning_rate": 7.767049470967946e-07, "loss": 0.2344, "mean_token_accuracy": 0.9264717750251293, "num_tokens": 21547014.0, "step": 2910 }, { "entropy": 0.2214735448360443, "epoch": 2.6426793392170174, "grad_norm": 1.53125, "learning_rate": 7.393877281862394e-07, "loss": 0.2169, "mean_token_accuracy": 0.9300684794783592, "num_tokens": 21620989.0, "step": 2920 }, { "entropy": 0.22688788436353208, "epoch": 2.6517311608961305, "grad_norm": 1.5625, "learning_rate": 7.029547571743778e-07, "loss": 0.2239, "mean_token_accuracy": 0.929075525701046, "num_tokens": 21696245.0, "step": 2930 }, { "entropy": 0.23520100452005863, "epoch": 2.660782982575243, "grad_norm": 1.4921875, "learning_rate": 6.674095128568958e-07, "loss": 0.2391, "mean_token_accuracy": 0.9241717301309109, "num_tokens": 21780059.0, "step": 2940 }, { "entropy": 0.2333427995443344, "epoch": 2.6698348042543563, "grad_norm": 1.625, "learning_rate": 6.327553892650606e-07, "loss": 0.2408, "mean_token_accuracy": 0.9252305686473846, "num_tokens": 21851491.0, "step": 2950 }, { "entropy": 0.24906855598092079, "epoch": 2.678886625933469, "grad_norm": 1.828125, "learning_rate": 5.989956953416376e-07, "loss": 0.2423, "mean_token_accuracy": 0.9250664070248604, "num_tokens": 21927668.0, "step": 2960 }, { "entropy": 0.22028352571651338, "epoch": 2.687938447612582, "grad_norm": 1.8515625, "learning_rate": 5.661336546249352e-07, "loss": 0.2192, "mean_token_accuracy": 0.929874736070633, "num_tokens": 21996776.0, "step": 2970 }, { "entropy": 0.2297042902559042, "epoch": 2.696990269291695, "grad_norm": 1.5703125, "learning_rate": 5.341724049410024e-07, "loss": 0.2327, "mean_token_accuracy": 0.9275284387171269, "num_tokens": 22071876.0, "step": 2980 }, { "entropy": 0.2260682400316, "epoch": 2.706042090970808, "grad_norm": 1.453125, "learning_rate": 5.031149981040262e-07, "loss": 0.2328, "mean_token_accuracy": 0.9273493871092796, "num_tokens": 22143958.0, "step": 2990 }, { "entropy": 0.2300025401636958, "epoch": 2.715093912649921, "grad_norm": 1.859375, "learning_rate": 4.729643996249156e-07, "loss": 0.2222, "mean_token_accuracy": 0.9290709294378757, "num_tokens": 22219617.0, "step": 3000 }, { "epoch": 2.715093912649921, "eval_entropy": 0.24206615540385246, "eval_loss": 0.2901514768600464, "eval_mean_token_accuracy": 0.9095830063819885, "eval_num_tokens": 22219617.0, "eval_runtime": 9.4159, "eval_samples_per_second": 53.102, "eval_steps_per_second": 26.551, "step": 3000 }, { "entropy": 0.23495229706168175, "epoch": 2.7241457343290336, "grad_norm": 1.484375, "learning_rate": 4.4372348842814716e-07, "loss": 0.2359, "mean_token_accuracy": 0.9247433744370938, "num_tokens": 22293562.0, "step": 3010 }, { "entropy": 0.22726391404867172, "epoch": 2.7331975560081467, "grad_norm": 1.4921875, "learning_rate": 4.1539505657687495e-07, "loss": 0.2329, "mean_token_accuracy": 0.9276506796479225, "num_tokens": 22364740.0, "step": 3020 }, { "entropy": 0.23861285336315632, "epoch": 2.7422493776872594, "grad_norm": 1.6171875, "learning_rate": 3.8798180900632253e-07, "loss": 0.2505, "mean_token_accuracy": 0.9261965282261372, "num_tokens": 22438839.0, "step": 3030 }, { "entropy": 0.22645489294081927, "epoch": 2.7513011993663725, "grad_norm": 1.4921875, "learning_rate": 3.6148636326550743e-07, "loss": 0.2293, "mean_token_accuracy": 0.930086625367403, "num_tokens": 22510078.0, "step": 3040 }, { "entropy": 0.24204493686556816, "epoch": 2.7603530210454856, "grad_norm": 1.7109375, "learning_rate": 3.3591124926730557e-07, "loss": 0.243, "mean_token_accuracy": 0.9232821561396122, "num_tokens": 22579081.0, "step": 3050 }, { "entropy": 0.2360864533111453, "epoch": 2.7694048427245983, "grad_norm": 1.5390625, "learning_rate": 3.1125890904688206e-07, "loss": 0.2394, "mean_token_accuracy": 0.9256729304790496, "num_tokens": 22652498.0, "step": 3060 }, { "entropy": 0.23557743560522795, "epoch": 2.778456664403711, "grad_norm": 1.53125, "learning_rate": 2.8753169652851245e-07, "loss": 0.229, "mean_token_accuracy": 0.9265096105635167, "num_tokens": 22730659.0, "step": 3070 }, { "entropy": 0.2428856560960412, "epoch": 2.787508486082824, "grad_norm": 1.6328125, "learning_rate": 2.6473187730082004e-07, "loss": 0.2425, "mean_token_accuracy": 0.9218696370720864, "num_tokens": 22803396.0, "step": 3080 }, { "entropy": 0.2498979650437832, "epoch": 2.796560307761937, "grad_norm": 1.703125, "learning_rate": 2.42861628400447e-07, "loss": 0.2496, "mean_token_accuracy": 0.9203889586031437, "num_tokens": 22879925.0, "step": 3090 }, { "entropy": 0.2429232547059655, "epoch": 2.80561212944105, "grad_norm": 1.453125, "learning_rate": 2.2192303810418148e-07, "loss": 0.2424, "mean_token_accuracy": 0.9244124636054039, "num_tokens": 22962210.0, "step": 3100 }, { "entropy": 0.2421200337819755, "epoch": 2.814663951120163, "grad_norm": 1.734375, "learning_rate": 2.0191810572955052e-07, "loss": 0.2568, "mean_token_accuracy": 0.9226670287549495, "num_tokens": 23038980.0, "step": 3110 }, { "entropy": 0.22173038199543954, "epoch": 2.823715772799276, "grad_norm": 1.53125, "learning_rate": 1.8284874144393284e-07, "loss": 0.2194, "mean_token_accuracy": 0.930086762458086, "num_tokens": 23105957.0, "step": 3120 }, { "entropy": 0.2354667537845671, "epoch": 2.8327675944783888, "grad_norm": 1.53125, "learning_rate": 1.6471676608214581e-07, "loss": 0.2325, "mean_token_accuracy": 0.9262704968452453, "num_tokens": 23176390.0, "step": 3130 }, { "entropy": 0.23669061083346604, "epoch": 2.8418194161575014, "grad_norm": 1.6640625, "learning_rate": 1.4752391097260233e-07, "loss": 0.2389, "mean_token_accuracy": 0.9259525135159492, "num_tokens": 23253673.0, "step": 3140 }, { "entropy": 0.23215005043894052, "epoch": 2.8508712378366146, "grad_norm": 1.5546875, "learning_rate": 1.3127181777198073e-07, "loss": 0.2297, "mean_token_accuracy": 0.9277803264558315, "num_tokens": 23326885.0, "step": 3150 }, { "entropy": 0.22989463973790408, "epoch": 2.8599230595157277, "grad_norm": 1.421875, "learning_rate": 1.159620383084814e-07, "loss": 0.2177, "mean_token_accuracy": 0.9286665737628936, "num_tokens": 23394737.0, "step": 3160 }, { "entropy": 0.2309438543394208, "epoch": 2.8689748811948403, "grad_norm": 1.359375, "learning_rate": 1.0159603443364308e-07, "loss": 0.2257, "mean_token_accuracy": 0.9272095516324044, "num_tokens": 23470142.0, "step": 3170 }, { "entropy": 0.24182884357869625, "epoch": 2.8780267028739535, "grad_norm": 2.25, "learning_rate": 8.817517788276775e-08, "loss": 0.26, "mean_token_accuracy": 0.9233683370053768, "num_tokens": 23546197.0, "step": 3180 }, { "entropy": 0.23957613073289394, "epoch": 2.8870785245530666, "grad_norm": 1.734375, "learning_rate": 7.570075014392775e-08, "loss": 0.2544, "mean_token_accuracy": 0.9227040722966194, "num_tokens": 23622223.0, "step": 3190 }, { "entropy": 0.22729014521464705, "epoch": 2.8961303462321792, "grad_norm": 1.4375, "learning_rate": 6.417394233561692e-08, "loss": 0.2228, "mean_token_accuracy": 0.9275835871696472, "num_tokens": 23692447.0, "step": 3200 }, { "epoch": 2.8961303462321792, "eval_entropy": 0.24213379493355752, "eval_loss": 0.28997713327407837, "eval_mean_token_accuracy": 0.9096816182136536, "eval_num_tokens": 23692447.0, "eval_runtime": 9.4008, "eval_samples_per_second": 53.187, "eval_steps_per_second": 26.593, "step": 3200 } ], "logging_steps": 10, "max_steps": 3315, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.9455206896408986e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }