[ { "loss": 3.2371, "grad_norm": 4.275357246398926, "learning_rate": 3.870967741935484e-05, "epoch": 0.010810810810810811, "step": 10 }, { "loss": 2.0982, "grad_norm": 1.6401941776275635, "learning_rate": 8.172043010752689e-05, "epoch": 0.021621621621621623, "step": 20 }, { "loss": 1.7814, "grad_norm": 1.0990010499954224, "learning_rate": 0.00012473118279569893, "epoch": 0.032432432432432434, "step": 30 }, { "loss": 1.8768, "grad_norm": 1.2032235860824585, "learning_rate": 0.00016774193548387098, "epoch": 0.043243243243243246, "step": 40 }, { "loss": 1.7967, "grad_norm": 0.8312140107154846, "learning_rate": 0.000210752688172043, "epoch": 0.05405405405405406, "step": 50 }, { "loss": 1.7088, "grad_norm": 0.7974215149879456, "learning_rate": 0.00025376344086021504, "epoch": 0.06486486486486487, "step": 60 }, { "loss": 1.7921, "grad_norm": 0.8754441142082214, "learning_rate": 0.0002967741935483871, "epoch": 0.07567567567567568, "step": 70 }, { "loss": 1.7169, "grad_norm": 0.9818633794784546, "learning_rate": 0.00033978494623655914, "epoch": 0.08648648648648649, "step": 80 }, { "loss": 1.8469, "grad_norm": 1.0391249656677246, "learning_rate": 0.0003827956989247312, "epoch": 0.0972972972972973, "step": 90 }, { "loss": 1.763, "grad_norm": 1.2312983274459839, "learning_rate": 0.00039998849055034085, "epoch": 0.10810810810810811, "step": 100 }, { "loss": 1.9223, "grad_norm": 1.136441707611084, "learning_rate": 0.00039991815982176333, "epoch": 0.11891891891891893, "step": 110 }, { "loss": 1.9628, "grad_norm": 0.9119946360588074, "learning_rate": 0.0003997839149608889, "epoch": 0.12972972972972974, "step": 120 }, { "eval_loss": 1.777016043663025, "eval_runtime": 18.3656, "eval_samples_per_second": 42.416, "eval_steps_per_second": 10.618, "epoch": 0.13297297297297297, "step": 123 }, { "loss": 1.6845, "grad_norm": 1.4189170598983765, "learning_rate": 0.00039958579888599896, "epoch": 0.14054054054054055, "step": 130 }, { "loss": 1.8529, "grad_norm": 1.0813618898391724, "learning_rate": 0.00039932387493509636, "epoch": 0.15135135135135136, "step": 140 }, { "loss": 1.7712, "grad_norm": 1.0759323835372925, "learning_rate": 0.00039899822684565697, "epoch": 0.16216216216216217, "step": 150 }, { "loss": 1.7869, "grad_norm": 1.0583269596099854, "learning_rate": 0.00039860895872785806, "epoch": 0.17297297297297298, "step": 160 }, { "loss": 1.8945, "grad_norm": 1.1669530868530273, "learning_rate": 0.0003981561950312943, "epoch": 0.1837837837837838, "step": 170 }, { "loss": 1.8797, "grad_norm": 1.0436373949050903, "learning_rate": 0.0003976400805051915, "epoch": 0.1945945945945946, "step": 180 }, { "loss": 1.9332, "grad_norm": 0.8406238555908203, "learning_rate": 0.00039706078015212907, "epoch": 0.20540540540540542, "step": 190 }, { "loss": 1.7786, "grad_norm": 1.1354097127914429, "learning_rate": 0.0003964184791752895, "epoch": 0.21621621621621623, "step": 200 }, { "loss": 1.8089, "grad_norm": 1.4123671054840088, "learning_rate": 0.0003957133829192479, "epoch": 0.22702702702702704, "step": 210 }, { "loss": 1.7916, "grad_norm": 0.9333382248878479, "learning_rate": 0.00039494571680432364, "epoch": 0.23783783783783785, "step": 220 }, { "loss": 1.7808, "grad_norm": 1.0521595478057861, "learning_rate": 0.0003941157262545123, "epoch": 0.24864864864864866, "step": 230 }, { "loss": 1.7977, "grad_norm": 1.2558528184890747, "learning_rate": 0.00039322367661902426, "epoch": 0.2594594594594595, "step": 240 }, { "eval_loss": 1.7805718183517456, "eval_runtime": 10.1271, "eval_samples_per_second": 76.922, "eval_steps_per_second": 19.255, "epoch": 0.26594594594594595, "step": 246 }, { "loss": 1.8482, "grad_norm": 1.1523817777633667, "learning_rate": 0.00039226985308745137, "epoch": 0.2702702702702703, "step": 250 }, { "loss": 1.685, "grad_norm": 0.9244216084480286, "learning_rate": 0.00039125456059859175, "epoch": 0.2810810810810811, "step": 260 }, { "loss": 1.8774, "grad_norm": 1.1236217021942139, "learning_rate": 0.0003901781237429604, "epoch": 0.2918918918918919, "step": 270 }, { "loss": 1.9237, "grad_norm": 1.112891435623169, "learning_rate": 0.0003890408866590171, "epoch": 0.3027027027027027, "step": 280 }, { "loss": 1.7769, "grad_norm": 1.0233204364776611, "learning_rate": 0.00038784321292314485, "epoch": 0.31351351351351353, "step": 290 }, { "loss": 1.8192, "grad_norm": 1.1586676836013794, "learning_rate": 0.00038658548543341384, "epoch": 0.32432432432432434, "step": 300 }, { "loss": 1.7718, "grad_norm": 1.034834384918213, "learning_rate": 0.00038526810628716854, "epoch": 0.33513513513513515, "step": 310 }, { "loss": 1.6869, "grad_norm": 1.1128815412521362, "learning_rate": 0.0003838914966524765, "epoch": 0.34594594594594597, "step": 320 }, { "loss": 1.8987, "grad_norm": 1.048621654510498, "learning_rate": 0.00038245609663348034, "epoch": 0.3567567567567568, "step": 330 }, { "loss": 1.7818, "grad_norm": 1.3258867263793945, "learning_rate": 0.00038096236512969556, "epoch": 0.3675675675675676, "step": 340 }, { "loss": 1.7062, "grad_norm": 0.9586314558982849, "learning_rate": 0.0003794107796893002, "epoch": 0.3783783783783784, "step": 350 }, { "loss": 1.8131, "grad_norm": 1.0099109411239624, "learning_rate": 0.00037780183635646145, "epoch": 0.3891891891891892, "step": 360 }, { "eval_loss": 1.7687468528747559, "eval_runtime": 10.0798, "eval_samples_per_second": 77.283, "eval_steps_per_second": 19.346, "epoch": 0.3989189189189189, "step": 369 }, { "loss": 1.8824, "grad_norm": 1.201002597808838, "learning_rate": 0.00037613604951274986, "epoch": 0.4, "step": 370 }, { "loss": 1.8594, "grad_norm": 0.9146278500556946, "learning_rate": 0.0003744139517126908, "epoch": 0.41081081081081083, "step": 380 }, { "loss": 1.7923, "grad_norm": 1.1093569993972778, "learning_rate": 0.00037263609351350583, "epoch": 0.42162162162162165, "step": 390 }, { "loss": 1.9701, "grad_norm": 0.9460511207580566, "learning_rate": 0.0003708030432990989, "epoch": 0.43243243243243246, "step": 400 }, { "loss": 1.7968, "grad_norm": 1.1481722593307495, "learning_rate": 0.0003689153870983431, "epoch": 0.44324324324324327, "step": 410 }, { "loss": 1.7019, "grad_norm": 1.1272804737091064, "learning_rate": 0.00036697372839772634, "epoch": 0.4540540540540541, "step": 420 }, { "loss": 1.7139, "grad_norm": 0.8615907430648804, "learning_rate": 0.000364978687948416, "epoch": 0.4648648648648649, "step": 430 }, { "loss": 1.7769, "grad_norm": 1.0832351446151733, "learning_rate": 0.0003629309035678035, "epoch": 0.4756756756756757, "step": 440 }, { "loss": 1.8117, "grad_norm": 1.0243345499038696, "learning_rate": 0.00036083102993559343, "epoch": 0.4864864864864865, "step": 450 }, { "loss": 1.7035, "grad_norm": 0.9396358728408813, "learning_rate": 0.00035867973838450153, "epoch": 0.4972972972972973, "step": 460 }, { "loss": 1.9568, "grad_norm": 0.9557101130485535, "learning_rate": 0.0003564777166856282, "epoch": 0.5081081081081081, "step": 470 }, { "loss": 1.9079, "grad_norm": 1.1307172775268555, "learning_rate": 0.00035422566882857765, "epoch": 0.518918918918919, "step": 480 }, { "loss": 1.8791, "grad_norm": 1.2252289056777954, "learning_rate": 0.0003519243147963909, "epoch": 0.5297297297297298, "step": 490 }, { "eval_loss": 1.7642391920089722, "eval_runtime": 10.1201, "eval_samples_per_second": 76.976, "eval_steps_per_second": 19.269, "epoch": 0.5318918918918919, "step": 492 }, { "loss": 1.7498, "grad_norm": 0.9916505813598633, "learning_rate": 0.00034957439033536647, "epoch": 0.5405405405405406, "step": 500 }, { "loss": 1.8562, "grad_norm": 1.2275047302246094, "learning_rate": 0.0003471766467198408, "epoch": 0.5513513513513514, "step": 510 }, { "loss": 1.7812, "grad_norm": 0.9753154516220093, "learning_rate": 0.00034473185051200515, "epoch": 0.5621621621621622, "step": 520 }, { "loss": 2.0087, "grad_norm": 1.2194623947143555, "learning_rate": 0.0003422407833168343, "epoch": 0.572972972972973, "step": 530 }, { "loss": 1.8641, "grad_norm": 1.1282182931900024, "learning_rate": 0.00033970424153220637, "epoch": 0.5837837837837838, "step": 540 }, { "loss": 1.8962, "grad_norm": 1.3077672719955444, "learning_rate": 0.0003371230360942931, "epoch": 0.5945945945945946, "step": 550 }, { "loss": 1.7113, "grad_norm": 1.1093400716781616, "learning_rate": 0.0003344979922183026, "epoch": 0.6054054054054054, "step": 560 }, { "loss": 1.8013, "grad_norm": 1.0412172079086304, "learning_rate": 0.0003318299491346565, "epoch": 0.6162162162162163, "step": 570 }, { "loss": 1.8316, "grad_norm": 1.1250932216644287, "learning_rate": 0.00032911975982068706, "epoch": 0.6270270270270271, "step": 580 }, { "loss": 1.7729, "grad_norm": 0.971480131149292, "learning_rate": 0.0003263682907279387, "epoch": 0.6378378378378379, "step": 590 }, { "loss": 1.745, "grad_norm": 1.1424800157546997, "learning_rate": 0.00032357642150516265, "epoch": 0.6486486486486487, "step": 600 }, { "loss": 1.6717, "grad_norm": 1.3536049127578735, "learning_rate": 0.00032074504471709146, "epoch": 0.6594594594594595, "step": 610 }, { "eval_loss": 1.7533202171325684, "eval_runtime": 10.0831, "eval_samples_per_second": 77.258, "eval_steps_per_second": 19.339, "epoch": 0.6648648648648648, "step": 615 }, { "loss": 1.7822, "grad_norm": 0.8749492168426514, "learning_rate": 0.0003178750655590848, "epoch": 0.6702702702702703, "step": 620 }, { "loss": 1.8368, "grad_norm": 3.0736031532287598, "learning_rate": 0.00031496740156773776, "epoch": 0.6810810810810811, "step": 630 }, { "loss": 1.7322, "grad_norm": 1.288352131843567, "learning_rate": 0.00031202298232754186, "epoch": 0.6918918918918919, "step": 640 }, { "loss": 1.8685, "grad_norm": 1.0477159023284912, "learning_rate": 0.00030904274917369686, "epoch": 0.7027027027027027, "step": 650 }, { "loss": 1.7483, "grad_norm": 0.9655544757843018, "learning_rate": 0.0003060276548911634, "epoch": 0.7135135135135136, "step": 660 }, { "loss": 1.8099, "grad_norm": 1.1260396242141724, "learning_rate": 0.00030297866341005684, "epoch": 0.7243243243243244, "step": 670 }, { "loss": 1.6145, "grad_norm": 1.1371850967407227, "learning_rate": 0.0002998967494974774, "epoch": 0.7351351351351352, "step": 680 }, { "loss": 1.8311, "grad_norm": 0.9440209865570068, "learning_rate": 0.0002967828984458751, "epoch": 0.745945945945946, "step": 690 }, { "loss": 1.9393, "grad_norm": 1.3496946096420288, "learning_rate": 0.00029363810575805106, "epoch": 0.7567567567567568, "step": 700 }, { "loss": 1.9767, "grad_norm": 1.0028049945831299, "learning_rate": 0.00029046337682889315, "epoch": 0.7675675675675676, "step": 710 }, { "loss": 1.8328, "grad_norm": 1.1777056455612183, "learning_rate": 0.00028725972662395013, "epoch": 0.7783783783783784, "step": 720 }, { "loss": 1.7484, "grad_norm": 1.2826964855194092, "learning_rate": 0.00028402817935494547, "epoch": 0.7891891891891892, "step": 730 }, { "eval_loss": 1.7475706338882446, "eval_runtime": 10.0081, "eval_samples_per_second": 77.837, "eval_steps_per_second": 19.484, "epoch": 0.7978378378378378, "step": 738 }, { "loss": 1.9079, "grad_norm": 1.1097257137298584, "learning_rate": 0.00028076976815233546, "epoch": 0.8, "step": 740 }, { "loss": 1.7847, "grad_norm": 1.1187055110931396, "learning_rate": 0.00027748553473501593, "epoch": 0.8108108108108109, "step": 750 }, { "loss": 1.6747, "grad_norm": 1.182005524635315, "learning_rate": 0.00027417652907728274, "epoch": 0.8216216216216217, "step": 760 }, { "loss": 1.7653, "grad_norm": 0.9777538180351257, "learning_rate": 0.000270843809073154, "epoch": 0.8324324324324325, "step": 770 }, { "loss": 1.7749, "grad_norm": 1.1285064220428467, "learning_rate": 0.0002674884401981597, "epoch": 0.8432432432432433, "step": 780 }, { "loss": 1.7904, "grad_norm": 0.9783152937889099, "learning_rate": 0.000264111495168707, "epoch": 0.8540540540540541, "step": 790 }, { "loss": 1.6915, "grad_norm": 1.107857346534729, "learning_rate": 0.0002607140535991321, "epoch": 0.8648648648648649, "step": 800 }, { "loss": 1.7857, "grad_norm": 1.2584813833236694, "learning_rate": 0.0002572972016565451, "epoch": 0.8756756756756757, "step": 810 }, { "loss": 1.8468, "grad_norm": 1.2436493635177612, "learning_rate": 0.00025386203171358157, "epoch": 0.8864864864864865, "step": 820 }, { "loss": 1.9164, "grad_norm": 1.624140739440918, "learning_rate": 0.00025040964199916856, "epoch": 0.8972972972972973, "step": 830 }, { "loss": 1.8009, "grad_norm": 1.0699501037597656, "learning_rate": 0.0002469411362474199, "epoch": 0.9081081081081082, "step": 840 }, { "loss": 1.6318, "grad_norm": 0.9692312479019165, "learning_rate": 0.0002434576233447703, "epoch": 0.918918918918919, "step": 850 }, { "loss": 1.752, "grad_norm": 0.9754092693328857, "learning_rate": 0.000239960216975463, "epoch": 0.9297297297297298, "step": 860 }, { "eval_loss": 1.7383391857147217, "eval_runtime": 10.0527, "eval_samples_per_second": 77.491, "eval_steps_per_second": 19.398, "epoch": 0.9308108108108109, "step": 861 }, { "loss": 1.9364, "grad_norm": 1.0026895999908447, "learning_rate": 0.00023645003526550292, "epoch": 0.9405405405405406, "step": 870 }, { "loss": 1.8438, "grad_norm": 1.269220232963562, "learning_rate": 0.00023292820042519066, "epoch": 0.9513513513513514, "step": 880 }, { "loss": 1.7952, "grad_norm": 1.0278656482696533, "learning_rate": 0.00022939583839034965, "epoch": 0.9621621621621622, "step": 890 }, { "loss": 1.6568, "grad_norm": 0.9819965958595276, "learning_rate": 0.0002258540784623631, "epoch": 0.972972972972973, "step": 900 }, { "loss": 1.8287, "grad_norm": 1.1272140741348267, "learning_rate": 0.00022230405294713465, "epoch": 0.9837837837837838, "step": 910 }, { "loss": 1.7379, "grad_norm": 1.1125059127807617, "learning_rate": 0.0002187468967930883, "epoch": 0.9945945945945946, "step": 920 }, { "loss": 1.7004, "grad_norm": 1.0192606449127197, "learning_rate": 0.000215183747228324, "epoch": 1.0054054054054054, "step": 930 }, { "loss": 1.5612, "grad_norm": 0.9857641458511353, "learning_rate": 0.000211615743397044, "epoch": 1.0162162162162163, "step": 940 }, { "loss": 1.2828, "grad_norm": 1.0608668327331543, "learning_rate": 0.00020804402599536661, "epoch": 1.027027027027027, "step": 950 }, { "loss": 1.4035, "grad_norm": 1.485253930091858, "learning_rate": 0.0002044697369066443, "epoch": 1.037837837837838, "step": 960 }, { "loss": 1.5254, "grad_norm": 0.9453800320625305, "learning_rate": 0.0002008940188364015, "epoch": 1.0486486486486486, "step": 970 }, { "loss": 1.4442, "grad_norm": 1.1382359266281128, "learning_rate": 0.00019731801494701044, "epoch": 1.0594594594594595, "step": 980 }, { "eval_loss": 1.7562943696975708, "eval_runtime": 10.0133, "eval_samples_per_second": 77.797, "eval_steps_per_second": 19.474, "epoch": 1.0637837837837838, "step": 984 }, { "loss": 1.6187, "grad_norm": 1.2494144439697266, "learning_rate": 0.0001937428684922197, "epoch": 1.0702702702702702, "step": 990 }, { "loss": 1.6528, "grad_norm": 0.9464777708053589, "learning_rate": 0.00019016972245165526, "epoch": 1.0810810810810811, "step": 1000 }, { "loss": 1.4064, "grad_norm": 0.9740603566169739, "learning_rate": 0.0001865997191654074, "epoch": 1.0918918918918918, "step": 1010 }, { "loss": 1.3695, "grad_norm": 1.2424192428588867, "learning_rate": 0.00018303399996882325, "epoch": 1.1027027027027028, "step": 1020 }, { "loss": 1.4764, "grad_norm": 1.0215702056884766, "learning_rate": 0.00017947370482762005, "epoch": 1.1135135135135135, "step": 1030 }, { "loss": 1.5442, "grad_norm": 1.0910210609436035, "learning_rate": 0.00017591997197343657, "epoch": 1.1243243243243244, "step": 1040 }, { "loss": 1.5187, "grad_norm": 1.1207563877105713, "learning_rate": 0.00017237393753993875, "epoch": 1.135135135135135, "step": 1050 }, { "loss": 1.4571, "grad_norm": 1.0761910676956177, "learning_rate": 0.0001688367351995959, "epoch": 1.145945945945946, "step": 1060 }, { "loss": 1.3436, "grad_norm": 0.9719659090042114, "learning_rate": 0.00016530949580124404, "epoch": 1.1567567567567567, "step": 1070 }, { "loss": 1.5315, "grad_norm": 1.0876080989837646, "learning_rate": 0.00016179334700855189, "epoch": 1.1675675675675676, "step": 1080 }, { "loss": 1.369, "grad_norm": 1.1940348148345947, "learning_rate": 0.0001582894129395051, "epoch": 1.1783783783783783, "step": 1090 }, { "loss": 1.4257, "grad_norm": 1.1275503635406494, "learning_rate": 0.00015479881380702415, "epoch": 1.1891891891891893, "step": 1100 }, { "eval_loss": 1.763828158378601, "eval_runtime": 9.9983, "eval_samples_per_second": 77.914, "eval_steps_per_second": 19.503, "epoch": 1.1967567567567567, "step": 1107 }, { "loss": 1.3932, "grad_norm": 1.2186589241027832, "learning_rate": 0.00015132266556083018, "epoch": 1.2, "step": 1110 }, { "loss": 1.5195, "grad_norm": 1.040711760520935, "learning_rate": 0.00014786207953067492, "epoch": 1.2108108108108109, "step": 1120 }, { "loss": 1.3596, "grad_norm": 1.1564419269561768, "learning_rate": 0.00014441816207104636, "epoch": 1.2216216216216216, "step": 1130 }, { "loss": 1.539, "grad_norm": 0.9457581639289856, "learning_rate": 0.00014099201420746585, "epoch": 1.2324324324324325, "step": 1140 }, { "loss": 1.4309, "grad_norm": 1.2473818063735962, "learning_rate": 0.00013758473128448837, "epoch": 1.2432432432432432, "step": 1150 }, { "loss": 1.513, "grad_norm": 1.0576856136322021, "learning_rate": 0.0001341974026155195, "epoch": 1.2540540540540541, "step": 1160 }, { "loss": 1.5212, "grad_norm": 1.021657943725586, "learning_rate": 0.00013083111113456025, "epoch": 1.2648648648648648, "step": 1170 }, { "loss": 1.4125, "grad_norm": 1.4797037839889526, "learning_rate": 0.0001274869330499914, "epoch": 1.2756756756756757, "step": 1180 }, { "loss": 1.4741, "grad_norm": 1.4238656759262085, "learning_rate": 0.00012416593750050803, "epoch": 1.2864864864864864, "step": 1190 }, { "loss": 1.5072, "grad_norm": 1.0679641962051392, "learning_rate": 0.00012086918621331431, "epoch": 1.2972972972972974, "step": 1200 }, { "loss": 1.3807, "grad_norm": 1.5260353088378906, "learning_rate": 0.00011759773316468794, "epoch": 1.308108108108108, "step": 1210 }, { "loss": 1.4398, "grad_norm": 1.005669355392456, "learning_rate": 0.00011435262424302224, "epoch": 1.318918918918919, "step": 1220 }, { "loss": 1.4314, "grad_norm": 1.116134762763977, "learning_rate": 0.00011113489691445385, "epoch": 1.3297297297297297, "step": 1230 }, { "eval_loss": 1.7593566179275513, "eval_runtime": 9.9147, "eval_samples_per_second": 78.57, "eval_steps_per_second": 19.668, "epoch": 1.3297297297297297, "step": 1230 }, { "loss": 1.4174, "grad_norm": 1.100644826889038, "learning_rate": 0.00010794557989118352, "epoch": 1.3405405405405406, "step": 1240 }, { "loss": 1.3901, "grad_norm": 0.9467904567718506, "learning_rate": 0.00010478569280259542, "epoch": 1.3513513513513513, "step": 1250 }, { "loss": 1.5013, "grad_norm": 1.2005168199539185, "learning_rate": 0.00010165624586927987, "epoch": 1.3621621621621622, "step": 1260 }, { "loss": 1.4634, "grad_norm": 1.0398645401000977, "learning_rate": 9.855823958006427e-05, "epoch": 1.372972972972973, "step": 1270 }, { "loss": 1.4728, "grad_norm": 1.1238207817077637, "learning_rate": 9.549266437215549e-05, "epoch": 1.3837837837837839, "step": 1280 }, { "loss": 1.453, "grad_norm": 1.067688226699829, "learning_rate": 9.246050031449569e-05, "epoch": 1.3945945945945946, "step": 1290 }, { "loss": 1.432, "grad_norm": 1.1034791469573975, "learning_rate": 8.946271679443276e-05, "epoch": 1.4054054054054055, "step": 1300 }, { "loss": 1.3956, "grad_norm": 1.4038920402526855, "learning_rate": 8.650027220780555e-05, "epoch": 1.4162162162162162, "step": 1310 }, { "loss": 1.3489, "grad_norm": 1.0994772911071777, "learning_rate": 8.357411365254341e-05, "epoch": 1.427027027027027, "step": 1320 }, { "loss": 1.3385, "grad_norm": 1.1797088384628296, "learning_rate": 8.068517662587798e-05, "epoch": 1.4378378378378378, "step": 1330 }, { "loss": 1.3012, "grad_norm": 1.1310184001922607, "learning_rate": 7.783438472526257e-05, "epoch": 1.4486486486486487, "step": 1340 }, { "loss": 1.4044, "grad_norm": 1.3859984874725342, "learning_rate": 7.502264935309742e-05, "epoch": 1.4594594594594594, "step": 1350 }, { "eval_loss": 1.7528764009475708, "eval_runtime": 10.0314, "eval_samples_per_second": 77.656, "eval_steps_per_second": 19.439, "epoch": 1.4627027027027026, "step": 1353 }, { "loss": 1.5262, "grad_norm": 1.2141237258911133, "learning_rate": 7.225086942535244e-05, "epoch": 1.4702702702702704, "step": 1360 }, { "loss": 1.4401, "grad_norm": 1.1930843591690063, "learning_rate": 6.95199310841829e-05, "epoch": 1.481081081081081, "step": 1370 }, { "loss": 1.3915, "grad_norm": 1.0784533023834229, "learning_rate": 6.6830707414628e-05, "epoch": 1.491891891891892, "step": 1380 }, { "loss": 1.5117, "grad_norm": 1.2977006435394287, "learning_rate": 6.41840581654848e-05, "epoch": 1.5027027027027027, "step": 1390 }, { "loss": 1.5385, "grad_norm": 1.091192603111267, "learning_rate": 6.158082947444484e-05, "epoch": 1.5135135135135136, "step": 1400 }, { "loss": 1.2558, "grad_norm": 1.2064927816390991, "learning_rate": 5.902185359758272e-05, "epoch": 1.5243243243243243, "step": 1410 }, { "loss": 1.3401, "grad_norm": 1.18263578414917, "learning_rate": 5.6507948643282905e-05, "epoch": 1.535135135135135, "step": 1420 }, { "loss": 1.4368, "grad_norm": 1.0201761722564697, "learning_rate": 5.4039918310688995e-05, "epoch": 1.545945945945946, "step": 1430 }, { "loss": 1.5864, "grad_norm": 1.0474286079406738, "learning_rate": 5.1618551632759904e-05, "epoch": 1.5567567567567568, "step": 1440 }, { "loss": 1.3581, "grad_norm": 0.9824125170707703, "learning_rate": 4.924462272401484e-05, "epoch": 1.5675675675675675, "step": 1450 }, { "loss": 1.4015, "grad_norm": 1.190414547920227, "learning_rate": 4.6918890533048034e-05, "epoch": 1.5783783783783782, "step": 1460 }, { "loss": 1.511, "grad_norm": 1.0979052782058716, "learning_rate": 4.464209859989146e-05, "epoch": 1.5891891891891892, "step": 1470 }, { "eval_loss": 1.7506372928619385, "eval_runtime": 10.0066, "eval_samples_per_second": 77.849, "eval_steps_per_second": 19.487, "epoch": 1.5956756756756758, "step": 1476 }, { "loss": 1.419, "grad_norm": 1.2529748678207397, "learning_rate": 4.241497481830396e-05, "epoch": 1.6, "step": 1480 }, { "loss": 1.2973, "grad_norm": 1.1004537343978882, "learning_rate": 4.023823120306269e-05, "epoch": 1.6108108108108108, "step": 1490 }, { "loss": 1.4756, "grad_norm": 1.2662088871002197, "learning_rate": 3.811256366233098e-05, "epoch": 1.6216216216216215, "step": 1500 }, { "loss": 1.3959, "grad_norm": 1.118185043334961, "learning_rate": 3.603865177517516e-05, "epoch": 1.6324324324324324, "step": 1510 }, { "loss": 1.4682, "grad_norm": 0.995052695274353, "learning_rate": 3.4017158574302564e-05, "epoch": 1.6432432432432433, "step": 1520 }, { "loss": 1.4508, "grad_norm": 1.0658509731292725, "learning_rate": 3.204873033408853e-05, "epoch": 1.654054054054054, "step": 1530 }, { "loss": 1.3456, "grad_norm": 1.1724168062210083, "learning_rate": 3.013399636396195e-05, "epoch": 1.6648648648648647, "step": 1540 }, { "loss": 1.4285, "grad_norm": 0.971674919128418, "learning_rate": 2.827356880721368e-05, "epoch": 1.6756756756756757, "step": 1550 }, { "loss": 1.2316, "grad_norm": 0.939606785774231, "learning_rate": 2.6468042445293883e-05, "epoch": 1.6864864864864866, "step": 1560 }, { "loss": 1.4636, "grad_norm": 1.2107715606689453, "learning_rate": 2.4717994507659147e-05, "epoch": 1.6972972972972973, "step": 1570 }, { "loss": 1.3471, "grad_norm": 1.3718624114990234, "learning_rate": 2.3023984487231466e-05, "epoch": 1.708108108108108, "step": 1580 }, { "loss": 1.4623, "grad_norm": 1.218471646308899, "learning_rate": 2.1386553961527666e-05, "epoch": 1.718918918918919, "step": 1590 }, { "eval_loss": 1.749324917793274, "eval_runtime": 10.0439, "eval_samples_per_second": 77.56, "eval_steps_per_second": 19.415, "epoch": 1.7286486486486488, "step": 1599 }, { "loss": 1.3439, "grad_norm": 1.4828319549560547, "learning_rate": 1.9806226419516192e-05, "epoch": 1.7297297297297298, "step": 1600 }, { "loss": 1.3913, "grad_norm": 1.1505448818206787, "learning_rate": 1.828350709425677e-05, "epoch": 1.7405405405405405, "step": 1610 }, { "loss": 1.4736, "grad_norm": 1.2893474102020264, "learning_rate": 1.68188828013768e-05, "epoch": 1.7513513513513512, "step": 1620 }, { "loss": 1.3173, "grad_norm": 1.2402210235595703, "learning_rate": 1.541282178343566e-05, "epoch": 1.7621621621621621, "step": 1630 }, { "loss": 1.3873, "grad_norm": 1.0396865606307983, "learning_rate": 1.4065773560226913e-05, "epoch": 1.772972972972973, "step": 1640 }, { "loss": 1.3416, "grad_norm": 1.0871046781539917, "learning_rate": 1.277816878506597e-05, "epoch": 1.7837837837837838, "step": 1650 }, { "loss": 1.4252, "grad_norm": 1.3051209449768066, "learning_rate": 1.1550419107109722e-05, "epoch": 1.7945945945945945, "step": 1660 }, { "loss": 1.2819, "grad_norm": 1.1347006559371948, "learning_rate": 1.0382917039751783e-05, "epoch": 1.8054054054054054, "step": 1670 }, { "loss": 1.3439, "grad_norm": 1.1445626020431519, "learning_rate": 9.276035835135166e-06, "epoch": 1.8162162162162163, "step": 1680 }, { "loss": 1.3387, "grad_norm": 1.1986947059631348, "learning_rate": 8.230129364823213e-06, "epoch": 1.827027027027027, "step": 1690 }, { "loss": 1.3861, "grad_norm": 1.0802196264266968, "learning_rate": 7.245532006666178e-06, "epoch": 1.8378378378378377, "step": 1700 }, { "loss": 1.4836, "grad_norm": 1.322296142578125, "learning_rate": 6.322558537900247e-06, "epoch": 1.8486486486486486, "step": 1710 }, { "loss": 1.35, "grad_norm": 1.2359806299209595, "learning_rate": 5.46150403451271e-06, "epoch": 1.8594594594594596, "step": 1720 }, { "eval_loss": 1.746907353401184, "eval_runtime": 10.0635, "eval_samples_per_second": 77.409, "eval_steps_per_second": 19.377, "epoch": 1.8616216216216217, "step": 1722 }, { "loss": 1.5115, "grad_norm": 0.9085004329681396, "learning_rate": 4.6626437769057955e-06, "epoch": 1.8702702702702703, "step": 1730 }, { "loss": 1.3037, "grad_norm": 1.031083583831787, "learning_rate": 3.9262331618890256e-06, "epoch": 1.881081081081081, "step": 1740 }, { "loss": 1.605, "grad_norm": 1.240249752998352, "learning_rate": 3.2525076210286e-06, "epoch": 1.8918918918918919, "step": 1750 }, { "loss": 1.3975, "grad_norm": 0.9747676849365234, "learning_rate": 2.6416825453794646e-06, "epoch": 1.9027027027027028, "step": 1760 }, { "loss": 1.3456, "grad_norm": 1.1354199647903442, "learning_rate": 2.093953216624556e-06, "epoch": 1.9135135135135135, "step": 1770 }, { "loss": 1.5099, "grad_norm": 1.3150416612625122, "learning_rate": 1.609494744642892e-06, "epoch": 1.9243243243243242, "step": 1780 }, { "loss": 1.3622, "grad_norm": 1.1703237295150757, "learning_rate": 1.188462011526692e-06, "epoch": 1.9351351351351351, "step": 1790 }, { "loss": 1.3636, "grad_norm": 1.1234310865402222, "learning_rate": 8.309896220654034e-07, "epoch": 1.945945945945946, "step": 1800 }, { "loss": 1.377, "grad_norm": 1.2217568159103394, "learning_rate": 5.371918607122827e-07, "epoch": 1.9567567567567568, "step": 1810 }, { "loss": 1.4881, "grad_norm": 1.1424119472503662, "learning_rate": 3.0716265504753263e-07, "epoch": 1.9675675675675675, "step": 1820 }, { "loss": 1.3919, "grad_norm": 1.0163501501083374, "learning_rate": 1.409755457494555e-07, "epoch": 1.9783783783783784, "step": 1830 }, { "loss": 1.3928, "grad_norm": 1.117492914199829, "learning_rate": 3.868366308346083e-08, "epoch": 1.9891891891891893, "step": 1840 }, { "eval_loss": 1.7465505599975586, "eval_runtime": 9.9858, "eval_samples_per_second": 78.011, "eval_steps_per_second": 19.528, "epoch": 1.9945945945945946, "step": 1845 }, { "loss": 1.3995, "grad_norm": 1.4473432302474976, "learning_rate": 3.1970991622998217e-10, "epoch": 2.0, "step": 1850 }, { "train_runtime": 958.4314, "train_samples_per_second": 30.878, "train_steps_per_second": 1.93, "total_flos": 9.431756938411008e+16, "train_loss": 1.6259772120295344, "epoch": 2.0, "step": 1850 } ]